### What problem does this PR solve?

#4223

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

tags/nightly
| @@ -264,6 +264,7 @@ class RagTokenizer: | |||
| return [self.stemmer.stem(self.lemmatizer.lemmatize(t)) if re.match(r"[a-zA-Z_-]+$", t) else t for t in tks] | |||
| def tokenize(self, line): | |||
| line = re.sub(r"\W+", " ", line) | |||
| line = self._strQ2B(line).lower() | |||
| line = self._tradi2simp(line) | |||
| zh_num = len([1 for c in line if is_chinese(c)]) | |||