@@ -235,7 +235,8 @@ class IndexingRunner:
             if len(preview_texts) < 5:
                 preview_texts.append(document.page_content)
 
-            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content)
+            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
+                                                     self.filter_string(document.page_content))
 
         return {
             "total_segments": total_segments,
@@ -345,6 +346,8 @@ class IndexingRunner:
 
         return text_docs
 
     def filter_string(self, text):
+        text = text.replace('<|', '<')
+        text = text.replace('|>', '>')
         pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
         return pattern.sub('', text)
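Taken together, the first two hunks make the token estimate run over sanitized text: filter_string defuses `<|...|>` special-token delimiters and strips invisible characters before counting. A minimal standalone sketch of that behavior, with filter_string reproduced as a plain function and a hypothetical stand-in for TokenCalculator.get_num_tokens, whose implementation is not part of this diff:

import re


def filter_string(text: str) -> str:
    # Defuse special-token delimiters such as "<|endoftext|>" so user
    # content cannot be read by the tokenizer as a control token.
    text = text.replace('<|', '<')
    text = text.replace('|>', '>')
    # Strip C0 control characters (except \t, \n, \r), DEL, and the
    # U+0080-U+00FF range; codepoints above U+00FF are left untouched.
    pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
    return pattern.sub('', text)


def get_num_tokens(model_name: str, text: str) -> int:
    # Hypothetical stand-in for TokenCalculator.get_num_tokens; a real
    # tokenizer (e.g. tiktoken) would replace this whitespace split.
    return len(text.split())


pages = ['hello\x00world', '<|endoftext|>', 'caf\xe9']
tokens = sum(get_num_tokens('text-embedding-ada-002', filter_string(p)) for p in pages)
# filter_string('hello\x00world') -> 'helloworld'
# filter_string('<|endoftext|>')  -> '<endoftext>'
# filter_string('caf\xe9')        -> 'caf'  (U+00E9 falls in the stripped range)

Note that the \x80-\xFF part of the character class also removes Latin-1 letters such as the one in the last example, which may matter for non-English page content.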
@@ -425,7 +428,7 @@ class IndexingRunner:
         return documents
 
     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
-                           processing_rule: DatasetProcessRule) -> List[Document]:
+                            processing_rule: DatasetProcessRule) -> List[Document]:
         """
         Split the text documents into nodes.
         """