             if len(preview_texts) < 5:
                 preview_texts.append(document.page_content)
 
-            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name, document.page_content)
+            tokens += TokenCalculator.get_num_tokens(self.embedding_model_name,
+                                                     self.filter_string(document.page_content))
 
         return {
             "total_segments": total_segments,

         return text_docs
 
     def filter_string(self, text):
+        text = text.replace('<|', '<')
+        text = text.replace('|>', '>')
         pattern = re.compile('[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]')
         return pattern.sub('', text)

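Context for the two hunks above (an assumption on my part, not stated in the diff): if TokenCalculator.get_num_tokens is backed by tiktoken, as it typically is for OpenAI-style embedding models, raw document text containing special-token markers such as "<|endoftext|>" makes encode() raise, because tiktoken disallows all special tokens by default. Stripping the "<|" / "|>" delimiters before counting, as filter_string now does, sidesteps that. A minimal sketch of the failure and the workaround:

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    text = "chunk containing <|endoftext|> in the middle"

    # encode() rejects text that contains a disallowed special token.
    try:
        enc.encode(text)
    except ValueError as exc:
        print("encode failed:", exc)

    # The same replacement that filter_string now applies makes the text countable.
    safe = text.replace('<|', '<').replace('|>', '>')
    print(len(enc.encode(safe)), "tokens")
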
         return documents
 
     def _split_to_documents(self, text_docs: List[Document], splitter: TextSplitter,
                             processing_rule: DatasetProcessRule) -> List[Document]:
         """
         Split the text documents into nodes.
         """