| return text_docs | return text_docs | ||||
def filter_string(self, text):
    """
    Clean a string by rewriting ``<|`` / ``|>`` and deleting control characters.

    :param text: the string to clean
    :return: a new string in which every ``<|`` has become ``<``, every ``|>``
        has become ``>``, and every character in the ranges U+0000-U+0008,
        U+000B, U+000C, U+000E-U+001F and U+007F-U+00FF has been removed.
        Tab, line feed and carriage return are left in place.
    """
    # Fixed-string substitutions: plain str.replace is enough, no regex needed.
    # Order matters and is kept as "<|" first, then "|>".
    text = text.replace('<|', '<').replace('|>', '>')

    # NOTE(review): on a str pattern, \x80-\xFF matches code points
    # U+0080-U+00FF, which includes printable Latin-1 characters such as "é"
    # and the non-breaking space, not only control characters. Behaviour is
    # kept as-is here - confirm whether stripping those is intended.
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]', '', text)
| def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter: | def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter: | ||||
| """ | """ |