| from core.llm_generator.llm_generator import LLMGenerator | from core.llm_generator.llm_generator import LLMGenerator | ||||
| from core.model_manager import ModelInstance, ModelManager | from core.model_manager import ModelInstance, ModelManager | ||||
| from core.model_runtime.entities.model_entities import ModelType | from core.model_runtime.entities.model_entities import ModelType | ||||
| from core.rag.cleaner.clean_processor import CleanProcessor | |||||
| from core.rag.datasource.keyword.keyword_factory import Keyword | from core.rag.datasource.keyword.keyword_factory import Keyword | ||||
| from core.rag.docstore.dataset_docstore import DatasetDocumentStore | from core.rag.docstore.dataset_docstore import DatasetDocumentStore | ||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| rules = DatasetProcessRule.AUTOMATIC_RULES | rules = DatasetProcessRule.AUTOMATIC_RULES | ||||
| else: | else: | ||||
| rules = json.loads(processing_rule.rules) if processing_rule.rules else {} | rules = json.loads(processing_rule.rules) if processing_rule.rules else {} | ||||
| document_text = CleanProcessor.clean(text, rules) | |||||
| if "pre_processing_rules" in rules: | |||||
| pre_processing_rules = rules["pre_processing_rules"] | |||||
| for pre_processing_rule in pre_processing_rules: | |||||
| if pre_processing_rule["id"] == "remove_extra_spaces" and pre_processing_rule["enabled"] is True: | |||||
| # Remove extra spaces | |||||
| pattern = r"\n{3,}" | |||||
| text = re.sub(pattern, "\n\n", text) | |||||
| pattern = r"[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}" | |||||
| text = re.sub(pattern, " ", text) | |||||
| elif pre_processing_rule["id"] == "remove_urls_emails" and pre_processing_rule["enabled"] is True: | |||||
| # Remove email | |||||
| pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)" | |||||
| text = re.sub(pattern, "", text) | |||||
| # Remove URL | |||||
| pattern = r"https?://[^\s]+" | |||||
| text = re.sub(pattern, "", text) | |||||
| return text | |||||
| return document_text | |||||
| @staticmethod | @staticmethod | ||||
| def format_split_text(text): | def format_split_text(text): |