### What problem does this PR solve?

Support `.sql` files as parseable documents, and make the text parser's section delimiter configurable (defaulting to `"\n!?;。;!?"`) via `parser_config["delimiter"]`.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
| return FileType.PDF.value | return FileType.PDF.value | ||||
| if re.match( | if re.match( | ||||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename): | |||||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | |||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match( | if re.match( | 
| # | # | ||||
| from rag.nlp import find_codec,num_tokens_from_string | from rag.nlp import find_codec,num_tokens_from_string | ||||
| import re | |||||
| class RAGFlowTxtParser: | class RAGFlowTxtParser: | ||||
| def __call__(self, fnm, binary=None, chunk_token_num=128): | def __call__(self, fnm, binary=None, chunk_token_num=128): | ||||
| return self.parser_txt(txt, chunk_token_num) | return self.parser_txt(txt, chunk_token_num) | ||||
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Split plain text into chunk sections on the given delimiter characters.

    Args:
        txt (str): full document text; must be a string.
        chunk_token_num (int): soft token budget per chunk. A section whose
            token count exceeds ``10 * chunk_token_num`` is split roughly in
            half by character count.
        delimiter (str): characters treated as section separators. Each
            character is an independent delimiter (they form a regex
            character class), not a multi-character separator string.

    Returns:
        list[list[str]]: ``[section_text, ""]`` pairs; the second slot is an
        empty placeholder kept for interface compatibility with the other
        parsers that return (text, metadata)-style pairs.

    Raises:
        TypeError: if ``txt`` is not a string.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")
    sections = []
    # re.escape guards delimiter characters that are special inside a regex
    # character class (']', '\\', '^', '-'); for the default delimiter the
    # resulting pattern is unchanged.
    splitter = re.compile(r"[%s]+" % re.escape(delimiter))
    for sec in splitter.split(txt):
        # NOTE: re.split without a capturing group drops the separators, so
        # a non-empty `sec` is never itself a delimiter. However, empty
        # strings from leading/trailing delimiters satisfy
        # `"" in delimiter`, so they are folded (as no-ops) into the
        # previous section instead of producing blank chunks.
        if sections and sec in delimiter:
            sections[-1][0] += sec
            continue
        if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
            # Overlong section: halve it by character count (token counts
            # are only approximated by this split).
            mid = int(len(sec) / 2)
            sections.append([sec[:mid], ""])
            sections.append([sec[mid:], ""])
        else:
            sections.append([sec, ""])
    return sections
| excel_parser = ExcelParser() | excel_parser = ExcelParser() | ||||
| sections = [(l, "") for l in excel_parser.html(binary) if l] | sections = [(l, "") for l in excel_parser.html(binary) if l] | ||||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): | |||||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128)) | |||||
| sections = TxtParser()(filename,binary, | |||||
| parser_config.get("chunk_token_num", 128), | |||||
| parser_config.get("delimiter", "\n!?;。;!?")) | |||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): |