### What problem does this PR solve?

Support `.sql` files as parseable documents, and make the text parser's section delimiter configurable (defaulting to `"\n!?;。;!?"`) via `parser_config["delimiter"]`.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
| return FileType.PDF.value | return FileType.PDF.value | ||||
| if re.match( | if re.match( | ||||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename): | |||||
| r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): | |||||
| return FileType.DOC.value | return FileType.DOC.value | ||||
| if re.match( | if re.match( | 
| # | # | ||||
| from rag.nlp import find_codec,num_tokens_from_string | from rag.nlp import find_codec,num_tokens_from_string | ||||
| import re | |||||
| class RAGFlowTxtParser: | class RAGFlowTxtParser: | ||||
| def __call__(self, fnm, binary=None, chunk_token_num=128): | def __call__(self, fnm, binary=None, chunk_token_num=128): | ||||
| return self.parser_txt(txt, chunk_token_num) | return self.parser_txt(txt, chunk_token_num) | ||||
@classmethod
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Split plain text into chunk sections on the given delimiter characters.

    Args:
        txt (str): full document text; must be a string.
        chunk_token_num (int): soft token budget per chunk. A section whose
            token count exceeds ``10 * chunk_token_num`` is split roughly in
            half by character count.
        delimiter (str): characters treated as section separators. Each
            character is an independent delimiter (they form a regex
            character class), not a multi-character separator string.

    Returns:
        list[list[str]]: ``[section_text, ""]`` pairs; the second slot is an
        empty placeholder kept for interface compatibility with the other
        parsers that return (text, metadata)-style pairs.

    Raises:
        TypeError: if ``txt`` is not a string.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")
    sections = []
    # re.escape guards delimiter characters that are special inside a regex
    # character class (']', '\\', '^', '-'); for the default delimiter the
    # resulting pattern is unchanged.
    splitter = re.compile(r"[%s]+" % re.escape(delimiter))
    for sec in splitter.split(txt):
        # NOTE: re.split without a capturing group drops the separators, so
        # a non-empty `sec` is never itself a delimiter. However, empty
        # strings from leading/trailing delimiters satisfy
        # `"" in delimiter`, so they are folded (as no-ops) into the
        # previous section instead of producing blank chunks.
        if sections and sec in delimiter:
            sections[-1][0] += sec
            continue
        if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
            # Overlong section: halve it by character count (token counts
            # are only approximated by this split).
            mid = int(len(sec) / 2)
            sections.append([sec[:mid], ""])
            sections.append([sec[mid:], ""])
        else:
            sections.append([sec, ""])
    return sections
| excel_parser = ExcelParser() | excel_parser = ExcelParser() | ||||
| sections = [(l, "") for l in excel_parser.html(binary) if l] | sections = [(l, "") for l in excel_parser.html(binary) if l] | ||||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): | |||||
| elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): | |||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128)) | |||||
| sections = TxtParser()(filename,binary, | |||||
| parser_config.get("chunk_token_num", 128), | |||||
| parser_config.get("delimiter", "\n!?;。;!?")) | |||||
| callback(0.8, "Finish parsing.") | callback(0.8, "Finish parsing.") | ||||
| elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): | elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): |