### What problem does this PR solve?

Fixes #1161.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

(tag: v0.8.0)
def __call__(self, fnm, from_page=0, to_page=100000):
    """Parse a .docx document into page-ranged text sections plus tables.

    Args:
        fnm: path to a .docx file (str) or the document's raw bytes.
        from_page: first page (inclusive) whose text is kept.
        to_page: page bound (exclusive) — text on pages >= to_page is dropped.

    Returns:
        (secs, tbls): ``secs`` is a list of ``(text, style_name)`` tuples,
        one per paragraph, where ``text`` contains only the runs that fall
        inside [from_page, to_page); ``tbls`` is the extracted content of
        every table in the document.
    """
    self.doc = Document(fnm) if isinstance(
        fnm, str) else Document(BytesIO(fnm))
    pn = 0  # current page number while scanning runs
    secs = []  # collected (text, style) sections
    for p in self.doc.paragraphs:
        if pn > to_page:
            break
        runs_within_single_paragraph = []  # run texts inside the page range
        for run in p.runs:
            # page counting advances mid-paragraph, so re-check here too
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                runs_within_single_paragraph.append(run.text)  # append run.text first
            # page-break detection is delegated to a static helper so that
            # both rendered and explicit page breaks advance the counter
            if RAGFlowDocxParser.has_page_break(run._element.xml):
                pn += 1
        # then concat run.text as this paragraph's section
        secs.append(("".join(runs_within_single_paragraph), p.style.name))
    tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
    return secs, tbls
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | ||||
| return d | return d | ||||
def mdQuestionLevel(s):
    """Return (heading level, remaining text) for a markdown line.

    The level is the number of leading ``#`` characters; the text has the
    hashes and any following whitespace stripped from its left side.
    """
    m = re.match(r'#*', s)
    if not m:
        return 0, s
    level = len(m.group(0))
    return level, s.lstrip('#').lstrip()
| break | break | ||||
| txt += l | txt += l | ||||
| lines = txt.split("\n") | lines = txt.split("\n") | ||||
| comma, tab = 0, 0 | |||||
| last_question, last_answer = "", "" | last_question, last_answer = "", "" | ||||
| question_stack, level_stack = [], [] | question_stack, level_stack = [], [] | ||||
| code_block = False | code_block = False | ||||
| last_answer = f'{last_answer}\n{l}' | last_answer = f'{last_answer}\n{l}' | ||||
| else: # is a question | else: # is a question | ||||
| if last_answer: | if last_answer: | ||||
| sum_question = ('\n').join(question_stack) | |||||
| sum_question = '\n'.join(question_stack) | |||||
| if sum_question: | if sum_question: | ||||
| res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | ||||
| last_answer = '' | last_answer = '' | ||||
| question_stack.append(question) | question_stack.append(question) | ||||
| level_stack.append(question_level) | level_stack.append(question_level) | ||||
| if last_answer: | if last_answer: | ||||
| sum_question = ('\n').join(question_stack) | |||||
| sum_question = '\n'.join(question_stack) | |||||
| if sum_question: | if sum_question: | ||||
| res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | ||||
| return res | return res | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "Excel, csv(txt), pdf and markdown format files are supported.") | "Excel, csv(txt), pdf and markdown format files are supported.") | ||||
| sm = [] | sm = [] | ||||
| keywords.append(re.sub(r"[ \\\"']+", "", tk)) | keywords.append(re.sub(r"[ \\\"']+", "", tk)) | ||||
| if len(keywords) >= 12: break | |||||
| tk_syns = self.syn.lookup(tk) | tk_syns = self.syn.lookup(tk) | ||||
| tk = EsQueryer.subSpecialChar(tk) | tk = EsQueryer.subSpecialChar(tk) |
| if not qst: | if not qst: | ||||
| if not req.get("sort"): | if not req.get("sort"): | ||||
| s = s.sort( | s = s.sort( | ||||
| {"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| #{"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| {"create_timestamp_flt": { | {"create_timestamp_flt": { | ||||
| "order": "desc", "unmapped_type": "float"}} | "order": "desc", "unmapped_type": "float"}} | ||||
| ) | ) | ||||
| "mode": "avg", "numeric_type": "double"}}, | "mode": "avg", "numeric_type": "double"}}, | ||||
| {"top_int": {"order": "asc", "unmapped_type": "float", | {"top_int": {"order": "asc", "unmapped_type": "float", | ||||
| "mode": "avg", "numeric_type": "double"}}, | "mode": "avg", "numeric_type": "double"}}, | ||||
| {"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| #{"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| {"create_timestamp_flt": { | {"create_timestamp_flt": { | ||||
| "order": "desc", "unmapped_type": "float"}} | "order": "desc", "unmapped_type": "float"}} | ||||
| ) | ) |