### What problem does this PR solve?

#1161

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
@@ -113,19 +113,24 @@ class RAGFlowDocxParser:
     def __call__(self, fnm, from_page=0, to_page=100000):
         self.doc = Document(fnm) if isinstance(
             fnm, str) else Document(BytesIO(fnm))
-        pn = 0
-        secs = []
+        pn = 0 # parsed page
+        secs = [] # parsed contents
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                secs.append((p.text, p.style.name))
+
+            runs_within_single_paragraph = [] # save runs within the range of pages
             for run in p.runs:
-                if 'lastRenderedPageBreak' in run._element.xml:
-                    pn += 1
-                    continue
-                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                if pn > to_page:
+                    break
+                if from_page <= pn < to_page and p.text.strip():
+                    runs_within_single_paragraph.append(run.text) # append run.text first
+
+                # wrap page break checker into a static method
+                if RAGFlowDocxParser.has_page_break(run._element.xml):
                     pn += 1
+
+            secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
+
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls
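Reviewer note: the new `RAGFlowDocxParser.has_page_break` helper is called here but defined outside this hunk. Judging from the two inline checks it replaces, it presumably amounts to something like the sketch below (the signature is an assumption):

```python
@staticmethod
def has_page_break(xml: str) -> bool:
    # A run marks a page boundary if Word recorded a soft break
    # (<w:lastRenderedPageBreak/>) or the run carries an explicit
    # page break (<w:br w:type="page"/>).
    return ('lastRenderedPageBreak' in xml
            or ('w:br' in xml and 'type="page"' in xml))
```

Collecting `run.text` per run also improves on the old behavior, which appended the whole `p.text` at once and so attributed a page-straddling paragraph entirely to the page where it starts.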
@@ -145,6 +145,7 @@ def beAdoc(d, q, a, eng):
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     return d
 
+
 def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
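`mdQuestionLevel` maps a markdown line to a `(heading depth, stripped text)` pair, with depth 0 marking a non-heading line. A quick standalone check of its behavior (function copied from the hunk above):

```python
import re

def mdQuestionLevel(s):
    match = re.match(r'#*', s)
    return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)

print(mdQuestionLevel("## What is RAGFlow?"))  # (2, 'What is RAGFlow?')
print(mdQuestionLevel("An answer line."))      # (0, 'An answer line.')
```

Since `r'#*'` can match the empty string, `re.match` never returns `None` here, so the `else (0, s)` branch appears unreachable; harmless, but worth noting.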
@@ -244,7 +245,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                     break
                 txt += l
         lines = txt.split("\n")
-        comma, tab = 0, 0
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
         code_block = False
@@ -262,7 +262,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 last_answer = f'{last_answer}\n{l}'
             else:   # is a question
                 if last_answer:
-                    sum_question = ('\n').join(question_stack)
+                    sum_question = '\n'.join(question_stack)
                     if sum_question:
                         res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
                     last_answer = ''
@@ -274,12 +274,11 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
                 question_stack.append(question)
                 level_stack.append(question_level)
         if last_answer:
-            sum_question = ('\n').join(question_stack)
+            sum_question = '\n'.join(question_stack)
             if sum_question:
                 res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
-
         return res
 
     raise NotImplementedError(
         "Excel, csv(txt), pdf and markdown format files are supported.")
@@ -110,6 +110,7 @@ class EsQueryer:
                     sm = []
 
                 keywords.append(re.sub(r"[ \\\"']+", "", tk))
+                if len(keywords) >= 12: break
 
                 tk_syns = self.syn.lookup(tk)
                 tk = EsQueryer.subSpecialChar(tk)
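The added early `break` caps the collected keywords at a dozen, exiting the loop before any further synonym lookup and so bounding the size of the generated Elasticsearch query for very long questions. A toy illustration of the cap (the token list is made up):

```python
tokens = [f"term{i}" for i in range(50)]  # pretend the question tokenized to 50 terms
keywords = []
for tk in tokens:
    keywords.append(tk)
    if len(keywords) >= 12:
        break
assert len(keywords) == 12  # everything past the cap is skipped
```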
@@ -98,7 +98,7 @@ class Dealer:
         if not qst:
             if not req.get("sort"):
                 s = s.sort(
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
@@ -108,7 +108,7 @@ class Dealer:
                               "mode": "avg", "numeric_type": "double"}},
                     {"top_int": {"order": "asc", "unmapped_type": "float",
                                  "mode": "avg", "numeric_type": "double"}},
-                    {"create_time": {"order": "desc", "unmapped_type": "date"}},
+                    #{"create_time": {"order": "desc", "unmapped_type": "date"}},
                     {"create_timestamp_flt": {
                         "order": "desc", "unmapped_type": "float"}}
                 )
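Both `Dealer` hunks drop the `create_time` date key from the sort clause and keep `create_timestamp_flt` alone, so chunk ordering no longer depends on how `create_time` happens to be mapped. With elasticsearch-dsl, the surviving default sort reduces to something like this sketch (index name is hypothetical):

```python
from elasticsearch_dsl import Search

s = Search(index="ragflow_chunks")  # hypothetical index name
s = s.sort(
    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
)
print(s.to_dict())
# roughly: {'sort': [{'create_timestamp_flt': {'order': 'desc',
#                                              'unmapped_type': 'float'}}]}
```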