### What problem does this PR solve?

Fixes #1161.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

(tag: v0.8.0)
def __call__(self, fnm, from_page=0, to_page=100000):
    """Parse a .docx document into page-ranged text sections plus tables.

    Args:
        fnm: path to a .docx file (str) or the document's raw bytes.
        from_page: first page (inclusive) whose text is kept.
        to_page: page bound (exclusive) — text on pages >= to_page is dropped.

    Returns:
        (secs, tbls): ``secs`` is a list of ``(text, style_name)`` tuples,
        one per paragraph, where ``text`` contains only the runs that fall
        inside [from_page, to_page); ``tbls`` is the extracted content of
        every table in the document.
    """
    self.doc = Document(fnm) if isinstance(
        fnm, str) else Document(BytesIO(fnm))
    pn = 0  # current page number while scanning runs
    secs = []  # collected (text, style) sections
    for p in self.doc.paragraphs:
        if pn > to_page:
            break
        runs_within_single_paragraph = []  # run texts inside the page range
        for run in p.runs:
            # page counting advances mid-paragraph, so re-check here too
            if pn > to_page:
                break
            if from_page <= pn < to_page and p.text.strip():
                runs_within_single_paragraph.append(run.text)  # append run.text first
            # page-break detection is delegated to a static helper so that
            # both rendered and explicit page breaks advance the counter
            if RAGFlowDocxParser.has_page_break(run._element.xml):
                pn += 1
        # then concat run.text as this paragraph's section
        secs.append(("".join(runs_within_single_paragraph), p.style.name))
    tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
    return secs, tbls
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | ||||
| return d | return d | ||||
def mdQuestionLevel(s):
    """Return (heading level, remaining text) for a markdown line.

    The level is the number of leading ``#`` characters; the text has the
    hashes and any following whitespace stripped from its left side.
    """
    m = re.match(r'#*', s)
    if not m:
        return 0, s
    level = len(m.group(0))
    return level, s.lstrip('#').lstrip()
| break | break | ||||
| txt += l | txt += l | ||||
| lines = txt.split("\n") | lines = txt.split("\n") | ||||
| comma, tab = 0, 0 | |||||
| last_question, last_answer = "", "" | last_question, last_answer = "", "" | ||||
| question_stack, level_stack = [], [] | question_stack, level_stack = [], [] | ||||
| code_block = False | code_block = False | ||||
| last_answer = f'{last_answer}\n{l}' | last_answer = f'{last_answer}\n{l}' | ||||
| else: # is a question | else: # is a question | ||||
| if last_answer: | if last_answer: | ||||
| sum_question = ('\n').join(question_stack) | |||||
| sum_question = '\n'.join(question_stack) | |||||
| if sum_question: | if sum_question: | ||||
| res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | ||||
| last_answer = '' | last_answer = '' | ||||
| question_stack.append(question) | question_stack.append(question) | ||||
| level_stack.append(question_level) | level_stack.append(question_level) | ||||
| if last_answer: | if last_answer: | ||||
| sum_question = ('\n').join(question_stack) | |||||
| sum_question = '\n'.join(question_stack) | |||||
| if sum_question: | if sum_question: | ||||
| res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) | ||||
| return res | return res | ||||
| raise NotImplementedError( | raise NotImplementedError( | ||||
| "Excel, csv(txt), pdf and markdown format files are supported.") | "Excel, csv(txt), pdf and markdown format files are supported.") | ||||
| sm = [] | sm = [] | ||||
| keywords.append(re.sub(r"[ \\\"']+", "", tk)) | keywords.append(re.sub(r"[ \\\"']+", "", tk)) | ||||
| if len(keywords) >= 12: break | |||||
| tk_syns = self.syn.lookup(tk) | tk_syns = self.syn.lookup(tk) | ||||
| tk = EsQueryer.subSpecialChar(tk) | tk = EsQueryer.subSpecialChar(tk) |
| if not qst: | if not qst: | ||||
| if not req.get("sort"): | if not req.get("sort"): | ||||
| s = s.sort( | s = s.sort( | ||||
| {"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| #{"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| {"create_timestamp_flt": { | {"create_timestamp_flt": { | ||||
| "order": "desc", "unmapped_type": "float"}} | "order": "desc", "unmapped_type": "float"}} | ||||
| ) | ) | ||||
| "mode": "avg", "numeric_type": "double"}}, | "mode": "avg", "numeric_type": "double"}}, | ||||
| {"top_int": {"order": "asc", "unmapped_type": "float", | {"top_int": {"order": "asc", "unmapped_type": "float", | ||||
| "mode": "avg", "numeric_type": "double"}}, | "mode": "avg", "numeric_type": "double"}}, | ||||
| {"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| #{"create_time": {"order": "desc", "unmapped_type": "date"}}, | |||||
| {"create_timestamp_flt": { | {"create_timestamp_flt": { | ||||
| "order": "desc", "unmapped_type": "float"}} | "order": "desc", "unmapped_type": "float"}} | ||||
| ) | ) |