* fix position extraction bug
* remove delimiter for naive parser

tags/v0.1.0
@@ ... @@
     for id in sres.ids:
         d = {
             "chunk_id": id,
-            "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""),
+            "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                "content_with_weight", ""),
             "doc_id": sres.field[id]["doc_id"],
             "docnm_kwd": sres.field[id]["docnm_kwd"],
             "important_kwd": sres.field[id].get("important_kwd", []),
             "available_int": sres.field[id].get("available_int", 1),
             "positions": sres.field[id].get("position_int", "").split("\t")
         }
-        poss = []
-        for i in range(0, len(d["positions"]), 5):
-            poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
-                         float(d["positions"][i + 3]), float(d["positions"][i + 4])])
-        d["positions"] = poss
+        if len(d["positions"]) % 5 == 0:
+            poss = []
+            for i in range(0, len(d["positions"]), 5):
+                poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                             float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+            d["positions"] = poss
         res["chunks"].append(d)
     return get_json_result(data=res)
 except Exception as e:
     return get_data_error_result(retmsg="Document not found!")
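The guard above is the position-extraction fix: "position_int" is stored as a flat, tab-separated string of 5-tuples (page, left, right, top, bottom), and the old code indexed into it unconditionally. A minimal sketch of the decode step, assuming that storage layout; parse_positions is a hypothetical helper written only to restate the fix:

def parse_positions(position_int: str) -> list:
    """Hypothetical helper: decode the tab-separated position string."""
    tokens = position_int.split("\t")
    # The added guard: only decode a whole number of 5-tuples, so a missing
    # or malformed field no longer raises IndexError/ValueError.
    if not position_int or len(tokens) % 5 != 0:
        return []
    return [[float(t) for t in tokens[i:i + 5]]
            for i in range(0, len(tokens), 5)]

assert parse_positions("3\t10\t90\t20\t40") == [[3.0, 10.0, 90.0, 20.0, 40.0]]
assert parse_positions("") == []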
@@ ... @@
 if doc.parser_id == ParserType.QA:
-    arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+    arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
     if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
     q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
-    d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+    d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))
 v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
 v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ ... @@
 md5 = hashlib.md5()
 md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
 chunck_id = md5.hexdigest()
-d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]}
+d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+     "content_with_weight": req["content_with_weight"]}
 d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
 d["important_kwd"] = req.get("important_kwd", [])
 d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
@@ ... @@
 tenant_id = CharField(max_length=32, null=False)
 name = CharField(max_length=255, null=True, help_text="dialog application name")
 description = TextField(null=True, help_text="Dialog description")
-icon = CharField(max_length=16, null=False, help_text="dialog icon")
+icon = TextField(null=True, help_text="icon base64 string")
 language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
 llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
 llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
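The icon column widens from CharField(max_length=16) to a nullable TextField because a base64-encoded image cannot fit in 16 characters. A sketch of producing such a value; the "data:image/png;base64," prefix is an illustrative assumption, not something this diff shows:

import base64

def icon_to_base64(png_bytes: bytes) -> str:
    # TextField has no length cap, so the whole encoded image fits.
    return "data:image/png;base64," + base64.b64encode(png_bytes).decode("ascii")

assert icon_to_base64(b"\x89PNG").startswith("data:image/png;base64,")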
@@ ... @@
 self.updown_cnt_mdl.set_param({"device": "cuda"})
 self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                filename="updown_concat_xgb.model"))
+self.page_from = 0
 """
 If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ ... @@
                              "layoutno", "")))
 left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-poss.append((pn, left, right, top, bott))
+poss.append((pn + self.page_from, left, right, top, bott))
 return self.page_images[pn] \
     .crop((left * ZM, top * ZM,
            right * ZM, bott * ZM))
@@ ... @@
 self.garbages = {}
 self.page_cum_height = [0]
 self.page_layout = []
+self.page_from = page_from
 try:
     self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
     self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
@@ ... @@
 left, right, top, bottom = float(left), float(
     right), float(top), float(bottom)
 poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
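Both page_from additions serve the same conversion: a parser may be handed only a slice of a PDF, so the page indices it sees are relative to the slice start and must be shifted back to absolute document pages before positions are stored. A trivial sketch (hypothetical helper):

def to_absolute_page(local_pn: int, page_from: int) -> int:
    # A parser started at from_page=5 sees its first page as index 0;
    # the stored position must say "document page 5".
    return local_pn + page_from

assert to_absolute_page(0, 5) == 5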
@@ ... @@
-if not poss: return
+if not poss:
+    if need_position: return None, None
+    return
 max_width = np.max([right-left for (_, left, right, _, _) in poss])
 GAP = 6
@@ ... @@
             bottom, self.page_images[pns[0]].size[1])
         ))
     )
-    positions.append((pns[0], left, right, top, min(
+    if 0 < ii < len(poss) - 1:
+        positions.append((pns[0] + self.page_from, left, right, top, min(
         bottom, self.page_images[pns[0]].size[1]) / ZM))
     bottom -= self.page_images[pns[0]].size[1]
     for pn in pns[1:]:
@@ ... @@
                 self.page_images[pn].size[1])
             ))
         )
-        positions.append((pn, left, right, 0, min(
-            bottom, self.page_images[pn].size[1]) / ZM))
+        if 0 < ii < len(poss) - 1:
+            positions.append((pn + self.page_from, left, right, 0, min(
+                bottom, self.page_images[pn].size[1]) / ZM))
         bottom -= self.page_images[pn].size[1]
 if not imgs:
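The early-return fix keeps crop()'s contract uniform: callers that ask for positions always unpack two values, even when nothing matched. A toy stand-in (hypothetical, simplified):

def crop_stub(matches, need_position=False):
    """Toy stand-in for the crop() return contract, not the real method."""
    if not matches:
        # Callers do `img, poss = crop(..., need_position=True)`, so the
        # empty path must still yield a 2-tuple instead of a bare None.
        return (None, None) if need_position else None
    img = object()  # stands in for the stitched PIL image
    return (img, matches) if need_position else img

assert crop_stub([], need_position=True) == (None, None)
assert crop_stub([]) is None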
@@ ... @@
 function task_exe(){
     sleep 60;
-    while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done
+    while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 }

 function watch_broker(){
@@ ... @@
 self._filter_forpages()
 self._merge_with_same_bullet()
 callback(0.75, "Text merging finished.")

-tbls = self._extract_table_figure(True, zoomin, False, True)
+tbls = self._extract_table_figure(True, zoomin, True, True)
 callback(0.8, "Text extraction finished")
@@ ... @@
 self._concat_downward(concat_between_pages=False)
 self._filter_forpages()
 callback(0.77, "Text merging finished")
-tbls = self._extract_table_figure(True, zoomin, False, True)
+tbls = self._extract_table_figure(True, zoomin, True, True)

 # clean mess
 for b in self.boxes:
@@ ... @@
 self._concat_downward(concat_between_pages=False)
 self._filter_forpages()
 callback(0.77, "Text merging finished")
-tbls = self._extract_table_figure(True, zoomin, False, True)
+tbls = self._extract_table_figure(True, zoomin, True, True)
 cron_logger.info("paddle layouts: {}".format((timer() - start) / (self.total_page + 0.1)))
 #self._naive_vertical_merge()
@@ ... @@
 self._concat_downward(concat_between_pages=False)
 self._filter_forpages()
 callback(0.75, "Text merging finished.")
-tbls = self._extract_table_figure(True, zoomin, False, True)
+tbls = self._extract_table_figure(True, zoomin, True, True)
 # clean mess
 if column_width < self.page_images[0].size[0] / zoomin / 2:
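The same one-flag flip (False to True in the third position) is applied in all four parser apps above. This diff does not show the method's signature, so the parameter names in the sketch below are assumptions made only to render the repeated flag flip legible:

def extract_table_figure(need_image: bool, zoomin: int,
                         return_html: bool, need_position: bool):
    """Toy stand-in with assumed parameter names, not the real method."""
    outputs = ["image"] if need_image else []
    outputs.append("html table" if return_html else "plain-text table")
    if need_position:
        outputs.append("positions")
    return outputs

# Every call site now passes the third flag as True:
assert extract_table_figure(True, 3, True, True) == ["image", "html table", "positions"]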
@@ ... @@
 tk_nums[-1] += tnum

 for sec, pos in sections:
+    add_chunk(sec, pos)
+    continue
     s, e = 0, 1
     while e < len(sec):
         if sec[e] in delimiter:
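The add_chunk(sec, pos) plus continue short-circuits the loop body, so the naive parser now emits each section as one chunk and the delimiter-based splitting below it becomes dead code. A simplified sketch of the behavioral change; merge_sections is hypothetical:

def merge_sections(sections, delimiter="。；！？"):
    chunks = []
    for sec, pos in sections:
        chunks.append((sec, pos))  # add_chunk(sec, pos)
        continue                   # the delimiter scan below never runs now
        # previously: walk sec and cut a new chunk at every delimiter char
    return chunks

# "。" no longer splits the section into two chunks:
assert merge_sections([("问。答", "p1")]) == [("问。答", "p1")]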
@@ ... @@
 else:
     s = s.sort(
         {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
-        {"top_int": {"order": "asc", "unmapped_type": "float"}},
+        {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
         {"create_time": {"order": "desc", "unmapped_type": "date"}},
         {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
     )
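In Elasticsearch, "mode" controls how a multi-valued field is reduced to a single sort key, and "unmapped_type" keeps the sort from failing on indices where the field is absent. With "mode": "avg", a chunk whose top_int holds several values (one per recorded position) sorts by their average. The resulting sort body, written out as plain Python data:

sort_clause = [
    {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
    # avg of the multi-valued top_int decides the tie-break within a page
    {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
    {"create_time": {"order": "desc", "unmapped_type": "date"}},
    {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}},
]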
@@ ... @@
 pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
 for s, e in r["parser_config"].get("pages", [(0, 100000)]):
     e = min(e, pages)
-    for p in range(s, e, 10):
+    for p in range(s, e, 5):
         task = new_task()
         task["from_page"] = p
-        task["to_page"] = min(p + 10, e)
+        task["to_page"] = min(p + 5, e)
         tsks.append(task)
 else:
     tsks.append(new_task())
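Halving the page step from 10 to 5 produces twice as many, smaller tasks, which pairs with the executor bump above (mpirun -n 2 to -n 4) to keep all workers busy. A minimal sketch of the splitting, assuming pages is the document length; split_tasks is a hypothetical helper restating the loop:

def split_tasks(s: int, e: int, pages: int, step: int = 5):
    e = min(e, pages)
    # Each task covers at most `step` pages; the last is clipped to `e`.
    return [(p, min(p + step, e)) for p in range(s, e, step)]

assert split_tasks(0, 100000, 12) == [(0, 5), (5, 10), (10, 12)]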