@@ -183,9 +183,7 @@ def chat(dialog, messages, **kwargs):
## try to use sql if field mapping is good to go
if field_map:
chat_logger.info("Use SQL to retrieval:{}".format(questions[-1]))
markdown_tbl, chunks = use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
if markdown_tbl:
return {"answer": markdown_tbl, "reference": {"chunks": chunks, "doc_aggs": []}}
return use_sql(questions[-1], field_map, dialog.tenant_id, chat_mdl)
prompt_config = dialog.prompt_config
for p in prompt_config["parameters"]:
@@ -311,7 +309,7 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
clmn_idx = [ii for ii in range(len(tbl["columns"])) if ii not in (docid_idx | docnm_idx)]
# compose markdown table
| clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|原文|" if docid_idx and docid_idx else "|") | |||
| clmns = "|"+"|".join([re.sub(r"(/.*|([^()]+))", "", field_map.get(tbl["columns"][i]["name"], tbl["columns"][i]["name"])) for i in clmn_idx]) + ("|Source|" if docid_idx and docid_idx else "|") | |||
| line = "|"+"|".join(["------" for _ in range(len(clmn_idx))]) + ("|------|" if docid_idx and docid_idx else "") | |||
| rows = ["|"+"|".join([rmSpace(str(r[i])) for i in clmn_idx]).replace("None", " ") + "|" for r in tbl["rows"]] | |||
| if not docid_idx or not docnm_idx: | |||
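# Worked example of the header assembly above, with hypothetical column names: the re.sub
# strips a "/..." suffix (or a parenthesised annotation) from the mapped field name, and a
# trailing "Source" column is appended when doc-id/doc-name columns are present.
import re
field_map = {"uid": "Employee ID/工号"}      # hypothetical field mapping
columns = ["uid", "salary"]
cells = [re.sub(r"(/.*|（[^（）]+）)", "", field_map.get(c, c)) for c in columns]
clmns = "|" + "|".join(cells) + "|Source|"
line = "|" + "|".join(["------" for _ in cells]) + "|------|"
print(clmns)    # |Employee ID|salary|Source|
print(line)     # |------|------|------|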
@@ -322,4 +320,8 @@ def use_sql(question, field_map, tenant_id, chat_mdl):
rows = re.sub(r"T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]+Z)?\|", "|", rows)
docid_idx = list(docid_idx)[0]
docnm_idx = list(docnm_idx)[0]
return "\n".join([clmns, line, rows]), [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]]
return {
"answer": "\n".join([clmns, line, rows]),
"reference": {"chunks": [{"doc_id": r[docid_idx], "docnm_kwd": r[docnm_idx]} for r in tbl["rows"]],
"doc_aggs": [{"doc_id": r[docid_idx], "doc_name": r[docnm_idx], "count": 1} for r in tbl["rows"]]}
}
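# Minimal sketch of the response use_sql now assembles itself (keys as in the hunk above;
# the concrete values are invented for illustration). The caller in chat() (first hunk)
# returns this dict as-is, so the old fallback to the regular retrieval flow when no
# markdown table came back is gone.
example_response = {
    "answer": "|Employee ID|salary|Source|\n|------|------|------|\n|E001|3500|report.pdf|",
    "reference": {
        "chunks": [{"doc_id": "d1", "docnm_kwd": "report.pdf"}],
        "doc_aggs": [{"doc_id": "d1", "doc_name": "report.pdf", "count": 1}],
    },
}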
@@ -996,7 +996,7 @@ class HuParser:
if need_position: return None, None
return
max_width = np.max([right - left for (_, left, right, _, _) in poss])
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
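# One possible reading of the floor value: GAP = 6 on the next line pads the crop
# positions, so clamping max_width to at least 6 keeps the synthesised margin entries
# sensible even when every detected box is only a couple of pixels wide.
import numpy as np
widths = [2.0, 3.5]                         # hypothetical box widths
max_width = max(np.max(widths), 6)
assert max_width == 6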
@@ -14,9 +14,6 @@
import copy
import time
import os
from huggingface_hub import snapshot_download
from .operators import *
import numpy as np
import onnxruntime as ort
@@ -24,7 +21,6 @@ import onnxruntime as ort
from .postprocess import build_post_process
from rag.settings import cron_logger
def transform(data, ops=None):
""" transform """
if ops is None:
@@ -82,7 +78,7 @@ class TextRecognizer(object):
self.rec_batch_num = 16
postprocess_params = {
'name': 'CTCLabelDecode',
"character_dict_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "ocr.res"),
"character_dict_path": os.path.join(model_dir, "ocr.res"),
"use_space_char": True
}
self.postprocess_op = build_post_process(postprocess_params)
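# After this change the CTC character dictionary (ocr.res) is resolved inside whatever
# directory is passed in as model_dir, i.e. it is expected to ship alongside the model
# files instead of sitting next to ocr.py in the source tree.
import os
model_dir = "/path/to/deepdoc/models"       # hypothetical; see the snapshot_download hunk below
character_dict_path = os.path.join(model_dir, "ocr.res")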
@@ -16,6 +16,7 @@ import re
from collections import Counter
import numpy as np
from huggingface_hub import snapshot_download
from api.utils.file_utils import get_project_base_directory
from rag.nlp import huqie
@@ -33,7 +34,8 @@ class TableStructureRecognizer(Recognizer):
]
def __init__(self):
super().__init__(self.labels, "tsr",os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
super().__init__(self.labels, "tsr", model_dir)#os.path.join(get_project_base_directory(), "rag/res/deepdoc/"))
def __call__(self, images, thr=0.2):
tbls = super().__call__(images, thr)
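# snapshot_download() (huggingface_hub) fetches the "InfiniFlow/deepdoc" repository into the
# local HuggingFace cache on first use and returns the cached directory path, so the
# recognizer weights no longer need to live under rag/res/deepdoc in the project tree.
from huggingface_hub import snapshot_download

model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc")
print(model_dir)    # e.g. ~/.cache/huggingface/hub/models--InfiniFlow--deepdoc/snapshots/<rev>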
@@ -68,7 +68,7 @@ class Pdf(PdfParser):
callback(0.8, "Text extraction finished")
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes]
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
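# The Pdf parser call now returns a 2-tuple (sections, tables), with tables left as None
# here, so call sites must unpack it or take index [0]; the next two hunks add exactly that
# at the callers. Tiny stand-in (pdf_parser_stub is hypothetical) to show the shape:
def pdf_parser_stub(*args, **kwargs):
    return [("some text", "@@position tag")], None

sections, tables = pdf_parser_stub("a.pdf")     # unpack both parts...
sections = pdf_parser_stub("a.pdf")[0]          # ...or index, as the updated callers do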
@@ -91,7 +91,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() if kwargs.get("parser_config",{}).get("layout_recognize", True) else PlainParser()
for txt, poss in pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback):
from_page=from_page, to_page=to_page, callback=callback)[0]:
sections.append(txt + poss)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
@@ -136,7 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
"title": filename,
"authors": " ",
"abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page),
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0],
"tables": []
}
else:
@@ -66,7 +66,7 @@ class Pdf(PdfParser):
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(filename))
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text())
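# BytesIO() needs a bytes-like object; the old line wrapped the filename string whenever
# binary content was supplied, which raises TypeError. The fix wraps the bytes themselves.
from io import BytesIO
filename = "resume.pdf"
binary = b"%PDF-1.4 ..."                                # hypothetical in-memory PDF content
source = filename if not binary else BytesIO(binary)    # a path on disk, or an in-memory stream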
@@ -40,7 +40,7 @@ def remote_call(filename, binary):
"encrypt_type": "base64",
"filename": filename,
"langtype": '',
"fileori": base64.b64encode(binary.stream.read()).decode('utf-8')
"fileori": base64.b64encode(binary).decode('utf-8')
},
"c": "resume_parse_module",
"m": "resume_parse"
@@ -20,10 +20,10 @@ from openai import OpenAI
from FlagEmbedding import FlagModel
import torch
import numpy as np
from huggingface_hub import snapshot_download
from rag.utils import num_tokens_from_string
flag_model = FlagModel("BAAI/bge-large-zh-v1.5",
flag_model = FlagModel(snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True),
query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章：",
use_fp16=torch.cuda.is_available())
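# With local_files_only=True, snapshot_download() only resolves a path inside the local
# HuggingFace cache and raises if the model has not been fetched beforehand, so building
# flag_model no longer downloads anything by itself; it simply points FlagModel at the
# locally cached "BAAI/bge-large-zh-v1.5" snapshot.
from huggingface_hub import snapshot_download

model_path = snapshot_download("BAAI/bge-large-zh-v1.5", local_files_only=True)
# model_path is then handed to FlagModel(...) exactly as in the hunk above.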
@@ -53,7 +53,7 @@ class EsQueryer:
if not self.isChinese(txt):
tks = huqie.qie(txt).split(" ")
q = tks
q = copy.deepcopy(tks)
for i in range(1, len(tks)):
q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
if not q:
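# Why the deepcopy matters: with the old "q = tks", the bigram phrases appended to q in the
# loop below were also appended to tks, the list being read from.
import copy
tks = ["open", "source", "rag"]
q = copy.deepcopy(tks)
for i in range(1, len(tks)):
    q.append('"%s %s"^2' % (tks[i - 1], tks[i]))
assert tks == ["open", "source", "rag"]     # the token list is no longer polluted
assert q[-1] == '"source rag"^2'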
@@ -138,7 +138,7 @@ class EsQueryer:
def toDict(tks):
d = {}
if isinstance(tks, type("")):
if isinstance(tks, str):
tks = tks.split(" ")
for t, c in self.tw.weights(tks):
if t not in d:
@@ -234,13 +234,13 @@ class Dealer:
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
len(ans_v[0]), len(chunk_v[0]))
chunks_tks = [huqie.qie(ck).split(" ") for ck in chunks]
chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") for ck in chunks]
cites = {}
for i, a in enumerate(pieces_):
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
chunk_v,
huqie.qie(
pieces_[i]).split(" "),
self.qryr.rmWWW(pieces_[i])).split(" "),
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
@@ -150,9 +150,10 @@ class Dealer:
return 6
def ner(t):
if re.match(r"[0-9,.]{2,}$", t): return 2
if re.match(r"[a-z]{1,2}$", t): return 0.01
if not self.ne or t not in self.ne:
return 1
if re.match(r"[0-9,.]+$", t): return 2
m = {"toxic": 2, "func": 1, "corp": 3, "loca": 3, "sch": 3, "stock": 3,
"firstnm": 1}
return m[self.ne[t]]
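# Reading of the reworked ner() above: digit/comma/dot runs only get the 2x boost when at
# least two characters long, and one- or two-letter latin tokens are damped to 0.01 before
# the named-entity table is consulted.
import re
for tok in ["2024", "7", "ab", "x"]:
    if re.match(r"[0-9,.]{2,}$", tok):
        print(tok, 2)                        # "2024"
    elif re.match(r"[a-z]{1,2}$", tok):
        print(tok, 0.01)                     # "ab", "x"
    else:
        print(tok, "NE lookup / default 1")  # "7"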
@@ -170,11 +171,11 @@ class Dealer:
return 1
def freq(t):
if re.match(r"[0-9\. -]+$", t):
return 10000
if re.match(r"[0-9. -]{2,}$", t):
return 3
s = huqie.freq(t)
if not s and re.match(r"[a-z\. -]+$", t):
return 10
if not s and re.match(r"[a-z. -]+$", t):
return 300
if not s:
s = 0
@@ -188,12 +189,12 @@ class Dealer:
return max(s, 10)
def df(t):
if re.match(r"[0-9\. -]+$", t):
return 100000
if re.match(r"[0-9. -]{2,}$", t):
return 5
if t in self.df:
return self.df[t] + 3
elif re.match(r"[a-z\. -]+$", t):
return 3
elif re.match(r"[a-z. -]+$", t):
return 300
elif len(t) >= 4:
s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
if len(s) > 1:
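# The freq()/df() edits above follow the same pattern: inside a character class "." needs no
# escaping, so "[0-9. -]" matches what "[0-9\. -]" did; the real changes are that number-like
# tokens must now be at least two characters long to hit the early return, and the constants
# move (10000 -> 3 and 10 -> 300 in freq, 100000 -> 5 and 3 -> 300 in df).
import re
assert re.match(r"[0-9. -]{2,}$", "3.14")
assert not re.match(r"[0-9. -]{2,}$", "7")   # single digits now fall through
assert re.match(r"[a-z. -]+$", "nasdaq")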
@@ -87,7 +87,9 @@ def dispatch():
if r["parser_id"] == "paper": page_size = r["parser_config"].get("task_page_size", 22)
if r["parser_id"] == "one": page_size = 1000000000
if not do_layout: page_size = 1000000000
for s,e in r["parser_config"].get("pages", [(1, 100000)]):
page_ranges = r["parser_config"].get("pages")
if not page_ranges: page_ranges = [(1, 100000)]
for s,e in page_ranges:
s -= 1
s = max(0, s)
e = min(e-1, pages)
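# The old one-liner only used the default range when the "pages" key was missing entirely;
# an explicitly empty list produced no ranges at all (and a null value would raise when
# iterated). The new guard covers those cases too.
parser_config = {"pages": []}                # hypothetical parser_config
page_ranges = parser_config.get("pages")
if not page_ranges:
    page_ranges = [(1, 100000)]
assert page_ranges == [(1, 100000)]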