### What problem does this PR solve?

Refactoring: rename the `huqie` tokenizer module to `rag_tokenizer` and its methods (`qie` → `tokenize`, `qieqie` → `fine_grained_tokenize`) throughout the code base.

### Type of change

- [x] Refactoring

tags/v0.5.0
| from elasticsearch_dsl import Q | from elasticsearch_dsl import Q | ||||
| from rag.app.qa import rmPrefix, beAdoc | from rag.app.qa import rmPrefix, beAdoc | ||||
| from rag.nlp import search, huqie | |||||
| from rag.nlp import search, rag_tokenizer | |||||
| from rag.utils.es_conn import ELASTICSEARCH | from rag.utils.es_conn import ELASTICSEARCH | ||||
| from rag.utils import rmSpace | from rag.utils import rmSpace | ||||
| from api.db import LLMType, ParserType | from api.db import LLMType, ParserType | ||||
| d = { | d = { | ||||
| "id": req["chunk_id"], | "id": req["chunk_id"], | ||||
| "content_with_weight": req["content_with_weight"]} | "content_with_weight": req["content_with_weight"]} | ||||
| d["content_ltks"] = huqie.qie(req["content_with_weight"]) | |||||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||||
| d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) | |||||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||||
| d["important_kwd"] = req["important_kwd"] | d["important_kwd"] = req["important_kwd"] | ||||
| d["important_tks"] = huqie.qie(" ".join(req["important_kwd"])) | |||||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) | |||||
| if "available_int" in req: | if "available_int" in req: | ||||
| d["available_int"] = req["available_int"] | d["available_int"] = req["available_int"] | ||||
| retmsg="Q&A must be separated by TAB/ENTER key.") | retmsg="Q&A must be separated by TAB/ENTER key.") | ||||
| q, a = rmPrefix(arr[0]), rmPrefix(arr[1]) | q, a = rmPrefix(arr[0]), rmPrefix(arr[1]) | ||||
| d = beAdoc(d, arr[0], arr[1], not any( | d = beAdoc(d, arr[0], arr[1], not any( | ||||
| [huqie.is_chinese(t) for t in q + a])) | |||||
| [rag_tokenizer.is_chinese(t) for t in q + a])) | |||||
| v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) | ||||
| v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] | v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] | ||||
| md5 = hashlib.md5() | md5 = hashlib.md5() | ||||
| md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) | md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) | ||||
| chunk_id = md5.hexdigest() | chunk_id = md5.hexdigest() | ||||
| d = {"id": chunk_id, "content_ltks": huqie.qie(req["content_with_weight"]), | |||||
| d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), | |||||
| "content_with_weight": req["content_with_weight"]} | "content_with_weight": req["content_with_weight"]} | ||||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||||
| d["important_kwd"] = req.get("important_kwd", []) | d["important_kwd"] = req.get("important_kwd", []) | ||||
| d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", []))) | |||||
| d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) | |||||
| d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] | ||||
| d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | d["create_timestamp_flt"] = datetime.datetime.now().timestamp() | ||||
| docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \ | docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \ | ||||
| .join(Document, on=(cls.model.doc_id == Document.id)) \ | .join(Document, on=(cls.model.doc_id == Document.id)) \ | ||||
| .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \ | .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \ | ||||
| .join(File, on=(File2Document.file_id == File.id)) \ | |||||
| .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \ | |||||
| .where( | .where( | ||||
| Document.status == StatusEnum.VALID.value, | Document.status == StatusEnum.VALID.value, | ||||
| Document.run == TaskStatus.RUNNING.value, | Document.run == TaskStatus.RUNNING.value, | ||||
| ~(Document.type == FileType.VIRTUAL.value), | ~(Document.type == FileType.VIRTUAL.value), | ||||
| cls.model.progress >= 0, | |||||
| cls.model.progress < 1, | cls.model.progress < 1, | ||||
| cls.model.create_time >= current_timestamp() - 180000 | |||||
| cls.model.create_time >= current_timestamp() - 1000 * 600 | |||||
| ) | ) | ||||
| docs = list(docs.dicts()) | docs = list(docs.dicts()) | ||||
| if not docs: return [] | if not docs: return [] |
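For reference, the rename is one-to-one: `huqie.qie` becomes `rag_tokenizer.tokenize` and `huqie.qieqie` becomes `rag_tokenizer.fine_grained_tokenize`. A minimal sketch of the updated call sites above, assuming the repo's `rag.nlp.rag_tokenizer` module and its bundled dictionary are available:

```python
from rag.nlp import rag_tokenizer

# Build the token fields of a chunk the same way the updated chunk endpoint does.
content = "Deep document understanding for retrieval-augmented generation."
d = {"content_with_weight": content}
d["content_ltks"] = rag_tokenizer.tokenize(content)                            # coarse, space-separated tokens
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])  # finer split of long tokens
print(d["content_ltks"])
print(d["content_sm_ltks"])
```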
| import re | import re | ||||
| import pandas as pd | import pandas as pd | ||||
| from collections import Counter | from collections import Counter | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from io import BytesIO | from io import BytesIO | ||||
| for p, n in patt: | for p, n in patt: | ||||
| if re.search(p, b): | if re.search(p, b): | ||||
| return n | return n | ||||
| tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1] | |||||
| tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1] | |||||
| if len(tks) > 3: | if len(tks) > 3: | ||||
| if len(tks) < 12: | if len(tks) < 12: | ||||
| return "Tx" | return "Tx" | ||||
| else: | else: | ||||
| return "Lx" | return "Lx" | ||||
| if len(tks) == 1 and huqie.tag(tks[0]) == "nr": | |||||
| if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr": | |||||
| return "Nr" | return "Nr" | ||||
| return "Ot" | return "Ot" |
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer | from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from copy import deepcopy | from copy import deepcopy | ||||
| from huggingface_hub import snapshot_download | from huggingface_hub import snapshot_download | ||||
| h = max(self.__height(up), self.__height(down)) | h = max(self.__height(up), self.__height(down)) | ||||
| y_dis = self._y_dis(up, down) | y_dis = self._y_dis(up, down) | ||||
| LEN = 6 | LEN = 6 | ||||
| tks_down = huqie.qie(down["text"][:LEN]).split(" ") | |||||
| tks_up = huqie.qie(up["text"][-LEN:]).split(" ") | |||||
| tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ") | |||||
| tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ") | |||||
| tks_all = up["text"][-LEN:].strip() \ | tks_all = up["text"][-LEN:].strip() \ | ||||
| + (" " if re.match(r"[a-zA-Z0-9]+", | + (" " if re.match(r"[a-zA-Z0-9]+", | ||||
| up["text"][-1] + down["text"][0]) else "") \ | up["text"][-1] + down["text"][0]) else "") \ | ||||
| + down["text"][:LEN].strip() | + down["text"][:LEN].strip() | ||||
| tks_all = huqie.qie(tks_all).split(" ") | |||||
| tks_all = rag_tokenizer.tokenize(tks_all).split(" ") | |||||
| fea = [ | fea = [ | ||||
| up.get("R", -1) == down.get("R", -1), | up.get("R", -1) == down.get("R", -1), | ||||
| y_dis / h, | y_dis / h, | ||||
| tks_down[-1] == tks_up[-1], | tks_down[-1] == tks_up[-1], | ||||
| max(down["in_row"], up["in_row"]), | max(down["in_row"], up["in_row"]), | ||||
| abs(down["in_row"] - up["in_row"]), | abs(down["in_row"] - up["in_row"]), | ||||
| len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0, | |||||
| len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0 | |||||
| len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0, | |||||
| len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0 | |||||
| ] | ] | ||||
| return fea | return fea | ||||
| if b["text"].strip()[0] != b_["text"].strip()[0] \ | if b["text"].strip()[0] != b_["text"].strip()[0] \ | ||||
| or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \ | or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \ | ||||
| or huqie.is_chinese(b["text"].strip()[0]) \ | |||||
| or rag_tokenizer.is_chinese(b["text"].strip()[0]) \ | |||||
| or b["top"] > b_["bottom"]: | or b["top"] > b_["bottom"]: | ||||
| i += 1 | i += 1 | ||||
| continue | continue |
| import re,json,os | import re,json,os | ||||
| import pandas as pd | import pandas as pd | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from . import regions | from . import regions | ||||
| current_file_path = os.path.dirname(os.path.abspath(__file__)) | current_file_path = os.path.dirname(os.path.abspath(__file__)) | ||||
| GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0) | GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0) | ||||
| def corpNorm(nm, add_region=True): | def corpNorm(nm, add_region=True): | ||||
| global CORP_TKS | global CORP_TKS | ||||
| if not nm or type(nm)!=type(""):return "" | if not nm or type(nm)!=type(""):return "" | ||||
| nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower() | |||||
| nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower() | |||||
| nm = re.sub(r"&", "&", nm) | nm = re.sub(r"&", "&", nm) | ||||
| nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm) | nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm) | ||||
| nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE) | nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE) | ||||
| nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE) | nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE) | ||||
| if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm | if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm | ||||
| tks = huqie.qie(nm).split(" ") | |||||
| tks = rag_tokenizer.tokenize(nm).split(" ") | |||||
| reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)] | reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)] | ||||
| nm = "" | nm = "" | ||||
| for t in tks: | for t in tks: |
| traceback, signal | traceback, signal | ||||
| import numpy as np | import numpy as np | ||||
| from deepdoc.parser.resume.entities import degrees, schools, corporations | from deepdoc.parser.resume.entities import degrees, schools, corporations | ||||
| from rag.nlp import huqie, surname | |||||
| from rag.nlp import rag_tokenizer, surname | |||||
| from xpinyin import Pinyin | from xpinyin import Pinyin | ||||
| from contextlib import contextmanager | from contextlib import contextmanager | ||||
| if n.get("school_name") and isinstance(n["school_name"], str): | if n.get("school_name") and isinstance(n["school_name"], str): | ||||
| sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"])) | sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"])) | ||||
| e["sch_nm_kwd"] = sch[-1] | e["sch_nm_kwd"] = sch[-1] | ||||
| fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1]) | |||||
| fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1]) | |||||
| if n.get("discipline_name") and isinstance(n["discipline_name"], str): | if n.get("discipline_name") and isinstance(n["discipline_name"], str): | ||||
| maj.append(n["discipline_name"]) | maj.append(n["discipline_name"]) | ||||
| if "tag_kwd" not in cv: cv["tag_kwd"] = [] | if "tag_kwd" not in cv: cv["tag_kwd"] = [] | ||||
| if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历") | if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历") | ||||
| if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj)) | |||||
| if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch)) | |||||
| if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch)) | |||||
| if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj)) | |||||
| if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj)) | |||||
| if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch)) | |||||
| if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch)) | |||||
| if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj)) | |||||
| return cv | return cv | ||||
| if n.get("achivement"): desc.append(str(n["achivement"])) | if n.get("achivement"): desc.append(str(n["achivement"])) | ||||
| if pro_nms: | if pro_nms: | ||||
| # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms)) | |||||
| cv["project_name_tks"] = huqie.qie(pro_nms[0]) | |||||
| # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms)) | |||||
| cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0]) | |||||
| if desc: | if desc: | ||||
| cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc))) | |||||
| cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0])) | |||||
| cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc))) | |||||
| cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0])) | |||||
| return cv | return cv | ||||
| if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"] | if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"] | ||||
| if fea["position_name"]: | if fea["position_name"]: | ||||
| cv["position_name_tks"] = huqie.qie(fea["position_name"][0]) | |||||
| cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"]) | |||||
| cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:])) | |||||
| cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0]) | |||||
| cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"]) | |||||
| cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:])) | |||||
| if fea["industry_name"]: | if fea["industry_name"]: | ||||
| cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0]) | |||||
| cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"]) | |||||
| cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:])) | |||||
| cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0]) | |||||
| cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"]) | |||||
| cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:])) | |||||
| if fea["corporation_name"]: | if fea["corporation_name"]: | ||||
| cv["corporation_name_kwd"] = fea["corporation_name"][0] | cv["corporation_name_kwd"] = fea["corporation_name"][0] | ||||
| cv["corp_nm_kwd"] = fea["corporation_name"] | cv["corp_nm_kwd"] = fea["corporation_name"] | ||||
| cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0]) | |||||
| cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"]) | |||||
| cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:])) | |||||
| cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0]) | |||||
| cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"]) | |||||
| cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:])) | |||||
| if fea["responsibilities"]: | if fea["responsibilities"]: | ||||
| cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0]) | |||||
| cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:])) | |||||
| cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0]) | |||||
| cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:])) | |||||
| if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if | if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if | ||||
| re.match(r"[^0-9]+$", str(i))] | re.match(r"[^0-9]+$", str(i))] | ||||
| if nms: | if nms: | ||||
| t = k[:-4] | t = k[:-4] | ||||
| cv[f"{t}_kwd"] = nms | cv[f"{t}_kwd"] = nms | ||||
| cv[f"{t}_tks"] = huqie.qie(" ".join(nms)) | |||||
| cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms)) | |||||
| except Exception as e: | except Exception as e: | ||||
| print("【EXCEPTION】:", str(traceback.format_exc()), cv[k]) | print("【EXCEPTION】:", str(traceback.format_exc()), cv[k]) | ||||
| cv[k] = [] | cv[k] = [] | ||||
| # tokenize fields | # tokenize fields | ||||
| if k in tks_fld: | if k in tks_fld: | ||||
| cv[f"{k}_tks"] = huqie.qie(cv[k]) | |||||
| if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"]) | |||||
| cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k]) | |||||
| if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"]) | |||||
| # keyword fields | # keyword fields | ||||
| if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower() | if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower() | ||||
| cv["name_kwd"] = name | cv["name_kwd"] = name | ||||
| cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3] | cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3] | ||||
| cv["name_tks"] = ( | cv["name_tks"] = ( | ||||
| huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") | |||||
| rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") | |||||
| ) if name else "" | ) if name else "" | ||||
| else: | else: | ||||
| cv["integerity_flt"] /= 2. | cv["integerity_flt"] /= 2. | ||||
| cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | ||||
| # long text tokenize | # long text tokenize | ||||
| if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"])) | |||||
| if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"])) | |||||
| # for yes or no field | # for yes or no field | ||||
| fea = [] | fea = [] |
| from huggingface_hub import snapshot_download | from huggingface_hub import snapshot_download | ||||
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from .recognizer import Recognizer | from .recognizer import Recognizer | ||||
| for p, n in patt: | for p, n in patt: | ||||
| if re.search(p, b["text"].strip()): | if re.search(p, b["text"].strip()): | ||||
| return n | return n | ||||
| tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1] | |||||
| tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1] | |||||
| if len(tks) > 3: | if len(tks) > 3: | ||||
| if len(tks) < 12: | if len(tks) < 12: | ||||
| return "Tx" | return "Tx" | ||||
| else: | else: | ||||
| return "Lx" | return "Lx" | ||||
| if len(tks) == 1 and huqie.tag(tks[0]) == "nr": | |||||
| if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr": | |||||
| return "Nr" | return "Nr" | ||||
| return "Ot" | return "Ot" |
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ | ||||
| hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ | hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ | ||||
| tokenize_chunks, find_codec | tokenize_chunks, find_codec | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser | from deepdoc.parser import PdfParser, DocxParser, PlainParser | ||||
| """ | """ | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| pdf_parser = None | pdf_parser = None | ||||
| sections, tbls = [], [] | sections, tbls = [], [] | ||||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | if re.search(r"\.docx$", filename, re.IGNORECASE): |
| from api.db import ParserType | from api.db import ParserType | ||||
| from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ | ||||
| make_colon_as_title, add_positions, tokenize_chunks, find_codec | make_colon_as_title, add_positions, tokenize_chunks, find_codec | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from deepdoc.parser import PdfParser, DocxParser, PlainParser | from deepdoc.parser import PdfParser, DocxParser, PlainParser | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| """ | """ | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| pdf_parser = None | pdf_parser = None | ||||
| sections = [] | sections = [] | ||||
| if re.search(r"\.docx$", filename, re.IGNORECASE): | if re.search(r"\.docx$", filename, re.IGNORECASE): |
| import re | import re | ||||
| from api.db import ParserType | from api.db import ParserType | ||||
| from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||||
| from deepdoc.parser import PdfParser, PlainParser | from deepdoc.parser import PdfParser, PlainParser | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename | "docnm_kwd": filename | ||||
| } | } | ||||
| doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | |||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| # is it English | # is it English | ||||
| eng = lang.lower() == "english" # pdf_parser.is_english | eng = lang.lower() == "english" # pdf_parser.is_english | ||||
| from timeit import default_timer as timer | from timeit import default_timer as timer | ||||
| import re | import re | ||||
| from deepdoc.parser.pdf_parser import PlainParser | from deepdoc.parser.pdf_parser import PlainParser | ||||
| from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec | |||||
| from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec | |||||
| from deepdoc.parser import PdfParser, ExcelParser, DocxParser | from deepdoc.parser import PdfParser, ExcelParser, DocxParser | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True}) | "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True}) | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| res = [] | res = [] | ||||
| pdf_parser = None | pdf_parser = None | ||||
| sections = [] | sections = [] |
| from io import BytesIO | from io import BytesIO | ||||
| import re | import re | ||||
| from rag.app import laws | from rag.app import laws | ||||
| from rag.nlp import huqie, tokenize, find_codec | |||||
| from rag.nlp import rag_tokenizer, tokenize, find_codec | |||||
| from deepdoc.parser import PdfParser, ExcelParser, PlainParser | from deepdoc.parser import PdfParser, ExcelParser, PlainParser | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| tokenize(doc, "\n".join(sections), eng) | tokenize(doc, "\n".join(sections), eng) | ||||
| return [doc] | return [doc] | ||||
| from collections import Counter | from collections import Counter | ||||
| from api.db import ParserType | from api.db import ParserType | ||||
| from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||||
| from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks | |||||
| from deepdoc.parser import PdfParser, PlainParser | from deepdoc.parser import PdfParser, PlainParser | ||||
| import numpy as np | import numpy as np | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| else: | else: | ||||
| raise NotImplementedError("file type not supported yet(pdf supported)") | raise NotImplementedError("file type not supported yet(pdf supported)") | ||||
| doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]), | |||||
| "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)} | |||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) | |||||
| doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]), | |||||
| "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)} | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"]) | |||||
| # is it English | # is it English | ||||
| eng = lang.lower() == "english" # pdf_parser.is_english | eng = lang.lower() == "english" # pdf_parser.is_english | ||||
| print("It's English.....", eng) | print("It's English.....", eng) |
| from PIL import Image | from PIL import Image | ||||
| from rag.nlp import tokenize, is_english | from rag.nlp import tokenize, is_english | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from deepdoc.parser import PdfParser, PptParser, PlainParser | from deepdoc.parser import PdfParser, PptParser, PlainParser | ||||
| from PyPDF2 import PdfReader as pdf2_read | from PyPDF2 import PdfReader as pdf2_read | ||||
| eng = lang.lower() == "english" | eng = lang.lower() == "english" | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| res = [] | res = [] | ||||
| if re.search(r"\.pptx?$", filename, re.IGNORECASE): | if re.search(r"\.pptx?$", filename, re.IGNORECASE): | ||||
| ppt_parser = Ppt() | ppt_parser = Ppt() |
| from nltk import word_tokenize | from nltk import word_tokenize | ||||
| from openpyxl import load_workbook | from openpyxl import load_workbook | ||||
| from rag.nlp import is_english, random_choices, find_codec | from rag.nlp import is_english, random_choices, find_codec | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from deepdoc.parser import ExcelParser | from deepdoc.parser import ExcelParser | ||||
| aprefix = "Answer: " if eng else "回答:" | aprefix = "Answer: " if eng else "回答:" | ||||
| d["content_with_weight"] = "\t".join( | d["content_with_weight"] = "\t".join( | ||||
| [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) | ||||
| d["content_ltks"] = huqie.qie(q) | |||||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||||
| d["content_ltks"] = rag_tokenizer.tokenize(q) | |||||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||||
| return d | return d | ||||
| res = [] | res = [] | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | if re.search(r"\.xlsx?$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") |
| import pandas as pd | import pandas as pd | ||||
| import requests | import requests | ||||
| from api.db.services.knowledgebase_service import KnowledgebaseService | from api.db.services.knowledgebase_service import KnowledgebaseService | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from deepdoc.parser.resume import refactor | from deepdoc.parser.resume import refactor | ||||
| from deepdoc.parser.resume import step_one, step_two | from deepdoc.parser.resume import step_one, step_two | ||||
| from rag.settings import cron_logger | from rag.settings import cron_logger | ||||
| titles.append(str(v)) | titles.append(str(v)) | ||||
| doc = { | doc = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie("-".join(titles) + "-简历") | |||||
| "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历") | |||||
| } | } | ||||
| doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) | |||||
| doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) | |||||
| pairs = [] | pairs = [] | ||||
| for n, m in field_map.items(): | for n, m in field_map.items(): | ||||
| if not resume.get(n): | if not resume.get(n): | ||||
| doc["content_with_weight"] = "\n".join( | doc["content_with_weight"] = "\n".join( | ||||
| ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) | ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) | ||||
| doc["content_ltks"] = huqie.qie(doc["content_with_weight"]) | |||||
| doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"]) | |||||
| doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"]) | |||||
| doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"]) | |||||
| for n, _ in field_map.items(): | for n, _ in field_map.items(): | ||||
| if n not in resume: | if n not in resume: | ||||
| continue | continue | ||||
| len(resume[n]) == 1 or n not in forbidden_select_fields4resume): | len(resume[n]) == 1 or n not in forbidden_select_fields4resume): | ||||
| resume[n] = resume[n][0] | resume[n] = resume[n][0] | ||||
| if n.find("_tks") > 0: | if n.find("_tks") > 0: | ||||
| resume[n] = huqie.qieqie(resume[n]) | |||||
| resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) | |||||
| doc[n] = resume[n] | doc[n] = resume[n] | ||||
| print(doc) | print(doc) |
| from dateutil.parser import parse as datetime_parse | from dateutil.parser import parse as datetime_parse | ||||
| from api.db.services.knowledgebase_service import KnowledgebaseService | from api.db.services.knowledgebase_service import KnowledgebaseService | ||||
| from rag.nlp import huqie, is_english, tokenize, find_codec | |||||
| from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec | |||||
| from deepdoc.parser import ExcelParser | from deepdoc.parser import ExcelParser | ||||
| for ii, row in df.iterrows(): | for ii, row in df.iterrows(): | ||||
| d = { | d = { | ||||
| "docnm_kwd": filename, | "docnm_kwd": filename, | ||||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||||
| } | } | ||||
| row_txt = [] | row_txt = [] | ||||
| for j in range(len(clmns)): | for j in range(len(clmns)): | ||||
| if pd.isna(row[clmns[j]]): | if pd.isna(row[clmns[j]]): | ||||
| continue | continue | ||||
| fld = clmns_map[j][0] | fld = clmns_map[j][0] | ||||
| d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie( | |||||
| d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize( | |||||
| row[clmns[j]]) | row[clmns[j]]) | ||||
| row_txt.append("{}:{}".format(clmns[j], row[clmns[j]])) | row_txt.append("{}:{}".format(clmns[j], row[clmns[j]])) | ||||
| if not row_txt: | if not row_txt: |
| from collections import Counter | from collections import Counter | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| from . import huqie | |||||
| from . import rag_tokenizer | |||||
| import re | import re | ||||
| import copy | import copy | ||||
| def tokenize(d, t, eng): | def tokenize(d, t, eng): | ||||
| d["content_with_weight"] = t | d["content_with_weight"] = t | ||||
| t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t) | t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t) | ||||
| d["content_ltks"] = huqie.qie(t) | |||||
| d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) | |||||
| d["content_ltks"] = rag_tokenizer.tokenize(t) | |||||
| d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) | |||||
| def tokenize_chunks(chunks, doc, eng, pdf_parser): | def tokenize_chunks(chunks, doc, eng, pdf_parser): |
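Every parser in `rag/app` follows the same shape after the rename: build a `doc` skeleton with `title_tks`/`title_sm_tks`, then let `tokenize(d, t, eng)` above fill in the content fields. A hedged sketch (filename and text are placeholders):

```python
import re
from rag.nlp import rag_tokenizer, tokenize

filename = "example.pdf"  # placeholder
doc = {
    "docnm_kwd": filename,
    "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

d = dict(doc)
tokenize(d, "Some extracted section text.", True)  # fills content_with_weight / content_ltks / content_sm_ltks
```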
| import copy | import copy | ||||
| from elasticsearch_dsl import Q | from elasticsearch_dsl import Q | ||||
| from rag.nlp import huqie, term_weight, synonym | |||||
| from rag.nlp import rag_tokenizer, term_weight, synonym | |||||
| class EsQueryer: | class EsQueryer: | ||||
| txt = re.sub( | txt = re.sub( | ||||
| r"[ \r\n\t,,。??/`!!&]+", | r"[ \r\n\t,,。??/`!!&]+", | ||||
| " ", | " ", | ||||
| huqie.tradi2simp( | |||||
| huqie.strQ2B( | |||||
| rag_tokenizer.tradi2simp( | |||||
| rag_tokenizer.strQ2B( | |||||
| txt.lower()))).strip() | txt.lower()))).strip() | ||||
| txt = EsQueryer.rmWWW(txt) | txt = EsQueryer.rmWWW(txt) | ||||
| if not self.isChinese(txt): | if not self.isChinese(txt): | ||||
| tks = huqie.qie(txt).split(" ") | |||||
| tks = rag_tokenizer.tokenize(txt).split(" ") | |||||
| q = copy.deepcopy(tks) | q = copy.deepcopy(tks) | ||||
| for i in range(1, len(tks)): | for i in range(1, len(tks)): | ||||
| q.append("\"%s %s\"^2" % (tks[i - 1], tks[i])) | q.append("\"%s %s\"^2" % (tks[i - 1], tks[i])) | ||||
| boost=1)#, minimum_should_match=min_match) | boost=1)#, minimum_should_match=min_match) | ||||
| ), tks | ), tks | ||||
| def needQieqie(tk): | |||||
| def need_fine_grained_tokenize(tk): | |||||
| if len(tk) < 4: | if len(tk) < 4: | ||||
| return False | return False | ||||
| if re.match(r"[0-9a-z\.\+#_\*-]+$", tk): | if re.match(r"[0-9a-z\.\+#_\*-]+$", tk): | ||||
| logging.info(json.dumps(twts, ensure_ascii=False)) | logging.info(json.dumps(twts, ensure_ascii=False)) | ||||
| tms = [] | tms = [] | ||||
| for tk, w in sorted(twts, key=lambda x: x[1] * -1): | for tk, w in sorted(twts, key=lambda x: x[1] * -1): | ||||
| sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else [] | |||||
| sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else [] | |||||
| sm = [ | sm = [ | ||||
| re.sub( | re.sub( | ||||
| r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+", | r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+", | ||||
| if len(twts) > 1: | if len(twts) > 1: | ||||
| tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts])) | tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts])) | ||||
| if re.match(r"[0-9a-z ]+$", tt): | if re.match(r"[0-9a-z ]+$", tt): | ||||
| tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt) | |||||
| tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt) | |||||
| syns = " OR ".join( | syns = " OR ".join( | ||||
| ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns]) | |||||
| ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns]) | |||||
| if syns: | if syns: | ||||
| tms = f"({tms})^5 OR ({syns})^0.7" | tms = f"({tms})^5 OR ({syns})^0.7" | ||||
| # -*- coding: utf-8 -*- | |||||
| import copy | |||||
| import datrie | |||||
| import math | |||||
| import os | |||||
| import re | |||||
| import string | |||||
| import sys | |||||
| from hanziconv import HanziConv | |||||
| from huggingface_hub import snapshot_download | |||||
| from nltk import word_tokenize | |||||
| from nltk.stem import PorterStemmer, WordNetLemmatizer | |||||
| from api.utils.file_utils import get_project_base_directory | |||||
| class RagTokenizer: | |||||
| def key_(self, line): | |||||
| return str(line.lower().encode("utf-8"))[2:-1] | |||||
| def rkey_(self, line): | |||||
| return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1] | |||||
| def loadDict_(self, fnm): | |||||
| print("[HUQIE]:Build trie", fnm, file=sys.stderr) | |||||
| try: | |||||
| of = open(fnm, "r") | |||||
| while True: | |||||
| line = of.readline() | |||||
| if not line: | |||||
| break | |||||
| line = re.sub(r"[\r\n]+", "", line) | |||||
| line = re.split(r"[ \t]", line) | |||||
| k = self.key_(line[0]) | |||||
| F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5) | |||||
| if k not in self.trie_ or self.trie_[k][0] < F: | |||||
| self.trie_[self.key_(line[0])] = (F, line[2]) | |||||
| self.trie_[self.rkey_(line[0])] = 1 | |||||
| self.trie_.save(fnm + ".trie") | |||||
| of.close() | |||||
| except Exception as e: | |||||
| print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr) | |||||
| def __init__(self, debug=False): | |||||
| self.DEBUG = debug | |||||
| self.DENOMINATOR = 1000000 | |||||
| self.trie_ = datrie.Trie(string.printable) | |||||
| self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie") | |||||
| self.stemmer = PorterStemmer() | |||||
| self.lemmatizer = WordNetLemmatizer() | |||||
| self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" | |||||
| try: | |||||
| self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") | |||||
| return | |||||
| except Exception as e: | |||||
| print("[HUQIE]:Build default trie", file=sys.stderr) | |||||
| self.trie_ = datrie.Trie(string.printable) | |||||
| self.loadDict_(self.DIR_ + ".txt") | |||||
| def loadUserDict(self, fnm): | |||||
| try: | |||||
| self.trie_ = datrie.Trie.load(fnm + ".trie") | |||||
| return | |||||
| except Exception as e: | |||||
| self.trie_ = datrie.Trie(string.printable) | |||||
| self.loadDict_(fnm) | |||||
| def addUserDict(self, fnm): | |||||
| self.loadDict_(fnm) | |||||
| def _strQ2B(self, ustring): | |||||
| """把字符串全角转半角""" | |||||
| rstring = "" | |||||
| for uchar in ustring: | |||||
| inside_code = ord(uchar) | |||||
| if inside_code == 0x3000: | |||||
| inside_code = 0x0020 | |||||
| else: | |||||
| inside_code -= 0xfee0 | |||||
| if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character after conversion; keep the original | |||||
| rstring += uchar | |||||
| else: | |||||
| rstring += chr(inside_code) | |||||
| return rstring | |||||
| def _tradi2simp(self, line): | |||||
| return HanziConv.toSimplified(line) | |||||
| def dfs_(self, chars, s, preTks, tkslist): | |||||
| MAX_L = 10 | |||||
| res = s | |||||
| # if s > MAX_L or s>= len(chars): | |||||
| if s >= len(chars): | |||||
| tkslist.append(preTks) | |||||
| return res | |||||
| # pruning | |||||
| S = s + 1 | |||||
| if s + 2 <= len(chars): | |||||
| t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2]) | |||||
| if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix( | |||||
| self.key_(t2)): | |||||
| S = s + 2 | |||||
| if len(preTks) > 2 and len( | |||||
| preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1: | |||||
| t1 = preTks[-1][0] + "".join(chars[s:s + 1]) | |||||
| if self.trie_.has_keys_with_prefix(self.key_(t1)): | |||||
| S = s + 2 | |||||
| ################ | |||||
| for e in range(S, len(chars) + 1): | |||||
| t = "".join(chars[s:e]) | |||||
| k = self.key_(t) | |||||
| if e > s + 1 and not self.trie_.has_keys_with_prefix(k): | |||||
| break | |||||
| if k in self.trie_: | |||||
| pretks = copy.deepcopy(preTks) | |||||
| if k in self.trie_: | |||||
| pretks.append((t, self.trie_[k])) | |||||
| else: | |||||
| pretks.append((t, (-12, ''))) | |||||
| res = max(res, self.dfs_(chars, e, pretks, tkslist)) | |||||
| if res > s: | |||||
| return res | |||||
| t = "".join(chars[s:s + 1]) | |||||
| k = self.key_(t) | |||||
| if k in self.trie_: | |||||
| preTks.append((t, self.trie_[k])) | |||||
| else: | |||||
| preTks.append((t, (-12, ''))) | |||||
| return self.dfs_(chars, s + 1, preTks, tkslist) | |||||
| def freq(self, tk): | |||||
| k = self.key_(tk) | |||||
| if k not in self.trie_: | |||||
| return 0 | |||||
| return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5) | |||||
| def tag(self, tk): | |||||
| k = self.key_(tk) | |||||
| if k not in self.trie_: | |||||
| return "" | |||||
| return self.trie_[k][1] | |||||
| def score_(self, tfts): | |||||
| B = 30 | |||||
| F, L, tks = 0, 0, [] | |||||
| for tk, (freq, tag) in tfts: | |||||
| F += freq | |||||
| L += 0 if len(tk) < 2 else 1 | |||||
| tks.append(tk) | |||||
| F /= len(tks) | |||||
| L /= len(tks) | |||||
| if self.DEBUG: | |||||
| print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F) | |||||
| return tks, B / len(tks) + L + F | |||||
| def sortTks_(self, tkslist): | |||||
| res = [] | |||||
| for tfts in tkslist: | |||||
| tks, s = self.score_(tfts) | |||||
| res.append((tks, s)) | |||||
| return sorted(res, key=lambda x: x[1], reverse=True) | |||||
| def merge_(self, tks): | |||||
| patts = [ | |||||
| (r"[ ]+", " "), | |||||
| (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"), | |||||
| ] | |||||
| # for p,s in patts: tks = re.sub(p, s, tks) | |||||
| # if split chars is part of token | |||||
| res = [] | |||||
| tks = re.sub(r"[ ]+", " ", tks).split(" ") | |||||
| s = 0 | |||||
| while True: | |||||
| if s >= len(tks): | |||||
| break | |||||
| E = s + 1 | |||||
| for e in range(s + 2, min(len(tks) + 2, s + 6)): | |||||
| tk = "".join(tks[s:e]) | |||||
| if re.search(self.SPLIT_CHAR, tk) and self.freq(tk): | |||||
| E = e | |||||
| res.append("".join(tks[s:E])) | |||||
| s = E | |||||
| return " ".join(res) | |||||
| def maxForward_(self, line): | |||||
| res = [] | |||||
| s = 0 | |||||
| while s < len(line): | |||||
| e = s + 1 | |||||
| t = line[s:e] | |||||
| while e < len(line) and self.trie_.has_keys_with_prefix( | |||||
| self.key_(t)): | |||||
| e += 1 | |||||
| t = line[s:e] | |||||
| while e - 1 > s and self.key_(t) not in self.trie_: | |||||
| e -= 1 | |||||
| t = line[s:e] | |||||
| if self.key_(t) in self.trie_: | |||||
| res.append((t, self.trie_[self.key_(t)])) | |||||
| else: | |||||
| res.append((t, (0, ''))) | |||||
| s = e | |||||
| return self.score_(res) | |||||
| def maxBackward_(self, line): | |||||
| res = [] | |||||
| s = len(line) - 1 | |||||
| while s >= 0: | |||||
| e = s + 1 | |||||
| t = line[s:e] | |||||
| while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)): | |||||
| s -= 1 | |||||
| t = line[s:e] | |||||
| while s + 1 < e and self.key_(t) not in self.trie_: | |||||
| s += 1 | |||||
| t = line[s:e] | |||||
| if self.key_(t) in self.trie_: | |||||
| res.append((t, self.trie_[self.key_(t)])) | |||||
| else: | |||||
| res.append((t, (0, ''))) | |||||
| s -= 1 | |||||
| return self.score_(res[::-1]) | |||||
| def tokenize(self, line): | |||||
| line = self._strQ2B(line).lower() | |||||
| line = self._tradi2simp(line) | |||||
| zh_num = len([1 for c in line if is_chinese(c)]) | |||||
| if zh_num < len(line) * 0.2: | |||||
| return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)]) | |||||
| arr = re.split(self.SPLIT_CHAR, line) | |||||
| res = [] | |||||
| for L in arr: | |||||
| if len(L) < 2 or re.match( | |||||
| r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L): | |||||
| res.append(L) | |||||
| continue | |||||
| # print(L) | |||||
| # use maxforward for the first time | |||||
| tks, s = self.maxForward_(L) | |||||
| tks1, s1 = self.maxBackward_(L) | |||||
| if self.DEBUG: | |||||
| print("[FW]", tks, s) | |||||
| print("[BW]", tks1, s1) | |||||
| diff = [0 for _ in range(max(len(tks1), len(tks)))] | |||||
| for i in range(min(len(tks1), len(tks))): | |||||
| if tks[i] != tks1[i]: | |||||
| diff[i] = 1 | |||||
| if s1 > s: | |||||
| tks = tks1 | |||||
| i = 0 | |||||
| while i < len(tks): | |||||
| s = i | |||||
| while s < len(tks) and diff[s] == 0: | |||||
| s += 1 | |||||
| if s == len(tks): | |||||
| res.append(" ".join(tks[i:])) | |||||
| break | |||||
| if s > i: | |||||
| res.append(" ".join(tks[i:s])) | |||||
| e = s | |||||
| while e < len(tks) and e - s < 5 and diff[e] == 1: | |||||
| e += 1 | |||||
| tkslist = [] | |||||
| self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist) | |||||
| res.append(" ".join(self.sortTks_(tkslist)[0][0])) | |||||
| i = e + 1 | |||||
| res = " ".join(res) | |||||
| if self.DEBUG: | |||||
| print("[TKS]", self.merge_(res)) | |||||
| return self.merge_(res) | |||||
| def fine_grained_tokenize(self, tks): | |||||
| tks = tks.split(" ") | |||||
| zh_num = len([1 for c in tks if c and is_chinese(c[0])]) | |||||
| if zh_num < len(tks) * 0.2: | |||||
| res = [] | |||||
| for tk in tks: | |||||
| res.extend(tk.split("/")) | |||||
| return " ".join(res) | |||||
| res = [] | |||||
| for tk in tks: | |||||
| if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk): | |||||
| res.append(tk) | |||||
| continue | |||||
| tkslist = [] | |||||
| if len(tk) > 10: | |||||
| tkslist.append(tk) | |||||
| else: | |||||
| self.dfs_(tk, 0, [], tkslist) | |||||
| if len(tkslist) < 2: | |||||
| res.append(tk) | |||||
| continue | |||||
| stk = self.sortTks_(tkslist)[1][0] | |||||
| if len(stk) == len(tk): | |||||
| stk = tk | |||||
| else: | |||||
| if re.match(r"[a-z\.-]+$", tk): | |||||
| for t in stk: | |||||
| if len(t) < 3: | |||||
| stk = tk | |||||
| break | |||||
| else: | |||||
| stk = " ".join(stk) | |||||
| else: | |||||
| stk = " ".join(stk) | |||||
| res.append(stk) | |||||
| return " ".join(res) | |||||
| def is_chinese(s): | |||||
| if s >= u'\u4e00' and s <= u'\u9fa5': | |||||
| return True | |||||
| else: | |||||
| return False | |||||
| def is_number(s): | |||||
| if s >= u'\u0030' and s <= u'\u0039': | |||||
| return True | |||||
| else: | |||||
| return False | |||||
| def is_alphabet(s): | |||||
| if (s >= u'\u0041' and s <= u'\u005a') or ( | |||||
| s >= u'\u0061' and s <= u'\u007a'): | |||||
| return True | |||||
| else: | |||||
| return False | |||||
| def naiveQie(txt): | |||||
| tks = [] | |||||
| for t in txt.split(" "): | |||||
| if tks and re.match(r".*[a-zA-Z]$", tks[-1] | |||||
| ) and re.match(r".*[a-zA-Z]$", t): | |||||
| tks.append(" ") | |||||
| tks.append(t) | |||||
| return tks | |||||
| tokenizer = RagTokenizer() | |||||
| tokenize = tokenizer.tokenize | |||||
| fine_grained_tokenize = tokenizer.fine_grained_tokenize | |||||
| tag = tokenizer.tag | |||||
| freq = tokenizer.freq | |||||
| loadUserDict = tokenizer.loadUserDict | |||||
| addUserDict = tokenizer.addUserDict | |||||
| tradi2simp = tokenizer._tradi2simp | |||||
| strQ2B = tokenizer._strQ2B | |||||
| if __name__ == '__main__': | |||||
| tknzr = RagTokenizer(debug=True) | |||||
| # huqie.addUserDict("/tmp/tmp.new.tks.dict") | |||||
| tks = tknzr.tokenize( | |||||
| "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize( | |||||
| "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize( | |||||
| "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize( | |||||
| "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize("虽然我不怎么玩") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize( | |||||
| "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| tks = tknzr.tokenize( | |||||
| "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-") | |||||
| print(tknzr.fine_grained_tokenize(tks)) | |||||
| if len(sys.argv) < 2: | |||||
| sys.exit() | |||||
| tknzr.DEBUG = False | |||||
| tknzr.loadUserDict(sys.argv[1]) | |||||
| of = open(sys.argv[2], "r") | |||||
| while True: | |||||
| line = of.readline() | |||||
| if not line: | |||||
| break | |||||
| print(tknzr.tokenize(line)) | |||||
| of.close() |
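A usage sketch of the new `RagTokenizer`, in the spirit of the `__main__` block above; actual output depends on the bundled `rag/res/huqie` dictionary being present:

```python
tknzr = RagTokenizer(debug=False)

tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211")
print(tks)                               # coarse segmentation of a mixed Chinese/English line
print(tknzr.fine_grained_tokenize(tks))  # further splits longer Chinese tokens

print(tknzr.tag("北京"))                  # POS tag from the trie ("" if the token is unknown)
print(tknzr.freq("北京"))                 # approximate corpus frequency from the trie (0 if unknown)
```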
| from rag.settings import es_logger | from rag.settings import es_logger | ||||
| from rag.utils import rmSpace | from rag.utils import rmSpace | ||||
| from rag.nlp import huqie, query | |||||
| from rag.nlp import rag_tokenizer, query | |||||
| import numpy as np | import numpy as np | ||||
| kwds = set([]) | kwds = set([]) | ||||
| for k in keywords: | for k in keywords: | ||||
| kwds.add(k) | kwds.add(k) | ||||
| for kk in huqie.qieqie(k).split(" "): | |||||
| for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "): | |||||
| if len(kk) < 2: | if len(kk) < 2: | ||||
| continue | continue | ||||
| if kk in kwds: | if kk in kwds: | ||||
| assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format( | assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format( | ||||
| len(ans_v[0]), len(chunk_v[0])) | len(ans_v[0]), len(chunk_v[0])) | ||||
| chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") | |||||
| chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ") | |||||
| for ck in chunks] | for ck in chunks] | ||||
| cites = {} | cites = {} | ||||
| thr = 0.63 | thr = 0.63 | ||||
| for i, a in enumerate(pieces_): | for i, a in enumerate(pieces_): | ||||
| sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], | sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], | ||||
| chunk_v, | chunk_v, | ||||
| huqie.qie( | |||||
| rag_tokenizer.tokenize( | |||||
| self.qryr.rmWWW(pieces_[i])).split(" "), | self.qryr.rmWWW(pieces_[i])).split(" "), | ||||
| chunks_tks, | chunks_tks, | ||||
| tkweight, vtweight) | tkweight, vtweight) | ||||
| def hybrid_similarity(self, ans_embd, ins_embd, ans, inst): | def hybrid_similarity(self, ans_embd, ins_embd, ans, inst): | ||||
| return self.qryr.hybrid_similarity(ans_embd, | return self.qryr.hybrid_similarity(ans_embd, | ||||
| ins_embd, | ins_embd, | ||||
| huqie.qie(ans).split(" "), | |||||
| huqie.qie(inst).split(" ")) | |||||
| rag_tokenizer.tokenize(ans).split(" "), | |||||
| rag_tokenizer.tokenize(inst).split(" ")) | |||||
| def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2, | def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2, | ||||
| vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True): | vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True): | ||||
| for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql): | for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql): | ||||
| fld, v = r.group(1), r.group(3) | fld, v = r.group(1), r.group(3) | ||||
| match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format( | match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format( | ||||
| fld, huqie.qieqie(huqie.qie(v))) | |||||
| fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v))) | |||||
| replaces.append( | replaces.append( | ||||
| ("{}{}'{}'".format( | ("{}{}'{}'".format( | ||||
| r.group(1), | r.group(1), |
| import re | import re | ||||
| import os | import os | ||||
| import numpy as np | import numpy as np | ||||
| from rag.nlp import huqie | |||||
| from rag.nlp import rag_tokenizer | |||||
| from api.utils.file_utils import get_project_base_directory | from api.utils.file_utils import get_project_base_directory | ||||
| txt = re.sub(p, r, txt) | txt = re.sub(p, r, txt) | ||||
| res = [] | res = [] | ||||
| for t in huqie.qie(txt).split(" "): | |||||
| for t in rag_tokenizer.tokenize(txt).split(" "): | |||||
| tk = t | tk = t | ||||
| if (stpwd and tk in self.stop_words) or ( | if (stpwd and tk in self.stop_words) or ( | ||||
| re.match(r"[0-9]$", tk) and not num): | re.match(r"[0-9]$", tk) and not num): | ||||
| return m[self.ne[t]] | return m[self.ne[t]] | ||||
| def postag(t): | def postag(t): | ||||
| t = huqie.tag(t) | |||||
| t = rag_tokenizer.tag(t) | |||||
| if t in set(["r", "c", "d"]): | if t in set(["r", "c", "d"]): | ||||
| return 0.3 | return 0.3 | ||||
| if t in set(["ns", "nt"]): | if t in set(["ns", "nt"]): | ||||
| def freq(t): | def freq(t): | ||||
| if re.match(r"[0-9. -]{2,}$", t): | if re.match(r"[0-9. -]{2,}$", t): | ||||
| return 3 | return 3 | ||||
| s = huqie.freq(t) | |||||
| s = rag_tokenizer.freq(t) | |||||
| if not s and re.match(r"[a-z. -]+$", t): | if not s and re.match(r"[a-z. -]+$", t): | ||||
| return 300 | return 300 | ||||
| if not s: | if not s: | ||||
| s = 0 | s = 0 | ||||
| if not s and len(t) >= 4: | if not s and len(t) >= 4: | ||||
| s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1] | |||||
| s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] | |||||
| if len(s) > 1: | if len(s) > 1: | ||||
| s = np.min([freq(tt) for tt in s]) / 6. | s = np.min([freq(tt) for tt in s]) / 6. | ||||
| else: | else: | ||||
| elif re.match(r"[a-z. -]+$", t): | elif re.match(r"[a-z. -]+$", t): | ||||
| return 300 | return 300 | ||||
| elif len(t) >= 4: | elif len(t) >= 4: | ||||
| s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1] | |||||
| s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] | |||||
| if len(s) > 1: | if len(s) > 1: | ||||
| return max(3, np.min([df(tt) for tt in s]) / 6.) | return max(3, np.min([df(tt) for tt in s]) / 6.) | ||||
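The weighting code above now backs off through `rag_tokenizer`: a term unknown to the trie gets a fixed weight if it looks English, otherwise a long term is scored through its fine-grained sub-tokens. A compact sketch of the `freq` branch (the `df` branch is analogous); constants are the ones visible in the hunk:

```python
import re
import numpy as np
from rag.nlp import rag_tokenizer

def term_freq(t: str) -> float:
    if re.match(r"[0-9. -]{2,}$", t):
        return 3                          # numeric-looking tokens get a small fixed frequency
    s = rag_tokenizer.freq(t)
    if not s and re.match(r"[a-z. -]+$", t):
        return 300                        # unknown but English-looking: fixed default
    if not s and len(t) >= 4:
        sub = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
        if len(sub) > 1:
            return np.min([term_freq(tt) for tt in sub]) / 6.0
    return s or 0
```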
| from api.db.db_models import close_connection | from api.db.db_models import close_connection | ||||
| from api.db.services.task_service import TaskService | from api.db.services.task_service import TaskService | ||||
| from rag.settings import cron_logger | |||||
| from rag.utils.minio_conn import MINIO | from rag.utils.minio_conn import MINIO | ||||
| from rag.utils.redis_conn import REDIS_CONN | from rag.utils.redis_conn import REDIS_CONN | ||||
| def collect(): | def collect(): | ||||
| doc_locations = TaskService.get_ongoing_doc_name() | doc_locations = TaskService.get_ongoing_doc_name() | ||||
| #print(tasks) | |||||
| print(doc_locations) | |||||
| if len(doc_locations) == 0: | if len(doc_locations) == 0: | ||||
| time.sleep(1) | time.sleep(1) | ||||
| return | return | ||||
| if REDIS_CONN.exist(key):continue | if REDIS_CONN.exist(key):continue | ||||
| file_bin = MINIO.get(kb_id, loc) | file_bin = MINIO.get(kb_id, loc) | ||||
| REDIS_CONN.transaction(key, file_bin, 12 * 60) | REDIS_CONN.transaction(key, file_bin, 12 * 60) | ||||
| print("CACHE:", loc) | |||||
| cron_logger.info("CACHE: {}".format(loc)) | |||||
| except Exception as e: | except Exception as e: | ||||
| traceback.print_exc() | traceback.print_exc() | ||||
| except Exception as e: | except Exception as e: |
| from api.db.db_models import Task | from api.db.db_models import Task | ||||
| from api.db.db_utils import bulk_insert_into_db | from api.db.db_utils import bulk_insert_into_db | ||||
| from api.db.services.file2document_service import File2DocumentService | from api.db.services.file2document_service import File2DocumentService | ||||
| from api.db.services.file_service import FileService | |||||
| from api.db.services.task_service import TaskService | from api.db.services.task_service import TaskService | ||||
| from deepdoc.parser import PdfParser | from deepdoc.parser import PdfParser | ||||
| from deepdoc.parser.excel_parser import RAGFlowExcelParser | from deepdoc.parser.excel_parser import RAGFlowExcelParser |