
refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
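
The PR body above leaves the problem statement blank; judging from the diff below, the change renames the `rag.nlp.huqie` module to `rag.nlp.rag_tokenizer` and replaces its terse method names with descriptive ones (`qie` → `tokenize`, `qieqie` → `fine_grained_tokenize`, and the `needQieqie` helper in `rag/nlp/query.py` becomes `need_fine_grained_tokenize`). A minimal before/after sketch of a typical call site, assembled from the hunks below rather than taken from any one file; the sample string is illustrative:

```python
# Old API (removed in this PR)
from rag.nlp import huqie

ltks = huqie.qie("双十一 iPhone 15 promotion details")
sm_ltks = huqie.qieqie(ltks)

# New API (added in this PR); tag/freq/is_chinese/tradi2simp/strQ2B keep their names
from rag.nlp import rag_tokenizer

ltks = rag_tokenizer.tokenize("双十一 iPhone 15 promotion details")
sm_ltks = rag_tokenizer.fine_grained_tokenize(ltks)
```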
tags/v0.5.0
KevinHuSh · 1 year ago · commit 8c07992b6c

api/apps/chunk_app.py  (+8, -8)

  from elasticsearch_dsl import Q
  from rag.app.qa import rmPrefix, beAdoc
- from rag.nlp import search, huqie
+ from rag.nlp import search, rag_tokenizer
  from rag.utils.es_conn import ELASTICSEARCH
  from rag.utils import rmSpace
  from api.db import LLMType, ParserType
  d = {
  "id": req["chunk_id"],
  "content_with_weight": req["content_with_weight"]}
- d["content_ltks"] = huqie.qie(req["content_with_weight"])
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+ d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  d["important_kwd"] = req["important_kwd"]
- d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
  if "available_int" in req:
  d["available_int"] = req["available_int"]
  retmsg="Q&A must be separated by TAB/ENTER key.")
  q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
  d = beAdoc(d, arr[0], arr[1], not any(
- [huqie.is_chinese(t) for t in q + a]))
+ [rag_tokenizer.is_chinese(t) for t in q + a]))
  v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
  v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
  md5 = hashlib.md5()
  md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
  chunck_id = md5.hexdigest()
- d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+ d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
  "content_with_weight": req["content_with_weight"]}
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  d["important_kwd"] = req.get("important_kwd", [])
- d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
+ d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
  d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  d["create_timestamp_flt"] = datetime.datetime.now().timestamp()

api/db/services/task_service.py  (+2, -3)

  docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
  .join(Document, on=(cls.model.doc_id == Document.id)) \
  .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
- .join(File, on=(File2Document.file_id == File.id)) \
+ .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
  .where(
  Document.status == StatusEnum.VALID.value,
  Document.run == TaskStatus.RUNNING.value,
  ~(Document.type == FileType.VIRTUAL.value),
- cls.model.progress >= 0,
  cls.model.progress < 1,
- cls.model.create_time >= current_timestamp() - 180000
+ cls.model.create_time >= current_timestamp() - 1000 * 600
  )
  docs = list(docs.dicts())
  if not docs: return []
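
Two behavioural notes on this hunk (my reading of the constants, not stated in the PR): the `File` join becomes a LEFT OUTER join so documents without a `File2Document` mapping are still returned, and, assuming `current_timestamp()` is in milliseconds as the constants suggest, the cutoff for ongoing tasks widens from 180000 ms (3 minutes) to 1000 * 600 = 600000 ms (10 minutes). A throwaway check of that arithmetic:

```python
# Hypothetical sanity check of the time-window constants above
old_window_ms = 180000          # previous cutoff
new_window_ms = 1000 * 600      # cutoff after this PR
print(old_window_ms / 60_000, "->", new_window_ms / 60_000, "minutes")  # 3.0 -> 10.0
```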

deepdoc/parser/docx_parser.py  (+3, -3)

  import re
  import pandas as pd
  from collections import Counter
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from io import BytesIO

  for p, n in patt:
  if re.search(p, b):
  return n
- tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
+ tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
  if len(tks) > 3:
  if len(tks) < 12:
  return "Tx"
  else:
  return "Lx"

- if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+ if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
  return "Nr"

  return "Ot"

deepdoc/parser/pdf_parser.py  (+7, -7)

  from api.utils.file_utils import get_project_base_directory
  from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from copy import deepcopy
  from huggingface_hub import snapshot_download

  h = max(self.__height(up), self.__height(down))
  y_dis = self._y_dis(up, down)
  LEN = 6
- tks_down = huqie.qie(down["text"][:LEN]).split(" ")
- tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
+ tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
+ tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
  tks_all = up["text"][-LEN:].strip() \
  + (" " if re.match(r"[a-zA-Z0-9]+",
  up["text"][-1] + down["text"][0]) else "") \
  + down["text"][:LEN].strip()
- tks_all = huqie.qie(tks_all).split(" ")
+ tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
  fea = [
  up.get("R", -1) == down.get("R", -1),
  y_dis / h,
  tks_down[-1] == tks_up[-1],
  max(down["in_row"], up["in_row"]),
  abs(down["in_row"] - up["in_row"]),
- len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
- len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
+ len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
+ len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
  ]
  return fea

  if b["text"].strip()[0] != b_["text"].strip()[0] \
  or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
- or huqie.is_chinese(b["text"].strip()[0]) \
+ or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
  or b["top"] > b_["bottom"]:
  i += 1
  continue

deepdoc/parser/resume/entities/corporations.py  (+3, -3)

  import re,json,os
  import pandas as pd
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from . import regions
  current_file_path = os.path.dirname(os.path.abspath(__file__))
  GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
  def corpNorm(nm, add_region=True):
  global CORP_TKS
  if not nm or type(nm)!=type(""):return ""
- nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
+ nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
  nm = re.sub(r"&amp;", "&", nm)
  nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
  nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
  nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
  if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm

- tks = huqie.qie(nm).split(" ")
+ tks = rag_tokenizer.tokenize(nm).split(" ")
  reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
  nm = ""
  for t in tks:

deepdoc/parser/resume/step_two.py  (+26, -26)

  traceback, signal
  import numpy as np
  from deepdoc.parser.resume.entities import degrees, schools, corporations
- from rag.nlp import huqie, surname
+ from rag.nlp import rag_tokenizer, surname
  from xpinyin import Pinyin
  from contextlib import contextmanager
  if n.get("school_name") and isinstance(n["school_name"], str):
  sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
  e["sch_nm_kwd"] = sch[-1]
- fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
+ fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
  if n.get("discipline_name") and isinstance(n["discipline_name"], str):
  maj.append(n["discipline_name"])
  if "tag_kwd" not in cv: cv["tag_kwd"] = []
  if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
- if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
- if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
- if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
- if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
+ if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
+ if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
+ if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
+ if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
  return cv
  if n.get("achivement"): desc.append(str(n["achivement"]))
  if pro_nms:
- # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
- cv["project_name_tks"] = huqie.qie(pro_nms[0])
+ # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
+ cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
  if desc:
- cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
- cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
+ cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
+ cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
  return cv
  if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
  if fea["position_name"]:
- cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
- cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
- cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
+ cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
+ cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
+ cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
  if fea["industry_name"]:
- cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
- cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
- cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
+ cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
+ cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
+ cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
  if fea["corporation_name"]:
  cv["corporation_name_kwd"] = fea["corporation_name"][0]
  cv["corp_nm_kwd"] = fea["corporation_name"]
- cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
- cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
- cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
+ cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
+ cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
+ cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
  if fea["responsibilities"]:
- cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
- cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
+ cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
+ cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
  if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
  re.match(r"[^0-9]+$", str(i))]
  if nms:
  t = k[:-4]
  cv[f"{t}_kwd"] = nms
- cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
+ cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
  except Exception as e:
  print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
  cv[k] = []
  # tokenize fields
  if k in tks_fld:
- cv[f"{k}_tks"] = huqie.qie(cv[k])
- if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
+ cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
+ if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
  # keyword fields
  if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
  cv["name_kwd"] = name
  cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
  cv["name_tks"] = (
- huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
+ rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
  ) if name else ""
  else:
  cv["integerity_flt"] /= 2.
  cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
  # long text tokenize
- if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
+ if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
  # for yes or no field
  fea = []

deepdoc/vision/table_structure_recognizer.py  (+3, -3)

  from huggingface_hub import snapshot_download
  from api.utils.file_utils import get_project_base_directory
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from .recognizer import Recognizer
  for p, n in patt:
  if re.search(p, b["text"].strip()):
  return n
- tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
+ tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
  if len(tks) > 3:
  if len(tks) < 12:
  return "Tx"
  else:
  return "Lx"
- if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+ if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
  return "Nr"
  return "Ot"

rag/app/book.py  (+3, -3)

  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
  hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
  tokenize_chunks, find_codec
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
  """
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  pdf_parser = None
  sections, tbls = [], []
  if re.search(r"\.docx$", filename, re.IGNORECASE):

rag/app/laws.py  (+3, -3)

  from api.db import ParserType
  from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
  make_colon_as_title, add_positions, tokenize_chunks, find_codec
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from deepdoc.parser import PdfParser, DocxParser, PlainParser
  from rag.settings import cron_logger
  """
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  pdf_parser = None
  sections = []
  if re.search(r"\.docx$", filename, re.IGNORECASE):

rag/app/manual.py  (+3, -3)

  import re
  from api.db import ParserType
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
  from deepdoc.parser import PdfParser, PlainParser
  from rag.utils import num_tokens_from_string
  doc = {
  "docnm_kwd": filename
  }
- doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  # is it English
  eng = lang.lower() == "english" # pdf_parser.is_english

rag/app/naive.py  (+3, -3)

  from timeit import default_timer as timer
  import re
  from deepdoc.parser.pdf_parser import PlainParser
- from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
+ from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
  from deepdoc.parser import PdfParser, ExcelParser, DocxParser
  from rag.settings import cron_logger
  "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  res = []
  pdf_parser = None
  sections = []

rag/app/one.py  (+3, -3)

  from io import BytesIO
  import re
  from rag.app import laws
- from rag.nlp import huqie, tokenize, find_codec
+ from rag.nlp import rag_tokenizer, tokenize, find_codec
  from deepdoc.parser import PdfParser, ExcelParser, PlainParser
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  tokenize(doc, "\n".join(sections), eng)
  return [doc]

rag/app/paper.py  (+5, -5)

  from collections import Counter
  from api.db import ParserType
- from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+ from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
  from deepdoc.parser import PdfParser, PlainParser
  import numpy as np
  from rag.utils import num_tokens_from_string
  else:
  raise NotImplementedError("file type not supported yet(pdf supported)")
- doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
- "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
- doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+ doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
+ "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+ doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
  # is it English
  eng = lang.lower() == "english" # pdf_parser.is_english
  print("It's English.....", eng)

rag/app/presentation.py  (+3, -3)

  from PIL import Image
  from rag.nlp import tokenize, is_english
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from deepdoc.parser import PdfParser, PptParser, PlainParser
  from PyPDF2 import PdfReader as pdf2_read
  eng = lang.lower() == "english"
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  res = []
  if re.search(r"\.pptx?$", filename, re.IGNORECASE):
  ppt_parser = Ppt()

rag/app/qa.py  (+4, -4)

  from nltk import word_tokenize
  from openpyxl import load_workbook
  from rag.nlp import is_english, random_choices, find_codec
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from deepdoc.parser import ExcelParser
  aprefix = "Answer: " if eng else "回答:"
  d["content_with_weight"] = "\t".join(
  [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
- d["content_ltks"] = huqie.qie(q)
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+ d["content_ltks"] = rag_tokenizer.tokenize(q)
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  return d
  res = []
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
  if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
  callback(0.1, "Start to parse.")

rag/app/resume.py  (+6, -6)

  import pandas as pd
  import requests
  from api.db.services.knowledgebase_service import KnowledgebaseService
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from deepdoc.parser.resume import refactor
  from deepdoc.parser.resume import step_one, step_two
  from rag.settings import cron_logger
  titles.append(str(v))
  doc = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie("-".join(titles) + "-简历")
+ "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
  }
- doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+ doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
  pairs = []
  for n, m in field_map.items():
  if not resume.get(n):
  doc["content_with_weight"] = "\n".join(
  ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs])
- doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
- doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
+ doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
+ doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
  for n, _ in field_map.items():
  if n not in resume:
  continue
  len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
  resume[n] = resume[n][0]
  if n.find("_tks") > 0:
- resume[n] = huqie.qieqie(resume[n])
+ resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
  doc[n] = resume[n]
  print(doc)

rag/app/table.py  (+3, -3)

  from dateutil.parser import parse as datetime_parse
  from api.db.services.knowledgebase_service import KnowledgebaseService
- from rag.nlp import huqie, is_english, tokenize, find_codec
+ from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
  from deepdoc.parser import ExcelParser
  for ii, row in df.iterrows():
  d = {
  "docnm_kwd": filename,
- "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+ "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
  }
  row_txt = []
  for j in range(len(clmns)):
  if pd.isna(row[clmns[j]]):
  continue
  fld = clmns_map[j][0]
- d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
+ d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
  row[clmns[j]])
  row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
  if not row_txt:

rag/nlp/__init__.py  (+3, -3)

  from collections import Counter
  from rag.utils import num_tokens_from_string
- from . import huqie
+ from . import rag_tokenizer
  import re
  import copy
  def tokenize(d, t, eng):
  d["content_with_weight"] = t
  t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
- d["content_ltks"] = huqie.qie(t)
- d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+ d["content_ltks"] = rag_tokenizer.tokenize(t)
+ d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  def tokenize_chunks(chunks, doc, eng, pdf_parser):

rag/nlp/query.py  (+8, -8)

  import copy
  from elasticsearch_dsl import Q

- from rag.nlp import huqie, term_weight, synonym
+ from rag.nlp import rag_tokenizer, term_weight, synonym

  class EsQueryer:
  txt = re.sub(
  r"[ \r\n\t,,。??/`!!&]+",
  " ",
- huqie.tradi2simp(
- huqie.strQ2B(
+ rag_tokenizer.tradi2simp(
+ rag_tokenizer.strQ2B(
  txt.lower()))).strip()
  txt = EsQueryer.rmWWW(txt)

  if not self.isChinese(txt):
- tks = huqie.qie(txt).split(" ")
+ tks = rag_tokenizer.tokenize(txt).split(" ")
  q = copy.deepcopy(tks)
  for i in range(1, len(tks)):
  q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
  boost=1)#, minimum_should_match=min_match)
  ), tks

- def needQieqie(tk):
+ def need_fine_grained_tokenize(tk):
  if len(tk) < 4:
  return False
  if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
  logging.info(json.dumps(twts, ensure_ascii=False))
  tms = []
  for tk, w in sorted(twts, key=lambda x: x[1] * -1):
- sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
+ sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
  sm = [
  re.sub(
  r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
  if len(twts) > 1:
  tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
  if re.match(r"[0-9a-z ]+$", tt):
- tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
+ tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)

  syns = " OR ".join(
- ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
+ ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
  if syns:
  tms = f"({tms})^5 OR ({syns})^0.7"

rag/nlp/rag_tokenizer.py  (+423, -0)

# -*- coding: utf-8 -*-

import copy
import datrie
import math
import os
import re
import string
import sys
from hanziconv import HanziConv
from huggingface_hub import snapshot_download
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory


class RagTokenizer:
def key_(self, line):
return str(line.lower().encode("utf-8"))[2:-1]

def rkey_(self, line):
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

def loadDict_(self, fnm):
print("[HUQIE]:Build trie", fnm, file=sys.stderr)
try:
of = open(fnm, "r")
while True:
line = of.readline()
if not line:
break
line = re.sub(r"[\r\n]+", "", line)
line = re.split(r"[ \t]", line)
k = self.key_(line[0])
F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
if k not in self.trie_ or self.trie_[k][0] < F:
self.trie_[self.key_(line[0])] = (F, line[2])
self.trie_[self.rkey_(line[0])] = 1
self.trie_.save(fnm + ".trie")
of.close()
except Exception as e:
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)

def __init__(self, debug=False):
self.DEBUG = debug
self.DENOMINATOR = 1000000
self.trie_ = datrie.Trie(string.printable)
self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")

self.stemmer = PorterStemmer()
self.lemmatizer = WordNetLemmatizer()

self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
try:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
return
except Exception as e:
print("[HUQIE]:Build default trie", file=sys.stderr)
self.trie_ = datrie.Trie(string.printable)

self.loadDict_(self.DIR_ + ".txt")

def loadUserDict(self, fnm):
try:
self.trie_ = datrie.Trie.load(fnm + ".trie")
return
except Exception as e:
self.trie_ = datrie.Trie(string.printable)
self.loadDict_(fnm)

def addUserDict(self, fnm):
self.loadDict_(fnm)

def _strQ2B(self, ustring):
"""把字符串全角转半角"""
rstring = ""
for uchar in ustring:
inside_code = ord(uchar)
if inside_code == 0x3000:
inside_code = 0x0020
else:
inside_code -= 0xfee0
if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width character after conversion; keep the original
rstring += uchar
else:
rstring += chr(inside_code)
return rstring

def _tradi2simp(self, line):
return HanziConv.toSimplified(line)

def dfs_(self, chars, s, preTks, tkslist):
MAX_L = 10
res = s
# if s > MAX_L or s>= len(chars):
if s >= len(chars):
tkslist.append(preTks)
return res

# pruning
S = s + 1
if s + 2 <= len(chars):
t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
self.key_(t2)):
S = s + 2
if len(preTks) > 2 and len(
preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
t1 = preTks[-1][0] + "".join(chars[s:s + 1])
if self.trie_.has_keys_with_prefix(self.key_(t1)):
S = s + 2

################
for e in range(S, len(chars) + 1):
t = "".join(chars[s:e])
k = self.key_(t)

if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
break

if k in self.trie_:
pretks = copy.deepcopy(preTks)
if k in self.trie_:
pretks.append((t, self.trie_[k]))
else:
pretks.append((t, (-12, '')))
res = max(res, self.dfs_(chars, e, pretks, tkslist))

if res > s:
return res

t = "".join(chars[s:s + 1])
k = self.key_(t)
if k in self.trie_:
preTks.append((t, self.trie_[k]))
else:
preTks.append((t, (-12, '')))

return self.dfs_(chars, s + 1, preTks, tkslist)

def freq(self, tk):
k = self.key_(tk)
if k not in self.trie_:
return 0
return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)

def tag(self, tk):
k = self.key_(tk)
if k not in self.trie_:
return ""
return self.trie_[k][1]

def score_(self, tfts):
B = 30
F, L, tks = 0, 0, []
for tk, (freq, tag) in tfts:
F += freq
L += 0 if len(tk) < 2 else 1
tks.append(tk)
F /= len(tks)
L /= len(tks)
if self.DEBUG:
print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
return tks, B / len(tks) + L + F

def sortTks_(self, tkslist):
res = []
for tfts in tkslist:
tks, s = self.score_(tfts)
res.append((tks, s))
return sorted(res, key=lambda x: x[1], reverse=True)

def merge_(self, tks):
patts = [
(r"[ ]+", " "),
(r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
]
# for p,s in patts: tks = re.sub(p, s, tks)

# if split chars is part of token
res = []
tks = re.sub(r"[ ]+", " ", tks).split(" ")
s = 0
while True:
if s >= len(tks):
break
E = s + 1
for e in range(s + 2, min(len(tks) + 2, s + 6)):
tk = "".join(tks[s:e])
if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
E = e
res.append("".join(tks[s:E]))
s = E

return " ".join(res)

def maxForward_(self, line):
res = []
s = 0
while s < len(line):
e = s + 1
t = line[s:e]
while e < len(line) and self.trie_.has_keys_with_prefix(
self.key_(t)):
e += 1
t = line[s:e]

while e - 1 > s and self.key_(t) not in self.trie_:
e -= 1
t = line[s:e]

if self.key_(t) in self.trie_:
res.append((t, self.trie_[self.key_(t)]))
else:
res.append((t, (0, '')))

s = e

return self.score_(res)

def maxBackward_(self, line):
res = []
s = len(line) - 1
while s >= 0:
e = s + 1
t = line[s:e]
while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
s -= 1
t = line[s:e]

while s + 1 < e and self.key_(t) not in self.trie_:
s += 1
t = line[s:e]

if self.key_(t) in self.trie_:
res.append((t, self.trie_[self.key_(t)]))
else:
res.append((t, (0, '')))

s -= 1

return self.score_(res[::-1])

def tokenize(self, line):
line = self._strQ2B(line).lower()
line = self._tradi2simp(line)
zh_num = len([1 for c in line if is_chinese(c)])
if zh_num < len(line) * 0.2:
return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])

arr = re.split(self.SPLIT_CHAR, line)
res = []
for L in arr:
if len(L) < 2 or re.match(
r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
res.append(L)
continue
# print(L)

# use maxforward for the first time
tks, s = self.maxForward_(L)
tks1, s1 = self.maxBackward_(L)
if self.DEBUG:
print("[FW]", tks, s)
print("[BW]", tks1, s1)

diff = [0 for _ in range(max(len(tks1), len(tks)))]
for i in range(min(len(tks1), len(tks))):
if tks[i] != tks1[i]:
diff[i] = 1

if s1 > s:
tks = tks1

i = 0
while i < len(tks):
s = i
while s < len(tks) and diff[s] == 0:
s += 1
if s == len(tks):
res.append(" ".join(tks[i:]))
break
if s > i:
res.append(" ".join(tks[i:s]))

e = s
while e < len(tks) and e - s < 5 and diff[e] == 1:
e += 1

tkslist = []
self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
res.append(" ".join(self.sortTks_(tkslist)[0][0]))

i = e + 1

res = " ".join(res)
if self.DEBUG:
print("[TKS]", self.merge_(res))
return self.merge_(res)

def fine_grained_tokenize(self, tks):
tks = tks.split(" ")
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
if zh_num < len(tks) * 0.2:
res = []
for tk in tks:
res.extend(tk.split("/"))
return " ".join(res)

res = []
for tk in tks:
if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
res.append(tk)
continue
tkslist = []
if len(tk) > 10:
tkslist.append(tk)
else:
self.dfs_(tk, 0, [], tkslist)
if len(tkslist) < 2:
res.append(tk)
continue
stk = self.sortTks_(tkslist)[1][0]
if len(stk) == len(tk):
stk = tk
else:
if re.match(r"[a-z\.-]+$", tk):
for t in stk:
if len(t) < 3:
stk = tk
break
else:
stk = " ".join(stk)
else:
stk = " ".join(stk)

res.append(stk)

return " ".join(res)


def is_chinese(s):
if s >= u'\u4e00' and s <= u'\u9fa5':
return True
else:
return False


def is_number(s):
if s >= u'\u0030' and s <= u'\u0039':
return True
else:
return False


def is_alphabet(s):
if (s >= u'\u0041' and s <= u'\u005a') or (
s >= u'\u0061' and s <= u'\u007a'):
return True
else:
return False


def naiveQie(txt):
tks = []
for t in txt.split(" "):
if tks and re.match(r".*[a-zA-Z]$", tks[-1]
) and re.match(r".*[a-zA-Z]$", t):
tks.append(" ")
tks.append(t)
return tks


tokenizer = RagTokenizer()
tokenize = tokenizer.tokenize
fine_grained_tokenize = tokenizer.fine_grained_tokenize
tag = tokenizer.tag
freq = tokenizer.freq
loadUserDict = tokenizer.loadUserDict
addUserDict = tokenizer.addUserDict
tradi2simp = tokenizer._tradi2simp
strQ2B = tokenizer._strQ2B

if __name__ == '__main__':
tknzr = RagTokenizer(debug=True)
# huqie.addUserDict("/tmp/tmp.new.tks.dict")
tks = tknzr.tokenize(
"哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("虽然我不怎么玩")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
print(tknzr.fine_grained_tokenize(tks))
tks = tknzr.tokenize(
"数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
print(tknzr.fine_grained_tokenize(tks))
if len(sys.argv) < 2:
sys.exit()
tknzr.DEBUG = False
tknzr.loadUserDict(sys.argv[1])
of = open(sys.argv[2], "r")
while True:
line = of.readline()
if not line:
break
print(tknzr.tokenize(line))
of.close()
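
For orientation (not part of the diff): the aliases at the bottom of rag_tokenizer.py expose a module-level singleton, which is how the other files in this PR consume it. A small usage sketch mirroring the `title_tks` / `title_sm_tks` pattern from the `rag/app/*.py` hunks above; the filename and sample strings are illustrative:

```python
from rag.nlp import rag_tokenizer

# Coarse tokens for a document title, then a finer split of those tokens
doc = {"docnm_kwd": "employee_handbook.pdf"}
doc["title_tks"] = rag_tokenizer.tokenize("employee_handbook")
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])

# The same trie also serves part-of-speech tags and corpus frequencies
print(rag_tokenizer.tag("中国"), rag_tokenizer.freq("中国"))
```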

rag/nlp/search.py  (+7, -7)

  from rag.settings import es_logger
  from rag.utils import rmSpace
- from rag.nlp import huqie, query
+ from rag.nlp import rag_tokenizer, query
  import numpy as np

  kwds = set([])
  for k in keywords:
  kwds.add(k)
- for kk in huqie.qieqie(k).split(" "):
+ for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
  if len(kk) < 2:
  continue
  if kk in kwds:
  assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
  len(ans_v[0]), len(chunk_v[0]))

- chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
+ chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
  for ck in chunks]
  cites = {}
  thr = 0.63
  for i, a in enumerate(pieces_):
  sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
  chunk_v,
- huqie.qie(
+ rag_tokenizer.tokenize(
  self.qryr.rmWWW(pieces_[i])).split(" "),
  chunks_tks,
  tkweight, vtweight)
  def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
  return self.qryr.hybrid_similarity(ans_embd,
  ins_embd,
- huqie.qie(ans).split(" "),
- huqie.qie(inst).split(" "))
+ rag_tokenizer.tokenize(ans).split(" "),
+ rag_tokenizer.tokenize(inst).split(" "))

  def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
  vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
  for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
  fld, v = r.group(1), r.group(3)
  match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
- fld, huqie.qieqie(huqie.qie(v)))
+ fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
  replaces.append(
  ("{}{}'{}'".format(
  r.group(1),

rag/nlp/term_weight.py  (+6, -6)

  import re
  import os
  import numpy as np
- from rag.nlp import huqie
+ from rag.nlp import rag_tokenizer
  from api.utils.file_utils import get_project_base_directory

  txt = re.sub(p, r, txt)

  res = []
- for t in huqie.qie(txt).split(" "):
+ for t in rag_tokenizer.tokenize(txt).split(" "):
  tk = t
  if (stpwd and tk in self.stop_words) or (
  re.match(r"[0-9]$", tk) and not num):
  return m[self.ne[t]]

  def postag(t):
- t = huqie.tag(t)
+ t = rag_tokenizer.tag(t)
  if t in set(["r", "c", "d"]):
  return 0.3
  if t in set(["ns", "nt"]):
  def freq(t):
  if re.match(r"[0-9. -]{2,}$", t):
  return 3
- s = huqie.freq(t)
+ s = rag_tokenizer.freq(t)
  if not s and re.match(r"[a-z. -]+$", t):
  return 300
  if not s:
  s = 0

  if not s and len(t) >= 4:
- s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+ s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
  if len(s) > 1:
  s = np.min([freq(tt) for tt in s]) / 6.
  else:
  elif re.match(r"[a-z. -]+$", t):
  return 300
  elif len(t) >= 4:
- s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+ s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
  if len(s) > 1:
  return max(3, np.min([df(tt) for tt in s]) / 6.)



rag/svr/cache_file_svr.py  (+3, -2)

  from api.db.db_models import close_connection
  from api.db.services.task_service import TaskService
+ from rag.settings import cron_logger
  from rag.utils.minio_conn import MINIO
  from rag.utils.redis_conn import REDIS_CONN
  def collect():
  doc_locations = TaskService.get_ongoing_doc_name()
- #print(tasks)
+ print(doc_locations)
  if len(doc_locations) == 0:
  time.sleep(1)
  return
  if REDIS_CONN.exist(key):continue
  file_bin = MINIO.get(kb_id, loc)
  REDIS_CONN.transaction(key, file_bin, 12 * 60)
- print("CACHE:", loc)
+ cron_logger.info("CACHE: {}".format(loc))
  except Exception as e:
  traceback.print_stack(e)
  except Exception as e:

rag/svr/task_broker.py  (+0, -1)

  from api.db.db_models import Task
  from api.db.db_utils import bulk_insert_into_db
  from api.db.services.file2document_service import File2DocumentService
- from api.db.services.file_service import FileService
  from api.db.services.task_service import TaskService
  from deepdoc.parser import PdfParser
  from deepdoc.parser.excel_parser import RAGFlowExcelParser
