### What problem does this PR solve? #2415 ### Type of change - [x] Performance Improvement

1年前 · 9d4bb5767c
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -214,7 +214,7 @@ def is_english(texts):
    eng = 0
    if not texts: return False
    for t in texts:
        if re.match(r"[a-zA-Z]{2,}", t.strip()):
        if re.match(r"[ `a-zA-Z.,':;/\"?<>!\(\)-]", t.strip()):
            eng += 1
    if eng / len(texts) > 0.8:
        return True
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -24,7 +24,7 @@ from dataclasses import dataclass

 from rag.settings import es_logger
 from rag.utils import rmSpace
 from rag.nlp import rag_tokenizer, query
 from rag.nlp import rag_tokenizer, query, is_english
 import numpy as np


@@ -164,7 +164,7 @@ class Dealer:
            ids=self.es.getDocIds(res),
            query_vector=q_vec,
            aggregation=aggs,
            highlight=self.getHighlight(res),
            highlight=self.getHighlight(res, keywords, "content_with_weight"),
            field=self.getFields(res, src),
            keywords=list(kwds)
        )
@@ -175,26 +175,28 @@ class Dealer:
        bkts = res["aggregations"]["aggs_" + g]["buckets"]
        return [(b["key"], b["doc_count"]) for b in bkts]

    def getHighlight(self, res):
        def rmspace(line):
            eng = set(list("qwertyuioplkjhgfdsazxcvbnm"))
            r = []
            for t in line.split(" "):
                if not t:
                    continue
                if len(r) > 0 and len(
                        t) > 0 and r[-1][-1] in eng and t[0] in eng:
                    r.append(" ")
                r.append(t)
            r = "".join(r)
            return r

    def getHighlight(self, res, keywords, fieldnm):
        ans = {}
        for d in res["hits"]["hits"]:
            hlts = d.get("highlight")
            if not hlts:
                continue
            ans[d["_id"]] = "".join([a for a in list(hlts.items())[0][1]])
            txt = "...".join([a for a in list(hlts.items())[0][1]])
            if not is_english(txt.split(" ")):
                ans[d["_id"]] = txt
                continue

            txt = d["_source"][fieldnm]
            txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE|re.MULTILINE)
            txts = []
            for w in keywords:
                txt = re.sub(r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-])"%re.escape(w), r"\1<em>\2</em>\3", txt, flags=re.IGNORECASE|re.MULTILINE)

            for t in re.split(r"[.?!;\n]", txt):
                if not re.search(r"<em>[^<>]+</em>", t, flags=re.IGNORECASE|re.MULTILINE): continue
                txts.append(t)
            ans[d["_id"]] = "...".join(txts) if txts else "...".join([a for a in list(hlts.items())[0][1]])

        return ans

    def getFields(self, sres, flds):
--- a/rag/utils/__init__.py
+++ b/rag/utils/__init__.py
@@ -32,8 +32,8 @@ def singleton(cls, *args, **kw):


 def rmSpace(txt):
    txt = re.sub(r"([^a-z0-9.,]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,])", r"\1\2", txt, flags=re.IGNORECASE)
    txt = re.sub(r"([^a-z0-9.,\)>]) +([^ ])", r"\1\2", txt, flags=re.IGNORECASE)
    return re.sub(r"([^ ]) +([^a-z0-9.,\(<])", r"\1\2", txt, flags=re.IGNORECASE)


 def findMaxDt(fnm):