
Fix errors detected by Ruff (#3918)

### What problem does this PR solve?

Fix errors detected by Ruff

### Type of change

- [x] Refactoring
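Most of the hunks below apply the same handful of mechanical lint fixes: one-line `if ...: <stmt>` bodies split onto their own lines, `except:` / `except Exception as e` with an unused binding reduced to `except Exception:`, `type(x) != T` comparisons rewritten with `isinstance`, ambiguous single-letter names such as `l` renamed, and unused imports dropped. A condensed before/after sketch of those patterns (illustrative code only, not copied from any file in the repository):

```python
def parse_sizes_before(raw):
    # Style Ruff flags: E721 type comparison, E701 statement on the same line as the colon,
    # F841 exception bound to a name that is never used.
    if type(raw) != list: raw = [raw]
    sizes = []
    for item in raw:
        if not item: continue
        try:
            sizes.append(float(item))
        except Exception as e:
            continue
    return sizes


def parse_sizes_after(raw):
    # The shape this PR converges on: explicit blocks, isinstance, bare binding dropped.
    if not isinstance(raw, list):
        raw = [raw]
    sizes = []
    for item in raw:
        if not item:
            continue
        try:
            sizes.append(float(item))
        except Exception:
            continue
    return sizes


print(parse_sizes_after(["1", "", "2.5", "n/a"]))  # [1.0, 2.5]
```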
tags/v0.15.0
Zhichang Yu, 10 months ago
parent commit 0d68a6cd1b
97 files changed, 2560 additions and 1978 deletions
  1. agent/canvas.py (+23, -13)
  2. agent/component/__init__.py (+70, -0)
  3. agent/component/base.py (+10, -5)
  4. agent/component/categorize.py (+8, -5)
  5. agent/component/deepl.py (+0, -1)
  6. agent/component/exesql.py (+4, -2)
  7. agent/component/generate.py (+24, -12)
  8. agent/component/rewrite.py (+2, -1)
  9. agent/component/switch.py (+10, -7)
  10. agent/component/template.py (+2, -1)
  11. agent/test/client.py (+2, -1)
  12. api/apps/api_app.py (+8, -5)
  13. api/apps/canvas_app.py (+4, -4)
  14. api/apps/chunk_app.py (+2, -1)
  15. api/apps/conversation_app.py (+4, -2)
  16. api/apps/dialog_app.py (+4, -2)
  17. api/apps/document_app.py (+4, -3)
  18. api/apps/llm_app.py (+4, -2)
  19. api/apps/sdk/agent.py (+1, -1)
  20. api/apps/sdk/doc.py (+2, -3)
  21. api/apps/sdk/session.py (+8, -4)
  22. api/apps/user_app.py (+1, -1)
  23. api/db/db_models.py (+1, -1)
  24. api/db/init_data.py (+1, -1)
  25. api/db/services/__init__.py (+5, -4)
  26. api/db/services/api_service.py (+2, -1)
  27. api/db/services/canvas_service.py (+1, -3)
  28. api/db/services/common_service.py (+1, -1)
  29. api/db/services/dialog_service.py (+24, -14)
  30. api/db/services/document_service.py (+2, -1)
  31. api/db/services/file2document_service.py (+2, -2)
  32. api/db/services/file_service.py (+4, -2)
  33. api/db/services/llm_service.py (+2, -1)
  34. api/db/services/task_service.py (+66, -34)
  35. api/db/services/user_service.py (+1, -1)
  36. api/ragflow_server.py (+3, -3)
  37. api/utils/api_utils.py (+0, -1)
  38. api/validation.py (+1, -1)
  39. deepdoc/parser/__init__.py (+13, -1)
  40. deepdoc/parser/excel_parser.py (+12, -8)
  41. deepdoc/parser/html_parser.py (+1, -1)
  42. deepdoc/parser/json_parser.py (+1, -1)
  43. deepdoc/parser/pdf_parser.py (+9, -8)
  44. deepdoc/parser/resume/__init__.py (+60, -19)
  45. deepdoc/parser/resume/entities/corporations.py (+47, -21)
  46. deepdoc/parser/resume/entities/degrees.py (+20, -16)
  47. deepdoc/parser/resume/entities/industries.py (+684, -679)
  48. deepdoc/parser/resume/entities/regions.py (+758, -748)
  49. deepdoc/parser/resume/entities/schools.py (+28, -17)
  50. deepdoc/parser/resume/step_two.py (+202, -106)
  51. deepdoc/parser/txt_parser.py (+2, -1)
  52. deepdoc/vision/__init__.py (+13, -4)
  53. deepdoc/vision/layout_recognizer.py (+2, -2)
  54. deepdoc/vision/ocr.py (+3, -1)
  55. deepdoc/vision/operators.py (+3, -3)
  56. deepdoc/vision/postprocess.py (+1, -1)
  57. deepdoc/vision/recognizer.py (+10, -4)
  58. graphrag/community_reports_extractor.py (+4, -2)
  59. graphrag/entity_embedding.py (+1, -0)
  60. graphrag/graph_extractor.py (+8, -4)
  61. graphrag/index.py (+2, -1)
  62. graphrag/leiden.py (+8, -4)
  63. intergrations/chatgpt-on-wechat/plugins/__init__.py (+5, -1)
  64. intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py (+0, -1)
  65. rag/app/book.py (+3, -3)
  66. rag/app/email.py (+1, -1)
  67. rag/app/knowledge_graph.py (+2, -1)
  68. rag/app/laws.py (+11, -8)
  69. rag/app/manual.py (+4, -3)
  70. rag/app/one.py (+3, -2)
  71. rag/app/qa.py (+17, -13)
  72. rag/app/table.py (+5, -4)
  73. rag/llm/__init__.py (+122, -10)
  74. rag/llm/chat_model.py (+56, -28)
  75. rag/llm/cv_model.py (+43, -24)
  76. rag/llm/sequence2txt_model.py (+2, -4)
  77. rag/llm/tts_model.py (+2, -2)
  78. rag/nlp/__init__.py (+15, -9)
  79. rag/nlp/query.py (+4, -2)
  80. rag/nlp/rag_tokenizer.py (+2, -8)
  81. rag/nlp/term_weight.py (+3, -3)
  82. rag/raptor.py (+4, -2)
  83. rag/svr/cache_file_svr.py (+4, -2)
  84. rag/svr/task_executor.py (+9, -8)
  85. rag/utils/__init__.py (+14, -14)
  86. rag/utils/azure_sas_conn.py (+1, -1)
  87. rag/utils/azure_spn_conn.py (+1, -1)
  88. rag/utils/es_conn.py (+2, -1)
  89. sdk/python/ragflow_sdk/__init__.py (+13, -6)
  90. sdk/python/ragflow_sdk/modules/session.py (+1, -1)
  91. sdk/python/test/conftest.py (+0, -2)
  92. sdk/python/test/test_frontend_api/common.py (+0, -1)
  93. sdk/python/test/test_frontend_api/get_email.py (+1, -1)
  94. sdk/python/test/test_frontend_api/test_chunk.py (+1, -5)
  95. sdk/python/test/test_frontend_api/test_dataset.py (+2, -5)
  96. sdk/python/test/test_sdk_api/get_email.py (+1, -1)
  97. sdk/python/test/test_sdk_api/t_agent.py (+1, -1)
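Several of the hunks below (api/apps/sdk/doc.py, deepdoc/parser/html_parser.py, deepdoc/parser/resume/__init__.py) replace `type(x) != list` comparisons with `isinstance(x, list)`, which is what Ruff's E721 rule asks for. The two are not strictly equivalent: `isinstance` also accepts subclasses, which is normally the intended contract for request payloads and parser output. A small standalone demonstration (hypothetical class, not repository code):

```python
class Keywords(list):
    """Hypothetical list subclass, e.g. produced by a validation layer."""

payload = Keywords(["ruff", "lint"])

print(type(payload) == list)      # False - exact type comparison rejects the subclass
print(isinstance(payload, list))  # True  - isinstance accepts list and its subclasses
```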

agent/canvas.py (+23, -13)

"components": {}
}
for k in self.dsl.keys():
- if k in ["components"]:continue
+ if k in ["components"]:
+ continue
dsl[k] = deepcopy(self.dsl[k])

for k, cpn in self.components.items():

def get_compnent_name(self, cid):
for n in self.dsl["graph"]["nodes"]:
- if cid == n["id"]: return n["data"]["name"]
+ if cid == n["id"]:
+ return n["data"]["name"]
return ""

def run(self, **kwargs):
if kwargs.get("stream"):
for an in ans():
yield an
- else: yield ans
+ else:
+ yield ans
return

if not self.path:
def prepare2run(cpns):
nonlocal ran, ans
for c in cpns:
- if self.path[-1] and c == self.path[-1][-1]: continue
+ if self.path[-1] and c == self.path[-1][-1]:
+ continue
cpn = self.components[c]["obj"]
if cpn.component_name == "Answer":
self.answer.append(c)
if c not in without_dependent_checking:
cpids = cpn.get_dependent_components()
if any([cc not in self.path[-1] for cc in cpids]):
- if c not in waiting: waiting.append(c)
+ if c not in waiting:
+ waiting.append(c)
continue
yield "*'{}'* is running...🕞".format(self.get_compnent_name(c))
ans = cpn.run(self.history, **kwargs)
logging.debug(f"Canvas.run: {ran} {self.path}")
cpn_id = self.path[-1][ran]
cpn = self.get_component(cpn_id)
- if not cpn["downstream"]: break
+ if not cpn["downstream"]:
+ break

loop = self._find_loop()
- if loop: raise OverflowError(f"Too much loops: {loop}")
+ if loop:
+ raise OverflowError(f"Too much loops: {loop}")

if cpn["obj"].component_name.lower() in ["switch", "categorize", "relevant"]:
switch_out = cpn["obj"].output()[1].iloc[0, 0]

def _find_loop(self, max_loops=6):
path = self.path[-1][::-1]
- if len(path) < 2: return False
+ if len(path) < 2:
+ return False

for i in range(len(path)):
if path[i].lower().find("answer") >= 0:
path = path[:i]
break

- if len(path) < 2: return False
+ if len(path) < 2:
+ return False

- for l in range(2, len(path) // 2):
- pat = ",".join(path[0:l])
+ for loc in range(2, len(path) // 2):
+ pat = ",".join(path[0:loc])
path_str = ",".join(path)
- if len(pat) >= len(path_str): return False
+ if len(pat) >= len(path_str):
+ return False
loop = max_loops
while path_str.find(pat) == 0 and loop >= 0:
loop -= 1
return False
path_str = path_str[len(pat)+1:]
if loop < 0:
- pat = " => ".join([p.split(":")[0] for p in path[0:l]])
+ pat = " => ".join([p.split(":")[0] for p in path[0:loc]])
return pat + " => " + pat

return False

agent/component/__init__.py (+70, -0)

m = importlib.import_module("agent.component")
c = getattr(m, class_name)
return c

__all__ = [
"Begin",
"BeginParam",
"Generate",
"GenerateParam",
"Retrieval",
"RetrievalParam",
"Answer",
"AnswerParam",
"Categorize",
"CategorizeParam",
"Switch",
"SwitchParam",
"Relevant",
"RelevantParam",
"Message",
"MessageParam",
"RewriteQuestion",
"RewriteQuestionParam",
"KeywordExtract",
"KeywordExtractParam",
"Concentrator",
"ConcentratorParam",
"Baidu",
"BaiduParam",
"DuckDuckGo",
"DuckDuckGoParam",
"Wikipedia",
"WikipediaParam",
"PubMed",
"PubMedParam",
"ArXiv",
"ArXivParam",
"Google",
"GoogleParam",
"Bing",
"BingParam",
"GoogleScholar",
"GoogleScholarParam",
"DeepL",
"DeepLParam",
"GitHub",
"GitHubParam",
"BaiduFanyi",
"BaiduFanyiParam",
"QWeather",
"QWeatherParam",
"ExeSQL",
"ExeSQLParam",
"YahooFinance",
"YahooFinanceParam",
"WenCai",
"WenCaiParam",
"Jin10",
"Jin10Param",
"TuShare",
"TuShareParam",
"AkShare",
"AkShareParam",
"Crawler",
"CrawlerParam",
"Invoke",
"InvokeParam",
"Template",
"TemplateParam",
"Email",
"EmailParam",
"component_class"
]
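The block above is the entire change for this file: the component classes are imported so that `component_class()` can resolve them by name, and the new explicit `__all__` marks them as intentional re-exports, which keeps an unused-import check (Ruff rule F401) from flagging them. The same idea shows up later in api/db/services/__init__.py as the `from .user_service import UserService as UserService` alias. A minimal, self-contained sketch of the pattern (standard-library names used purely for illustration, not repository code):

```python
# re_exports.py - illustrative module only.
# The imports exist to be re-exported; without __all__ (or an "import X as X"
# alias) a linter such as Ruff would report them as unused (F401).
from math import sqrt, floor

__all__ = ["sqrt", "floor"]  # names listed in __all__ count as used/re-exported
```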

agent/component/base.py (+10, -5)

def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
o = getattr(self._param, self._param.output_var_name)
if not isinstance(o, partial) and not isinstance(o, pd.DataFrame):
- if not isinstance(o, list): o = [o]
+ if not isinstance(o, list):
+ o = [o]
o = pd.DataFrame(o)

if allow_partial or not isinstance(o, partial):
for oo in o():
if not isinstance(oo, pd.DataFrame):
outs = pd.DataFrame(oo if isinstance(oo, list) else [oo])
- else: outs = oo
+ else:
+ outs = oo
return self._param.output_var_name, outs

def reset(self):
outs.append(pd.DataFrame([{"content": q["value"]}]))
if outs:
df = pd.concat(outs, ignore_index=True)
- if "content" in df: df = df.drop_duplicates(subset=['content']).reset_index(drop=True)
+ if "content" in df:
+ df = df.drop_duplicates(subset=['content']).reset_index(drop=True)
return df

upstream_outs = []

for u in reversed_cpnts[::-1]:
- if self.get_component_name(u) in ["switch", "concentrator"]: continue
+ if self.get_component_name(u) in ["switch", "concentrator"]:
+ continue
if self.component_name.lower() == "generate" and self.get_component_name(u) == "retrieval":
o = self._canvas.get_component(u)["obj"].output(allow_partial=False)[1]
if o is not None:
reversed_cpnts.extend(self._canvas.path[-1])

for u in reversed_cpnts[::-1]:
- if self.get_component_name(u) in ["switch", "answer"]: continue
+ if self.get_component_name(u) in ["switch", "answer"]:
+ continue
return self._canvas.get_component(u)["obj"].output()[1]

@staticmethod

agent/component/categorize.py (+8, -5)

super().check()
self.check_empty(self.category_description, "[Categorize] Category examples")
for k, v in self.category_description.items():
- if not k: raise ValueError("[Categorize] Category name can not be empty!")
- if not v.get("to"): raise ValueError(f"[Categorize] 'To' of category {k} can not be empty!")
+ if not k:
+ raise ValueError("[Categorize] Category name can not be empty!")
+ if not v.get("to"):
+ raise ValueError(f"[Categorize] 'To' of category {k} can not be empty!")

def get_prompt(self):
cate_lines = []
for c, desc in self.category_description.items():
- for l in desc.get("examples", "").split("\n"):
- if not l: continue
- cate_lines.append("Question: {}\tCategory: {}".format(l, c))
+ for line in desc.get("examples", "").split("\n"):
+ if not line:
+ continue
+ cate_lines.append("Question: {}\tCategory: {}".format(line, c))
descriptions = []
for c, desc in self.category_description.items():
if desc.get("description"):

agent/component/deepl.py (+0, -1)

# limitations under the License.
#
from abc import ABC
- import re
from agent.component.base import ComponentBase, ComponentParamBase
import deepl



agent/component/exesql.py (+4, -2)

self.check_empty(self.password, "Database password")
self.check_positive_integer(self.top_n, "Number of records")
if self.database == "rag_flow":
- if self.host == "ragflow-mysql": raise ValueError("The host is not accessible.")
- if self.password == "infini_rag_flow": raise ValueError("The host is not accessible.")
+ if self.host == "ragflow-mysql":
+ raise ValueError("The host is not accessible.")
+ if self.password == "infini_rag_flow":
+ raise ValueError("The host is not accessible.")

class ExeSQL(ComponentBase, ABC):

agent/component/generate.py (+24, -12)

def gen_conf(self):
conf = {}
- if self.max_tokens > 0: conf["max_tokens"] = self.max_tokens
- if self.temperature > 0: conf["temperature"] = self.temperature
- if self.top_p > 0: conf["top_p"] = self.top_p
- if self.presence_penalty > 0: conf["presence_penalty"] = self.presence_penalty
- if self.frequency_penalty > 0: conf["frequency_penalty"] = self.frequency_penalty
+ if self.max_tokens > 0:
+ conf["max_tokens"] = self.max_tokens
+ if self.temperature > 0:
+ conf["temperature"] = self.temperature
+ if self.top_p > 0:
+ conf["top_p"] = self.top_p
+ if self.presence_penalty > 0:
+ conf["presence_penalty"] = self.presence_penalty
+ if self.frequency_penalty > 0:
+ conf["frequency_penalty"] = self.frequency_penalty
return conf

recall_docs = []
for i in idx:
did = retrieval_res.loc[int(i), "doc_id"]
- if did in doc_ids: continue
+ if did in doc_ids:
+ continue
doc_ids.add(did)
recall_docs.append({"doc_id": did, "doc_name": retrieval_res.loc[int(i), "docnm_kwd"]})

retrieval_res = []
self._param.inputs = []
for para in self._param.parameters:
- if not para.get("component_id"): continue
+ if not para.get("component_id"):
+ continue
component_id = para["component_id"].split("@")[0]
if para["component_id"].lower().find("@") >= 0:
cpn_id, key = para["component_id"].split("@")

if retrieval_res:
retrieval_res = pd.concat(retrieval_res, ignore_index=True)
- else: retrieval_res = pd.DataFrame([])
+ else:
+ retrieval_res = pd.DataFrame([])

for n, v in kwargs.items():
prompt = re.sub(r"\{%s\}" % re.escape(n), str(v).replace("\\", " "), prompt)
return pd.DataFrame([res])

msg = self._canvas.get_history(self._param.message_history_window_size)
- if len(msg) < 1: msg.append({"role": "user", "content": ""})
+ if len(msg) < 1:
+ msg.append({"role": "user", "content": ""})
_, msg = message_fit_in([{"role": "system", "content": prompt}, *msg], int(chat_mdl.max_length * 0.97))
- if len(msg) < 2: msg.append({"role": "user", "content": ""})
+ if len(msg) < 2:
+ msg.append({"role": "user", "content": ""})
ans = chat_mdl.chat(msg[0]["content"], msg[1:], self._param.gen_conf())

if self._param.cite and "content_ltks" in retrieval_res.columns and "vector" in retrieval_res.columns:
return

msg = self._canvas.get_history(self._param.message_history_window_size)
- if len(msg) < 1: msg.append({"role": "user", "content": ""})
+ if len(msg) < 1:
+ msg.append({"role": "user", "content": ""})
_, msg = message_fit_in([{"role": "system", "content": prompt}, *msg], int(chat_mdl.max_length * 0.97))
- if len(msg) < 2: msg.append({"role": "user", "content": ""})
+ if len(msg) < 2:
+ msg.append({"role": "user", "content": ""})
answer = ""
for ans in chat_mdl.chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf()):
res = {"content": ans, "reference": []}

agent/component/rewrite.py (+2, -1)

hist = self._canvas.get_history(4)
conv = []
for m in hist:
- if m["role"] not in ["user", "assistant"]: continue
+ if m["role"] not in ["user", "assistant"]:
+ continue
conv.append("{}: {}".format(m["role"].upper(), m["content"]))
conv = "\n".join(conv)



agent/component/switch.py (+10, -7)

def check(self):
self.check_empty(self.conditions, "[Switch] conditions")
for cond in self.conditions:
- if not cond["to"]: raise ValueError(f"[Switch] 'To' can not be empty!")
+ if not cond["to"]:
+ raise ValueError("[Switch] 'To' can not be empty!")

class Switch(ComponentBase, ABC):
res = []
for cond in self._param.conditions:
for item in cond["items"]:
- if not item["cpn_id"]: continue
+ if not item["cpn_id"]:
+ continue
if item["cpn_id"].find("begin") >= 0:
continue
cid = item["cpn_id"].split("@")[0]
for cond in self._param.conditions:
res = []
for item in cond["items"]:
- if not item["cpn_id"]:continue
+ if not item["cpn_id"]:
+ continue
cid = item["cpn_id"].split("@")[0]
if item["cpn_id"].find("@") > 0:
cpn_id, key = item["cpn_id"].split("@")
elif operator == ">":
try:
return True if float(input) > float(value) else False
- except Exception as e:
+ except Exception:
return True if input > value else False
elif operator == "<":
try:
return True if float(input) < float(value) else False
- except Exception as e:
+ except Exception:
return True if input < value else False
elif operator == "≥":
try:
return True if float(input) >= float(value) else False
- except Exception as e:
+ except Exception:
return True if input >= value else False
elif operator == "≤":
try:
return True if float(input) <= float(value) else False
- except Exception as e:
+ except Exception:
return True if input <= value else False

raise ValueError('Not supported operator' + operator)

agent/component/template.py (+2, -1)

self._param.inputs = []
for para in self._param.parameters:
- if not para.get("component_id"): continue
+ if not para.get("component_id"):
+ continue
component_id = para["component_id"].split("@")[0]
if para["component_id"].lower().find("@") >= 0:
cpn_id, key = para["component_id"].split("@")

agent/test/client.py (+2, -1)

else:
print(ans["content"])

- if DEBUG: print(canvas.path)
+ if DEBUG:
+ print(canvas.path)
question = input("\n==================== User =====================\n> ")
canvas.add_user_input(question)

api/apps/api_app.py (+8, -5)

if not objs:
return get_json_result(
data=False, message='Token is not valid!"', code=settings.RetCode.AUTHENTICATION_ERROR)
- req = request.json
try:
if objs[0].source == "agent":
e, cvs = UserCanvasService.get_by_id(objs[0].dialog_id)
e, conv = API4ConversationService.get_by_id(req["conversation_id"])
if not e:
return get_data_error_result(message="Conversation not found!")
- if "quote" not in req: req["quote"] = False
+ if "quote" not in req:
+ req["quote"] = False

msg = []
for m in req["messages"]:
if m["role"] == "assistant" and not msg:
continue
msg.append(m)
- if not msg[-1].get("id"): msg[-1]["id"] = get_uuid()
+ if not msg[-1].get("id"):
+ msg[-1]["id"] = get_uuid()
message_id = msg[-1]["id"]

def fillin_conv(ans):
e, conv = API4ConversationService.get_by_id(req["conversation_id"])
if not e:
return get_data_error_result(message="Conversation not found!")
- if "quote" not in req: req["quote"] = True
+ if "quote" not in req:
+ req["quote"] = True

msg = []
msg.append({"role": "user", "content": req["word"]})
- if not msg[-1].get("id"): msg[-1]["id"] = get_uuid()
+ if not msg[-1].get("id"):
+ msg[-1]["id"] = get_uuid()
message_id = msg[-1]["id"]

def fillin_conv(ans):

api/apps/canvas_app.py (+4, -4)

# See the License for the specific language governing permissions and
# limitations under the License.
#
- import logging
import json
import traceback
- from functools import partial
from flask import request, Response
from flask_login import login_required, current_user
from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
def save():
req = request.json
req["user_id"] = current_user.id
- if not isinstance(req["dsl"], str): req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)
+ if not isinstance(req["dsl"], str):
+ req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)

req["dsl"] = json.loads(req["dsl"])
if "id" not in req:
return resp

for answer in canvas.run(stream=False):
- if answer.get("running_status"): continue
+ if answer.get("running_status"):
+ continue
final_ans["content"] = "\n".join(answer["content"]) if "content" in answer else ""
canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
if final_ans.get("reference"):

api/apps/chunk_app.py (+2, -1)

e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
return get_data_error_result(message="Knowledgebase not found!")
- if kb.pagerank: d["pagerank_fea"] = kb.pagerank
+ if kb.pagerank:
+ d["pagerank_fea"] = kb.pagerank

embd_id = DocumentService.get_embd_id(req["doc_id"])
embd_mdl = LLMBundle(tenant_id, LLMType.EMBEDDING.value, embd_id)

api/apps/conversation_app.py (+4, -2)

if req["message_id"] == msg.get("id", "") and msg.get("role", "") == "assistant":
if up_down:
msg["thumbup"] = True
- if "feedback" in msg: del msg["feedback"]
+ if "feedback" in msg:
+ del msg["feedback"]
else:
msg["thumbup"] = False
- if feedback: msg["feedback"] = feedback
+ if feedback:
+ msg["feedback"] = feedback
break

ConversationService.update_by_id(conv["id"], conv)

api/apps/dialog_app.py (+4, -2)

top_n = req.get("top_n", 6)
top_k = req.get("top_k", 1024)
rerank_id = req.get("rerank_id", "")
- if not rerank_id: req["rerank_id"] = ""
+ if not rerank_id:
+ req["rerank_id"] = ""
similarity_threshold = req.get("similarity_threshold", 0.1)
vector_similarity_weight = req.get("vector_similarity_weight", 0.3)
- if vector_similarity_weight is None: vector_similarity_weight = 0.3
+ if vector_similarity_weight is None:
+ vector_similarity_weight = 0.3
llm_setting = req.get("llm_setting", {})
default_prompt = {
"system": """你是一个智能助手,请总结知识库的内容来回答问题,请列举知识库中的数据详细回答。当所有知识库内容都与问题无关时,你的回答必须包括“知识库中未找到您要的答案!”这句话。回答需要考虑聊天历史。

api/apps/document_app.py (+4, -3)

# See the License for the specific language governing permissions and
# limitations under the License
#
- import json
import os.path
import pathlib
import re
raise LookupError("Can't find this knowledgebase!")

blob = html2pdf(url)
- if not blob: return server_error_response(ValueError("Download failure."))
+ if not blob:
+ return server_error_response(ValueError("Download failure."))

root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
def rm():
req = request.json
doc_ids = req["doc_id"]
- if isinstance(doc_ids, str): doc_ids = [doc_ids]
+ if isinstance(doc_ids, str):
+ doc_ids = [doc_ids]

for doc_id in doc_ids:
if not DocumentService.accessible4deletion(doc_id, current_user.id):

api/apps/llm_app.py (+4, -2)

llm_set = set([m["llm_name"] + "@" + m["fid"] for m in llms])
for o in objs:
- if not o.api_key: continue
- if o.llm_name + "@" + o.llm_factory in llm_set: continue
+ if not o.api_key:
+ continue
+ if o.llm_name + "@" + o.llm_factory in llm_set:
+ continue
llms.append({"llm_name": o.llm_name, "model_type": o.model_type, "fid": o.llm_factory, "available": True})

res = {}

api/apps/sdk/agent.py (+1, -1)

# limitations under the License.
#

- from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
+ from api.db.services.canvas_service import UserCanvasService
from api.utils.api_utils import get_error_data_result, token_required
from api.utils.api_utils import get_result
from flask import request

api/apps/sdk/doc.py (+2, -3)

from rag.nlp import search
from rag.utils import rmSpace
from rag.utils.storage_factory import STORAGE_IMPL
- import os

MAXIMUM_OF_UPLOADING_FILES = 256

if not req.get("content"):
return get_error_data_result(message="`content` is required")
if "important_keywords" in req:
- if type(req["important_keywords"]) != list:
+ if not isinstance(req["important_keywords"], list):
return get_error_data_result(
"`important_keywords` is required to be a list"
)
if "questions" in req:
- if type(req["questions"]) != list:
+ if not isinstance(req["questions"], list):
return get_error_data_result(
"`questions` is required to be a list"
)

api/apps/sdk/session.py (+8, -4)

}
conv.message.append(question)
for m in conv.message:
- if m["role"] == "system": continue
- if m["role"] == "assistant" and not msg: continue
+ if m["role"] == "system":
+ continue
+ if m["role"] == "assistant" and not msg:
+ continue
msg.append(m)
message_id = msg[-1].get("id")
e, dia = DialogService.get_by_id(conv.dialog_id)
if m["role"] == "assistant" and not msg:
continue
msg.append(m)
- if not msg[-1].get("id"): msg[-1]["id"] = get_uuid()
+ if not msg[-1].get("id"):
+ msg[-1]["id"] = get_uuid()
message_id = msg[-1]["id"]

stream = req.get("stream", True)
return resp

for answer in canvas.run(stream=False):
- if answer.get("running_status"): continue
+ if answer.get("running_status"):
+ continue
final_ans["content"] = "\n".join(answer["content"]) if "content" in answer else ""
canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
if final_ans.get("reference"):

api/apps/user_app.py (+1, -1)

headers=headers,
).json()
user_info["email"] = next(
- (email for email in email_info if email["primary"] == True), None
+ (email for email in email_info if email["primary"]), None
)["email"]
return user_info



api/db/db_models.py (+1, -1)

for p in cls.__bases__:
if p in CONTINUOUS_FIELD_TYPE:
return True
- elif p != Field and p != object:
+ elif p is not Field and p is not object:
if is_continuous_field(p):
return True
else:

api/db/init_data.py (+1, -1)

cnvs = json.load(open(os.path.join(dir, fnm), "r"))
try:
CanvasTemplateService.save(**cnvs)
- except:
+ except Exception:
CanvasTemplateService.update_by_id(cnvs["id"], cnvs)
except Exception:
logging.exception("Add graph templates error: ")

api/db/services/__init__.py (+5, -4)

#
import pathlib
import re
- from .user_service import UserService
+ from .user_service import UserService as UserService

def duplicate_name(query_func, **kwargs):
fnm = kwargs["name"]
objs = query_func(**kwargs)
- if not objs: return fnm
+ if not objs:
+ return fnm
ext = pathlib.Path(fnm).suffix #.jpg
nm = re.sub(r"%s$"%ext, "", fnm)
r = re.search(r"\(([0-9]+)\)$", nm)
nm = re.sub(r"\([0-9]+\)$", "", nm)
c += 1
nm = f"{nm}({c})"
- if ext: nm += f"{ext}"
+ if ext:
+ nm += f"{ext}"

kwargs["name"] = nm
return duplicate_name(query_func, **kwargs)


api/db/services/api_service.py (+2, -1)

@classmethod
@DB.connection_context()
def stats(cls, tenant_id, from_date, to_date, source=None):
- if len(to_date) == 10: to_date += " 23:59:59"
+ if len(to_date) == 10:
+ to_date += " 23:59:59"
return cls.model.select(
cls.model.create_date.truncate("day").alias("dt"),
peewee.fn.COUNT(

api/db/services/canvas_service.py (+1, -3)

# See the License for the specific language governing permissions and
# limitations under the License.
#
- from datetime import datetime
- import peewee
- from api.db.db_models import DB, API4Conversation, APIToken, Dialog, CanvasTemplate, UserCanvas
+ from api.db.db_models import DB, CanvasTemplate, UserCanvas
from api.db.services.common_service import CommonService





api/db/services/common_service.py (+1, -1)

try:
obj = cls.model.query(id=pid)[0]
return True, obj
- except Exception as e:
+ except Exception:
return False, None

@classmethod

api/db/services/dialog_service.py (+24, -14)

return c, msg

ll = num_tokens_from_string(msg_[0]["content"])
- l = num_tokens_from_string(msg_[-1]["content"])
- if ll / (ll + l) > 0.8:
+ ll2 = num_tokens_from_string(msg_[-1]["content"])
+ if ll / (ll + ll2) > 0.8:
m = msg_[0]["content"]
- m = encoder.decode(encoder.encode(m)[:max_length - l])
+ m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[0]["content"] = m
return max_length, msg

m = msg_[1]["content"]
- m = encoder.decode(encoder.encode(m)[:max_length - l])
+ m = encoder.decode(encoder.encode(m)[:max_length - ll2])
msg[1]["content"] = m
return max_length, msg

idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
recall_docs = [
d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
- if not recall_docs: recall_docs = kbinfos["doc_aggs"]
+ if not recall_docs:
+ recall_docs = kbinfos["doc_aggs"]
kbinfos["doc_aggs"] = recall_docs

refs = deepcopy(kbinfos)
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
No other words needed except 'yes' or 'no'.
"""
- if not contents:return False
+ if not contents:
+ return False
contents = "Documents: \n" + " - ".join(contents)
contents = f"Question: {question}\n" + contents
if num_tokens_from_string(contents) >= chat_mdl.max_length - 4:
contents = encoder.decode(encoder.encode(contents)[:chat_mdl.max_length - 4])
ans = chat_mdl.chat(prompt, [{"role": "user", "content": contents}], {"temperature": 0.01})
- if ans.lower().find("yes") >= 0: return True
+ if ans.lower().find("yes") >= 0:
+ return True
return False

]
_, msg = message_fit_in(msg, chat_mdl.max_length)
kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
- if isinstance(kwd, tuple): kwd = kwd[0]
- if kwd.find("**ERROR**") >=0: return ""
+ if isinstance(kwd, tuple):
+ kwd = kwd[0]
+ if kwd.find("**ERROR**") >=0:
+ return ""
return kwd

]
_, msg = message_fit_in(msg, chat_mdl.max_length)
kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
- if isinstance(kwd, tuple): kwd = kwd[0]
- if kwd.find("**ERROR**") >= 0: return ""
+ if isinstance(kwd, tuple):
+ kwd = kwd[0]
+ if kwd.find("**ERROR**") >= 0:
+ return ""
return kwd

chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
conv = []
for m in messages:
- if m["role"] not in ["user", "assistant"]: continue
+ if m["role"] not in ["user", "assistant"]:
+ continue
conv.append("{}: {}".format(m["role"].upper(), m["content"]))
conv = "\n".join(conv)
today = datetime.date.today().isoformat()

def tts(tts_mdl, text):
- if not tts_mdl or not text: return
+ if not tts_mdl or not text:
+ return
bin = b""
for chunk in tts_mdl.tts(text):
bin += chunk
idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
recall_docs = [
d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
- if not recall_docs: recall_docs = kbinfos["doc_aggs"]
+ if not recall_docs:
+ recall_docs = kbinfos["doc_aggs"]
kbinfos["doc_aggs"] = recall_docs
refs = deepcopy(kbinfos)
for c in refs["chunks"]:

api/db/services/document_service.py (+2, -1)

try:
mind_map = json.dumps(mindmap([c["content_with_weight"] for c in docs if c["doc_id"] == doc_id]).output,
ensure_ascii=False, indent=2)
- if len(mind_map) < 32: raise Exception("Few content: " + mind_map)
+ if len(mind_map) < 32:
+ raise Exception("Few content: " + mind_map)
cks.append({
"id": get_uuid(),
"doc_id": doc_id,

api/db/services/file2document_service.py (+2, -2)

from api.db.db_models import File, File2Document
from api.db.services.common_service import CommonService
from api.db.services.document_service import DocumentService
- from api.utils import current_timestamp, datetime_format, get_uuid
+ from api.utils import current_timestamp, datetime_format

class File2DocumentService(CommonService):
def update_by_file_id(cls, file_id, obj):
obj["update_time"] = current_timestamp()
obj["update_date"] = datetime_format(datetime.now())
- num = cls.model.update(obj).where(cls.model.id == file_id).execute()
+ # num = cls.model.update(obj).where(cls.model.id == file_id).execute()
e, obj = cls.get_by_id(cls.model.id)
return obj



api/db/services/file_service.py (+4, -2)

.join(Document, on=(File2Document.document_id == Document.id))
.join(Knowledgebase, on=(Knowledgebase.id == Document.kb_id))
.where(cls.model.id == file_id))
- if not kbs: return []
+ if not kbs:
+ return []
kbs_info_list = []
for kb in list(kbs.dicts()):
kbs_info_list.append({"kb_id": kb['id'], "kb_name": kb['name']})
@classmethod
@DB.connection_context()
def add_file_from_kb(cls, doc, kb_folder_id, tenant_id):
- for _ in File2DocumentService.get_by_document_id(doc["id"]): return
+ for _ in File2DocumentService.get_by_document_id(doc["id"]):
+ return
file = {
"id": get_uuid(),
"parent_id": kb_folder_id,

api/db/services/llm_service.py (+2, -1)

model_config = cls.get_api_key(tenant_id, mdlnm)
mdlnm, fid = TenantLLMService.split_model_name_and_factory(mdlnm)
- if model_config: model_config = model_config.to_dict()
+ if model_config:
+ model_config = model_config.to_dict()
if not model_config:
if llm_type in [LLMType.EMBEDDING, LLMType.RERANK]:
llm = LLMService.query(llm_name=mdlnm) if not fid else LLMService.query(llm_name=mdlnm, fid=fid)

api/db/services/task_service.py (+66, -34)

Tenant.img2txt_id,
Tenant.asr_id,
Tenant.llm_id,
- cls.model.update_time]
- docs = cls.model.select(*fields) \
- .join(Document, on=(cls.model.doc_id == Document.id)) \
- .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id)) \
- .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id)) \
+ cls.model.update_time,
+ ]
+ docs = (
+ cls.model.select(*fields)
+ .join(Document, on=(cls.model.doc_id == Document.id))
+ .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
+ .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
.where(cls.model.id == task_id)
+ )
docs = list(docs.dicts())
- if not docs: return None
+ if not docs:
+ return None

msg = "\nTask has been received."
- prog = random.random() / 10.
+ prog = random.random() / 10.0
if docs[0]["retry_count"] >= 3:
msg = "\nERROR: Task is abandoned after 3 times attempts."
prog = -1

- cls.model.update(progress_msg=cls.model.progress_msg + msg,
- progress=prog,
- retry_count=docs[0]["retry_count"]+1
- ).where(
- cls.model.id == docs[0]["id"]).execute()
+ cls.model.update(
+ progress_msg=cls.model.progress_msg + msg,
+ progress=prog,
+ retry_count=docs[0]["retry_count"] + 1,
+ ).where(cls.model.id == docs[0]["id"]).execute()

- if docs[0]["retry_count"] >= 3: return None
+ if docs[0]["retry_count"] >= 3:
+ return None

return docs[0]

@DB.connection_context()
def get_ongoing_doc_name(cls):
with DB.lock("get_task", -1):
- docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
- .join(Document, on=(cls.model.doc_id == Document.id)) \
- .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
- .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
+ docs = (
+ cls.model.select(
+ *[Document.id, Document.kb_id, Document.location, File.parent_id]
+ )
+ .join(Document, on=(cls.model.doc_id == Document.id))
+ .join(
+ File2Document,
+ on=(File2Document.document_id == Document.id),
+ join_type=JOIN.LEFT_OUTER,
+ )
+ .join(
+ File,
+ on=(File2Document.file_id == File.id),
+ join_type=JOIN.LEFT_OUTER,
+ )
.where(
Document.status == StatusEnum.VALID.value,
Document.run == TaskStatus.RUNNING.value,
~(Document.type == FileType.VIRTUAL.value),
cls.model.progress < 1,
- cls.model.create_time >= current_timestamp() - 1000 * 600
+ cls.model.create_time >= current_timestamp() - 1000 * 600,
)
+ )
docs = list(docs.dicts())
- if not docs: return []
- return list(set([(d["parent_id"] if d["parent_id"] else d["kb_id"], d["location"]) for d in docs]))
+ if not docs:
+ return []
+ return list(
+ set(
+ [
+ (
+ d["parent_id"] if d["parent_id"] else d["kb_id"],
+ d["location"],
+ )
+ for d in docs
+ ]
+ )
+ )

@classmethod
@DB.connection_context()
def update_progress(cls, id, info):
if os.environ.get("MACOS"):
if info["progress_msg"]:
- cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
- cls.model.id == id).execute()
+ cls.model.update(
+ progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]
+ ).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
- cls.model.id == id).execute()
+ cls.model.id == id
+ ).execute()
return

with DB.lock("update_progress", -1):
if info["progress_msg"]:
- cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
- cls.model.id == id).execute()
+ cls.model.update(
+ progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]
+ ).where(cls.model.id == id).execute()
if "progress" in info:
cls.model.update(progress=info["progress"]).where(
- cls.model.id == id).execute()
+ cls.model.id == id
+ ).execute()

def queue_tasks(doc: dict, bucket: str, name: str):
def new_task():
- return {
- "id": get_uuid(),
- "doc_id": doc["id"]
- }
+ return {"id": get_uuid(), "doc_id": doc["id"]}

tsks = []

if doc["type"] == FileType.PDF.value:
if doc["parser_id"] == "paper":
page_size = doc["parser_config"].get("task_page_size", 22)
if doc["parser_id"] in ["one", "knowledge_graph"] or not do_layout:
- page_size = 10 ** 9
- page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
+ page_size = 10**9
+ page_ranges = doc["parser_config"].get("pages") or [(1, 10**5)]
for s, e in page_ranges:
s -= 1
s = max(0, s)
DocumentService.begin2parse(doc["id"])

for t in tsks:
- assert REDIS_CONN.queue_product(SVR_QUEUE_NAME, message=t), "Can't access Redis. Please check the Redis' status."
+ assert REDIS_CONN.queue_product(
+ SVR_QUEUE_NAME, message=t
+ ), "Can't access Redis. Please check the Redis' status."

api/db/services/user_service.py (+1, -1)

from api.db.db_models import DB, UserTenant
from api.db.db_models import User, Tenant
from api.db.services.common_service import CommonService
- from api.utils import get_uuid, get_format_time, current_timestamp, datetime_format
+ from api.utils import get_uuid, current_timestamp, datetime_format
from api.db import StatusEnum





api/ragflow_server.py (+3, -3)

import logging
import os
from api.utils.log_utils import initRootLogger
- LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
- initRootLogger("ragflow_server", LOG_LEVELS)

- import os
import signal
import sys
import time
from api.utils import show_configs
from rag.settings import print_rag_settings

+ LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
+ initRootLogger("ragflow_server", LOG_LEVELS)

def update_progress():
while True:

api/utils/api_utils.py (+0, -1)

from api.db.db_models import APIToken
from api import settings

- from api import settings
from api.utils import CustomJSONEncoder, get_uuid
from api.utils import json_dumps
from api.constants import REQUEST_WAIT_SEC, REQUEST_MAX_WAIT_SEC

api/validation.py (+1, -1)

pool = Pool(processes=1)
thread = pool.apply_async(download_nltk_data)
binary = thread.get(timeout=60)
- except Exception as e:
+ except Exception:
print('\x1b[6;37;41m WARNING \x1b[0m' + "Downloading NLTK data failure.", flush=True)

deepdoc/parser/__init__.py (+13, -1)

from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
- from .txt_parser import RAGFlowTxtParser as TxtParser
+ from .txt_parser import RAGFlowTxtParser as TxtParser
+
+ __all__ = [
+ "PdfParser",
+ "PlainParser",
+ "DocxParser",
+ "ExcelParser",
+ "PptParser",
+ "HtmlParser",
+ "JsonParser",
+ "MarkdownParser",
+ "TxtParser",
+ ]

deepdoc/parser/excel_parser.py (+12, -8)

for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
- if not rows: continue
+ if not rows:
+ continue

tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
- for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
+ for r in list(
+ rows[1 + chunk_i * chunk_rows : 1 + (chunk_i + 1) * chunk_rows]
+ ):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
- if not rows:continue
+ if not rows:
+ continue
ti = list(rows[0])
for r in list(rows[1:]):
- l = []
+ fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += (":" if t else "") + str(c.value)
- l.append(t)
- l = "; ".join(l)
+ fields.append(t)
+ line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
- l += " ——" + sheetname
- res.append(l)
+ line += " ——" + sheetname
+ res.append(line)
return res

@staticmethod

deepdoc/parser/html_parser.py (+1, -1)

@classmethod
def parser_txt(cls, txt):
- if type(txt) != str:
+ if not isinstance(txt, str):
raise TypeError("txt type should be str!")
html_doc = readability.Document(txt)
title = html_doc.title()

deepdoc/parser/json_parser.py (+1, -1)

txt = binary.decode(encoding, errors="ignore")
json_data = json.loads(txt)
chunks = self.split_json(json_data, True)
- sections = [json.dumps(l, ensure_ascii=False) for l in chunks if l]
+ sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
return sections

@staticmethod

+9 -8   deepdoc/parser/pdf_parser.py

"x1": np.max([b["x1"] for b in bxs]),
"bottom": np.max([b["bottom"] for b in bxs]) - ht
}
- louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
+ louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
ii = Recognizer.find_overlapped(b, louts, naive=True)
if ii is not None:
b = louts[ii]
"layoutno", "")))

left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
- if right < left: right = left + 1
+ if right < left:
+     right = left + 1
poss.append((pn + self.page_from, left, right, top, bott))
return self.page_images[pn] \
.crop((left * ZM, top * ZM,
top = bx["top"] - self.page_cum_height[pn[0] - 1]
bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
page_images_cnt = len(self.page_images)
- if pn[-1] - 1 >= page_images_cnt: return ""
+ if pn[-1] - 1 >= page_images_cnt:
+     return ""
while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
bott -= self.page_images[pn[-1] - 1].size[1] / ZM
pn.append(pn[-1] + 1)
nonlocal mh, pw, lines, widths
lines.append(line)
widths.append(width(line))
width_mean = np.mean(widths)
mmj = self.proj_match(
line["text"]) or line.get(
"layout_type",
else:
self.is_english = False

- st = timer()
+ # st = timer()
for i, img in enumerate(self.page_images_x2):
chars = self.page_chars[i] if not self.is_english else []
self.mean_height.append(

self.page_cum_height = np.cumsum(self.page_cum_height)
assert len(self.page_cum_height) == len(self.page_images) + 1
- if len(self.boxes) == 0 and zoomin < 9: self.__images__(fnm, zoomin * 3, page_from,
-     page_to, callback)
+ if len(self.boxes) == 0 and zoomin < 9:
+     self.__images__(fnm, zoomin * 3, page_from, page_to, callback)

def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
self.__images__(fnm, zoomin)
if not self.outlines:
logging.warning("Miss outlines")

- return [(l, "") for l in lines], []
+ return [(line, "") for line in lines], []

def crop(self, ck, need_position):
raise NotImplementedError
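Several hunks in this file rename the single-letter variable `l` (Ruff E741, ambiguous with `1` and `I`) to something readable such as `layout` or `line`. An illustrative before/after, not tied to the parser's real data structures:

```python
page_layout = [{"type": "table"}, {"type": "text"}]  # hypothetical layout boxes
ltype = "table"

# Before (E741): louts = [l for l in page_layout if l["type"] == ltype]
louts = [layout for layout in page_layout if layout["type"] == ltype]
print(louts)  # [{'type': 'table'}]
```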

+60 -19   deepdoc/parser/resume/__init__.py

def refactor(cv):
- for n in ["raw_txt", "parser_name", "inference", "ori_text", "use_time", "time_stat"]:
-     if n in cv and cv[n] is not None: del cv[n]
+ for n in [
+     "raw_txt",
+     "parser_name",
+     "inference",
+     "ori_text",
+     "use_time",
+     "time_stat",
+ ]:
+     if n in cv and cv[n] is not None:
+         del cv[n]
cv["is_deleted"] = 0
- if "basic" not in cv: cv["basic"] = {}
- if cv["basic"].get("photo2"): del cv["basic"]["photo2"]
+ if "basic" not in cv:
+     cv["basic"] = {}
+ if cv["basic"].get("photo2"):
+     del cv["basic"]["photo2"]

- for n in ["education", "work", "certificate", "project", "language", "skill", "training"]:
-     if n not in cv or cv[n] is None: continue
-     if type(cv[n]) == type({}): cv[n] = [v for _, v in cv[n].items()]
-     if type(cv[n]) != type([]):
+ for n in [
+     "education",
+     "work",
+     "certificate",
+     "project",
+     "language",
+     "skill",
+     "training",
+ ]:
+     if n not in cv or cv[n] is None:
+         continue
+     if isinstance(cv[n], dict):
+         cv[n] = [v for _, v in cv[n].items()]
+     if not isinstance(cv[n], list):
del cv[n]
continue
vv = []
for v in cv[n]:
- if "external" in v and v["external"] is not None: del v["external"]
+ if "external" in v and v["external"] is not None:
+     del v["external"]
vv.append(v)
cv[n] = {str(i): vv[i] for i in range(len(vv))}

cv["basic"][t] = cv["basic"][n]
del cv["basic"][n]

- work = sorted([v for _, v in cv.get("work", {}).items()], key=lambda x: x.get("start_time", ""))
- edu = sorted([v for _, v in cv.get("education", {}).items()], key=lambda x: x.get("start_time", ""))
+ work = sorted(
+     [v for _, v in cv.get("work", {}).items()],
+     key=lambda x: x.get("start_time", ""),
+ )
+ edu = sorted(
+     [v for _, v in cv.get("education", {}).items()],
+     key=lambda x: x.get("start_time", ""),
+ )

if work:
cv["basic"]["work_start_time"] = work[0].get("start_time", "")
- cv["basic"]["management_experience"] = 'Y' if any(
-     [w.get("management_experience", '') == 'Y' for w in work]) else 'N'
+ cv["basic"]["management_experience"] = (
+     "Y"
+     if any([w.get("management_experience", "") == "Y" for w in work])
+     else "N"
+ )
cv["basic"]["annual_salary"] = work[-1].get("annual_salary_from", "0")

- for n in ["annual_salary_from", "annual_salary_to", "industry_name", "position_name", "responsibilities",
-     "corporation_type", "scale", "corporation_name"]:
+ for n in [
+     "annual_salary_from",
+     "annual_salary_to",
+     "industry_name",
+     "position_name",
+     "responsibilities",
+     "corporation_type",
+     "scale",
+     "corporation_name",
+ ]:
cv["basic"][n] = work[-1].get(n, "")

if edu:
for n in ["school_name", "discipline_name"]:
- if n in edu[-1]: cv["basic"][n] = edu[-1][n]
+ if n in edu[-1]:
+     cv["basic"][n] = edu[-1][n]

cv["basic"]["updated_at"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- if "contact" not in cv: cv["contact"] = {}
- if not cv["contact"].get("name"): cv["contact"]["name"] = cv["basic"].get("name", "")
- return cv
+ if "contact" not in cv:
+     cv["contact"] = {}
+ if not cv["contact"].get("name"):
+     cv["contact"]["name"] = cv["basic"].get("name", "")
+ return cv
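The core of `refactor(cv)` is the same before and after: sections that arrive as dicts are flattened to lists, anything that is neither is dropped, and the result is re-keyed by index. A condensed sketch of that normalization with `isinstance` in place of `type() ==` comparisons (field names are illustrative):

```python
def normalize_section(cv, key):
    # Accept either a dict of items or a list; anything else is removed.
    if key not in cv or cv[key] is None:
        return
    if isinstance(cv[key], dict):
        cv[key] = list(cv[key].values())
    if not isinstance(cv[key], list):
        del cv[key]
        return
    cv[key] = {str(i): v for i, v in enumerate(cv[key])}

cv = {"education": {"0": {"school_name": "X"}}}
normalize_section(cv, "education")
print(cv["education"])  # {'0': {'school_name': 'X'}}
```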

+47 -21   deepdoc/parser/resume/entities/corporations.py

current_file_path = os.path.dirname(os.path.abspath(__file__))
- GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
+ GOODS = pd.read_csv(
+     os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0
+ ).fillna(0)
GOODS["cid"] = GOODS["cid"].astype(str)
GOODS = GOODS.set_index(["cid"])
- CORP_TKS = json.load(open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r"))
+ CORP_TKS = json.load(
+     open(os.path.join(current_file_path, "res/corp.tks.freq.json"), "r")
+ )
GOOD_CORP = json.load(open(os.path.join(current_file_path, "res/good_corp.json"), "r"))
CORP_TAG = json.load(open(os.path.join(current_file_path, "res/corp_tag.json"), "r"))

def baike(cid, default_v=0):
global GOODS
try:

def corpNorm(nm, add_region=True):
global CORP_TKS
- if not nm or type(nm)!=type(""):return ""
+ if not nm or isinstance(nm, str):
+     return ""
nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
nm = re.sub(r"&amp;", "&", nm)
nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm)
- nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
- nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
- if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
+ nm = re.sub(
+     r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE
+ )
+ nm = re.sub(
+     r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$",
+     "",
+     nm,
+     10000,
+     re.IGNORECASE,
+ )
+ if not nm or (len(nm) < 5 and not regions.isName(nm[0:2])):
+     return nm

tks = rag_tokenizer.tokenize(nm).split()
- reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
+ reg = [t for i, t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
nm = ""
for t in tks:
- if regions.isName(t) or t in CORP_TKS:continue
- if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):nm += " "
+ if regions.isName(t) or t in CORP_TKS:
+     continue
+ if re.match(r"[0-9a-zA-Z\\,.]+", t) and re.match(r".*[0-9a-zA-Z\,.]+$", nm):
+     nm += " "
nm += t

r = re.search(r"^([^a-z0-9 \(\)&]{2,})[a-z ]{4,}$", nm.strip())
- if r:nm = r.group(1)
+ if r:
+     nm = r.group(1)
r = re.search(r"^([a-z ]{3,})[^a-z0-9 \(\)&]{2,}$", nm.strip())
- if r:nm = r.group(1)
- return nm.strip() + (("" if not reg else "(%s)"%reg[0]) if add_region else "")
+ if r:
+     nm = r.group(1)
+ return nm.strip() + (("" if not reg else "(%s)" % reg[0]) if add_region else "")

def rmNoise(n):
n = re.sub(r"[,. &()()]+", "", n)
return n

GOOD_CORP = set([corpNorm(rmNoise(c), False) for c in GOOD_CORP])
- for c,v in CORP_TAG.items():
+ for c, v in CORP_TAG.items():
cc = corpNorm(rmNoise(c), False)
if not cc:
logging.debug(c)
- CORP_TAG = {corpNorm(rmNoise(c), False):v for c,v in CORP_TAG.items()}
+ CORP_TAG = {corpNorm(rmNoise(c), False): v for c, v in CORP_TAG.items()}

def is_good(nm):
global GOOD_CORP
- if nm.find("外派")>=0:return False
+ if nm.find("外派") >= 0:
+     return False
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in GOOD_CORP:
if re.match(r"[0-9a-zA-Z]+$", n):
- if n == nm: return True
- elif nm.find(n)>=0:return True
+ if n == nm:
+     return True
+ elif nm.find(n) >= 0:
+     return True
return False

def corp_tag(nm):
global CORP_TAG
nm = rmNoise(nm)
nm = corpNorm(nm, False)
for n in CORP_TAG.keys():
if re.match(r"[0-9a-zA-Z., ]+$", n):
- if n == nm: return CORP_TAG[n]
- elif nm.find(n)>=0:
-     if len(n)<3 and len(nm)/len(n)>=2:continue
+ if n == nm:
+     return CORP_TAG[n]
+ elif nm.find(n) >= 0:
+     if len(n) < 3 and len(nm) / len(n) >= 2:
+         continue
return CORP_TAG[n]
return []
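One detail worth noting in this hunk: the original guard in `corpNorm` was `if not nm or type(nm)!=type(""): return ""`, i.e. bail out when `nm` is not a string, while the new line reads `if not nm or isinstance(nm, str): return ""`. If the intent is unchanged, a faithful conversion negates the check; a minimal sketch of that reading (assumption, not the code as merged):

```python
def corp_norm_guard(nm):
    # Assumed intent: return early when nm is empty or not a string,
    # matching the original type(nm) != type("") test.
    if not nm or not isinstance(nm, str):
        return ""
    return nm.lower()

print(corp_norm_guard(None))    # ""
print(corp_norm_guard("ACME"))  # "acme"
```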


+20 -16   deepdoc/parser/resume/entities/degrees.py

# limitations under the License.
#

- TBL = {"94":"EMBA",
-     "6":"MBA",
-     "95":"MPA",
-     "92":"专升本",
-     "4":"专科",
-     "90":"中专",
-     "91":"中技",
-     "86":"初中",
-     "3":"博士",
-     "10":"博士后",
-     "1":"本科",
-     "2":"硕士",
-     "87":"职高",
-     "89":"高中"
+ TBL = {
+     "94": "EMBA",
+     "6": "MBA",
+     "95": "MPA",
+     "92": "专升本",
+     "4": "专科",
+     "90": "中专",
+     "91": "中技",
+     "86": "初中",
+     "3": "博士",
+     "10": "博士后",
+     "1": "本科",
+     "2": "硕士",
+     "87": "职高",
+     "89": "高中",
}

- TBL_ = {v:k for k,v in TBL.items()}
+ TBL_ = {v: k for k, v in TBL.items()}

def get_name(id):
return TBL.get(str(id), "")

def get_id(nm):
- if not nm:return ""
+ if not nm:
+     return ""
return TBL_.get(nm.upper().strip(), "")
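`TBL_` is simply the reverse mapping of `TBL`, so `get_id` can look a degree name back up to its code. A tiny usage sketch of the same pair of helpers on a subset of the table:

```python
TBL = {"1": "本科", "2": "硕士", "3": "博士"}   # subset of the table above
TBL_ = {v: k for k, v in TBL.items()}          # name -> id reverse lookup

def get_name(degree_id):
    return TBL.get(str(degree_id), "")

def get_id(name):
    if not name:
        return ""
    return TBL_.get(name.upper().strip(), "")

print(get_name(2))     # 硕士
print(get_id("博士"))  # 3
```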

+684 -679   deepdoc/parser/resume/entities/industries.py   (diff not shown: file too large)

+758 -748   deepdoc/parser/resume/entities/regions.py   (diff not shown: file too large)


+28 -17   deepdoc/parser/resume/entities/schools.py

import re
import copy
import pandas as pd

current_file_path = os.path.dirname(os.path.abspath(__file__))
- TBL = pd.read_csv(os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0).fillna("")
+ TBL = pd.read_csv(
+     os.path.join(current_file_path, "res/schools.csv"), sep="\t", header=0
+ ).fillna("")
TBL["name_en"] = TBL["name_en"].map(lambda x: x.lower().strip())
GOOD_SCH = json.load(open(os.path.join(current_file_path, "res/good_sch.json"), "r"))
GOOD_SCH = set([re.sub(r"[,. &()()]+", "", c) for c in GOOD_SCH])

def loadRank(fnm):
global TBL
TBL["rank"] = 1000000
- with open(fnm, "r", encoding='utf-8') as f:
+ with open(fnm, "r", encoding="utf-8") as f:
while True:
- l = f.readline()
- if not l:break
- l = l.strip("\n").split(",")
+ line = f.readline()
+ if not line:
+     break
+ line = line.strip("\n").split(",")
try:
- nm,rk = l[0].strip(),int(l[1])
- #assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
+ nm, rk = line[0].strip(), int(line[1])
+ # assert len(TBL[((TBL.name_cn == nm) | (TBL.name_en == nm))]),f"<{nm}>"
TBL.loc[((TBL.name_cn == nm) | (TBL.name_en == nm)), "rank"] = rk
except Exception:
pass

def split(txt):
tks = []
- for t in re.sub(r"[ \t]+", " ",txt).split():
-     if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
-         re.match(r"[a-zA-Z]", t) and tks:
+ for t in re.sub(r"[ \t]+", " ", txt).split():
+     if (
+         tks
+         and re.match(r".*[a-zA-Z]$", tks[-1])
+         and re.match(r"[a-zA-Z]", t)
+         and tks
+     ):
tks[-1] = tks[-1] + " " + t
- else:tks.append(t)
+ else:
+     tks.append(t)
return tks

def select(nm):
global TBL
- if not nm:return
- if isinstance(nm, list):nm = str(nm[0])
+ if not nm:
+     return
+ if isinstance(nm, list):
+     nm = str(nm[0])
nm = split(nm)[0]
nm = str(nm).lower().strip()
nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
nm = re.sub(r"(^the |[,.&()();;·]+|^(英国|美国|瑞士))", "", nm)
nm = re.sub(r"大学.*学院", "大学", nm)
tbl = copy.deepcopy(TBL)
- tbl["hit_alias"] = tbl["alias"].map(lambda x:nm in set(x.split("+")))
- res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | (tbl.hit_alias == True))]
- if res.empty:return
+ tbl["hit_alias"] = tbl["alias"].map(lambda x: nm in set(x.split("+")))
+ res = tbl[((tbl.name_cn == nm) | (tbl.name_en == nm) | tbl.hit_alias)]
+ if res.empty:
+     return

return json.loads(res.to_json(orient="records"))[0]

nm = re.sub(r"[((][^()()]+[))]", "", nm.lower())
nm = re.sub(r"[''`‘’“”,. &()();;]+", "", nm)
return nm in GOOD_SCH
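The change from `tbl.hit_alias == True` to plain `tbl.hit_alias` is Ruff's E712 fix; with a pandas boolean column the truthy column works the same way inside a mask. A standalone sketch with made-up data:

```python
import pandas as pd

tbl = pd.DataFrame({"name_cn": ["清华大学", "北京大学"], "hit_alias": [False, True]})

# E712: avoid "== True"; a boolean column can be used directly in the mask.
res = tbl[(tbl.name_cn == "清华大学") | tbl.hit_alias]
print(res)  # both rows match
```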


+202 -106   deepdoc/parser/resume/step_two.py

from contextlib import contextmanager

- class TimeoutException(Exception): pass
+ class TimeoutException(Exception):
+     pass

@contextmanager

def highest_degree(dg):
- if not dg: return ""
- if type(dg) == type(""): dg = [dg]
+ if not dg:
+     return ""
+ if isinstance(dg, str):
+     dg = [dg]
m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8}
return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0]

for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))):
e = {}
if n.get("end_time"):
- if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"]
+ if n["end_time"] > edu_end_dt:
+     edu_end_dt = n["end_time"]
try:
dt = n["end_time"]
- if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
+ if re.match(r"[0-9]{9,}", dt):
+     dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
ed_dt.append(str(y))
e["end_dt_kwd"] = str(y)
if n.get("start_time"):
try:
dt = n["start_time"]
- if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt)
+ if re.match(r"[0-9]{9,}", dt):
+     dt = turnTm2Dt(dt)
y, m, d = getYMD(dt)
st_dt.append(str(y))
e["start_dt_kwd"] = str(y)

r = schools.select(n.get("school_name", ""))
if r:
- if str(r.get("type", "")) == "1": fea.append("211")
- if str(r.get("type", "")) == "2": fea.append("211")
- if str(r.get("is_abroad", "")) == "1": fea.append("留学")
- if str(r.get("is_double_first", "")) == "1": fea.append("双一流")
- if str(r.get("is_985", "")) == "1": fea.append("985")
- if str(r.get("is_world_known", "")) == "1": fea.append("海外知名")
- if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"]
+ if str(r.get("type", "")) == "1":
+     fea.append("211")
+ if str(r.get("type", "")) == "2":
+     fea.append("211")
+ if str(r.get("is_abroad", "")) == "1":
+     fea.append("留学")
+ if str(r.get("is_double_first", "")) == "1":
+     fea.append("双一流")
+ if str(r.get("is_985", "")) == "1":
+     fea.append("985")
+ if str(r.get("is_world_known", "")) == "1":
+     fea.append("海外知名")
+ if r.get("rank") and cv["school_rank_int"] > r["rank"]:
+     cv["school_rank_int"] = r["rank"]

if n.get("school_name") and isinstance(n["school_name"], str):
sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"]))
maj.append(n["discipline_name"])
e["major_kwd"] = n["discipline_name"]

- if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1"
+ if not n.get("degree") and "985" in fea and not first_fea:
+     n["degree"] = "1"

if n.get("degree"):
d = degrees.get_name(n["degree"])
- if d: e["degree_kwd"] = d
- if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name", ""))): d = "专升本"
- if d: deg.append(d)
+ if d:
+     e["degree_kwd"] = d
+ if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", n.get("school_name",""))):
+     d = "专升本"
+ if d:
+     deg.append(d)

# for first degree
if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]:
fdeg = [d]
- if n.get("school_name"): fsch = [n["school_name"]]
- if n.get("discipline_name"): fmaj = [n["discipline_name"]]
+ if n.get("school_name"):
+     fsch = [n["school_name"]]
+ if n.get("discipline_name"):
+     fmaj = [n["discipline_name"]]
first_fea = copy.deepcopy(fea)

edu_nst.append(e)
else:
cv["sch_rank_kwd"].append("一般学校")

- if edu_nst: cv["edu_nst"] = edu_nst
- if fea: cv["edu_fea_kwd"] = list(set(fea))
- if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea))
- if maj: cv["major_kwd"] = maj
- if fsch: cv["first_school_name_kwd"] = fsch
- if fdeg: cv["first_degree_kwd"] = fdeg
- if fmaj: cv["first_major_kwd"] = fmaj
- if st_dt: cv["edu_start_kwd"] = st_dt
- if ed_dt: cv["edu_end_kwd"] = ed_dt
- if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt])
+ if edu_nst:
+     cv["edu_nst"] = edu_nst
+ if fea:
+     cv["edu_fea_kwd"] = list(set(fea))
+ if first_fea:
+     cv["edu_first_fea_kwd"] = list(set(first_fea))
+ if maj:
+     cv["major_kwd"] = maj
+ if fsch:
+     cv["first_school_name_kwd"] = fsch
+ if fdeg:
+     cv["first_degree_kwd"] = fdeg
+ if fmaj:
+     cv["first_major_kwd"] = fmaj
+ if st_dt:
+     cv["edu_start_kwd"] = st_dt
+ if ed_dt:
+     cv["edu_end_kwd"] = ed_dt
+ if ed_dt:
+     cv["edu_end_int"] = max([int(t) for t in ed_dt])
if deg:
if "本科" in deg and "专科" in deg:
deg.append("专升本")
cv["highest_degree_kwd"] = highest_degree(deg)
if edu_end_dt:
try:
- if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt)
- if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
+ if re.match(r"[0-9]{9,}", edu_end_dt):
+     edu_end_dt = turnTm2Dt(edu_end_dt)
+ if edu_end_dt.strip("\n") == "至今":
+     edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today()))
y, m, d = getYMD(edu_end_dt)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
or not cv.get("degree_kwd"):
for c in sch:
if schools.is_good(c):
- if "tag_kwd" not in cv: cv["tag_kwd"] = []
+ if "tag_kwd" not in cv:
+     cv["tag_kwd"] = []
cv["tag_kwd"].append("好学校")
cv["tag_kwd"].append("好学历")
break
any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \
or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \
or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]):
- if "tag_kwd" not in cv: cv["tag_kwd"] = []
- if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
-
- if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
- if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
- if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
- if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
+ if "tag_kwd" not in cv:
+     cv["tag_kwd"] = []
+ if "好学历" not in cv["tag_kwd"]:
+     cv["tag_kwd"].append("好学历")
+
+ if cv.get("major_kwd"):
+     cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
+ if cv.get("school_name_kwd"):
+     cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
+ if cv.get("first_school_name_kwd"):
+     cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
+ if cv.get("first_major_kwd"):
+     cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))

return cv

def forProj(cv):
- if not cv.get("project_obj"): return cv
+ if not cv.get("project_obj"):
+     return cv

pro_nms, desc = [], []
for i, n in enumerate(
- sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "",
+ sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if isinstance(x, dict) else "",
reverse=True)):
- if n.get("name"): pro_nms.append(n["name"])
- if n.get("describe"): desc.append(str(n["describe"]))
- if n.get("responsibilities"): desc.append(str(n["responsibilities"]))
- if n.get("achivement"): desc.append(str(n["achivement"]))
+ if n.get("name"):
+     pro_nms.append(n["name"])
+ if n.get("describe"):
+     desc.append(str(n["describe"]))
+ if n.get("responsibilities"):
+     desc.append(str(n["responsibilities"]))
+ if n.get("achivement"):
+     desc.append(str(n["achivement"]))

if pro_nms:
# cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
work_st_tm = ""
corp_tags = []
for i, n in enumerate(
- sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "",
+ sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if isinstance(x, dict) else "",
reverse=True)):
- if type(n) == type(""):
+ if isinstance(n, str):
try:
n = json_loads(n)
except Exception:
continue

- if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"]
+ if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm):
+     work_st_tm = n["start_time"]
for c in flds:
if not n.get(c) or str(n[c]) == '0':
fea[c].append("")
fea[c].append(rmHtmlTag(str(n[c]).lower()))

y, m, d = getYMD(n.get("start_time"))
- if not y or not m: continue
+ if not y or not m:
+     continue
st = "%s-%02d-%02d" % (y, int(m), int(d))
latest_job_tm = st

y, m, d = getYMD(n.get("end_time"))
- if (not y or not m) and i > 0: continue
- if not y or not m or int(y) > 2022: y, m, d = getYMD(str(n.get("updated_at", "")))
- if not y or not m: continue
+ if (not y or not m) and i > 0:
+     continue
+ if not y or not m or int(y) > 2022:
+     y, m, d = getYMD(str(n.get("updated_at", "")))
+ if not y or not m:
+     continue
ed = "%s-%02d-%02d" % (y, int(m), int(d))

try:

if n.get("scale"):
r = re.search(r"^([0-9]+)", str(n["scale"]))
- if r: scales.append(int(r.group(1)))
+ if r:
+     scales.append(int(r.group(1)))

if goodcorp:
- if "tag_kwd" not in cv: cv["tag_kwd"] = []
+ if "tag_kwd" not in cv:
+     cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司")
if goodcorp_:
- if "tag_kwd" not in cv: cv["tag_kwd"] = []
+ if "tag_kwd" not in cv:
+     cv["tag_kwd"] = []
cv["tag_kwd"].append("好公司(曾)")

if corp_tags:
- if "tag_kwd" not in cv: cv["tag_kwd"] = []
+ if "tag_kwd" not in cv:
+     cv["tag_kwd"] = []
cv["tag_kwd"].extend(corp_tags)
cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)]

- if latest_job_tm: cv["latest_job_dt"] = latest_job_tm
- if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
+ if latest_job_tm:
+     cv["latest_job_dt"] = latest_job_tm
+ if fea["corporation_id"]:
+     cv["corporation_id"] = fea["corporation_id"]

if fea["position_name"]:
cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))

- if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
+ if fea["subordinates_count"]:
+     fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
re.match(r"[^0-9]+$", str(i))]
- if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])
+ if fea["subordinates_count"]:
+     cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"])

- if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])]
- if not cv.get("corporation_id"): cv["corporation_id"] = []
+ if isinstance(cv.get("corporation_id"), int):
+     cv["corporation_id"] = [str(cv["corporation_id"])]
+ if not cv.get("corporation_id"):
+     cv["corporation_id"] = []
for i in cv.get("corporation_id", []):
cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0)

if work_st_tm:
try:
- if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm)
+ if re.match(r"[0-9]{9,}", work_st_tm):
+     work_st_tm = turnTm2Dt(work_st_tm)
y, m, d = getYMD(work_st_tm)
cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000))
except Exception as e:
cv["dua_flt"] = np.mean(duas)
cv["cur_dua_int"] = duas[0]
cv["job_num_int"] = len(duas)
- if scales: cv["scale_flt"] = np.max(scales)
+ if scales:
+     cv["scale_flt"] = np.max(scales)
return cv

def turnTm2Dt(b):
- if not b: return
+ if not b:
+     return
b = str(b).strip()
- if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
+ if re.match(r"[0-9]{10,}", b):
+     b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10])))
return b

def getYMD(b):
y, m, d = "", "", "01"
- if not b: return (y, m, d)
+ if not b:
+     return (y, m, d)
b = turnTm2Dt(b)
- if re.match(r"[0-9]{4}", b): y = int(b[:4])
+ if re.match(r"[0-9]{4}", b):
+     y = int(b[:4])
r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b)
- if r: m = r.group(1)
+ if r:
+     m = r.group(1)
r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b)
- if r: d = r.group(1)
- if not d or int(d) == 0 or int(d) > 31: d = "1"
- if not m or int(m) > 12 or int(m) < 1: m = "1"
+ if r:
+     d = r.group(1)
+ if not d or int(d) == 0 or int(d) > 31:
+     d = "1"
+ if not m or int(m) > 12 or int(m) < 1:
+     m = "1"
return (y, m, d)

cv["integerity_flt"] *= 0.9
return cv
y, m, d = getYMD(cv["birth"])
- if not m or not y: return cv
+ if not m or not y:
+     return cv
b = "%s-%02d-%02d" % (y, int(m), int(d))
cv["birth_dt"] = b
cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d))

def parse(cv):
for k in cv.keys():
- if cv[k] == '\\N': cv[k] = ''
+ if cv[k] == '\\N':
+     cv[k] = ''
# cv = cv.asDict()
tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names",
"expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name",

rmkeys = []
for k in cv.keys():
- if cv[k] is None: rmkeys.append(k)
- if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k)
- for k in rmkeys: del cv[k]
+ if cv[k] is None:
+     rmkeys.append(k)
+ if (isinstance(cv[k], list) or isinstance(cv[k], str)) and len(cv[k]) == 0:
+     rmkeys.append(k)
+ for k in rmkeys:
+     del cv[k]

integerity = 0.
flds_num = 0.
flds_num += len(flds)
for f in flds:
v = str(cv.get(f, ""))
- if len(v) > 0 and v != '0' and v != '[]': integerity += 1
+ if len(v) > 0 and v != '0' and v != '[]':
+     integerity += 1

hasValues(tks_fld)
hasValues(small_tks_fld)
(r"[ ()\(\)人/·0-9-]+", ""),
(r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]:
cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE)
- if len(cv["corporation_type"]) < 2: del cv["corporation_type"]
+ if len(cv["corporation_type"]) < 2:
+     del cv["corporation_type"]

if cv.get("political_status"):
for p, r in [
(r".*(无党派|公民).*", "群众"),
(r".*团员.*", "团员")]:
cv["political_status"] = re.sub(p, r, cv["political_status"])
- if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"]
+ if not re.search(r"[党团群]", cv["political_status"]):
+     del cv["political_status"]

- if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))
+ if cv.get("phone"):
+     cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"]))

keys = list(cv.keys())
for k in keys:
cv[k] = [a for _, a in cv[k].items()]
nms = []
for n in cv[k]:
- if type(n) != type({}) or "name" not in n or not n.get("name"): continue
+ if not isinstance(n, dict) or "name" not in n or not n.get("name"):
+     continue
n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower()
- if not n["name"]: continue
+ if not n["name"]:
+     continue
nms.append(n["name"])
if nms:
t = k[:-4]
# tokenize fields
if k in tks_fld:
cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
- if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
+ if k in small_tks_fld:
+     cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])

# keyword fields
- if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
+ if k in kwd_fld:
+     cv[f"{k}_kwd"] = [n.lower()
for n in re.split(r"[\t,,;;. ]",
re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k])
) if n]

- if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k]
+ if k in num_fld and cv.get(k):
+     cv[f"{k}_int"] = cv[k]

cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "")
# for name field
cv["name_py_pref0_tks"] = ""
cv["name_py_pref_tks"] = ""
for py in PY.get_pinyins(nm[:20], ''):
- for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i]
+ for i in range(2, len(py) + 1):
+     cv["name_py_pref_tks"] += " " + py[:i]
for py in PY.get_pinyins(nm[:20], ' '):
py = py.split()
- for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i])
+ for i in range(1, len(py) + 1):
+     cv["name_py_pref0_tks"] += " " + "".join(py[:i])

cv["name_kwd"] = name
cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S')
else:
y, m, d = getYMD(str(cv.get("updated_at", "")))
- if not y: y = "2012"
- if not m: m = "01"
- if not d: d = "01"
+ if not y:
+     y = "2012"
+ if not m:
+     m = "01"
+ if not d:
+     d = "01"
cv["updated_at_dt"] = "%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
# long text tokenize

- if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
+ if cv.get("responsibilities"):
+     cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))

# for yes or no field
fea = []
for f, y, n in is_fld:
- if f not in cv: continue
- if cv[f] == '是': fea.append(y)
- if cv[f] == '否': fea.append(n)
+ if f not in cv:
+     continue
+ if cv[f] == '是':
+     fea.append(y)
+ if cv[f] == '否':
+     fea.append(n)

- if fea: cv["tag_kwd"] = fea
+ if fea:
+     cv["tag_kwd"] = fea

cv = forEdu(cv)
cv = forProj(cv)

cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])]
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
- for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j
+ for j in cv.get("sch_rank_kwd", []):
+     cv["corp_proj_sch_deg_kwd"][i] += "+" + j
for i in range(len(cv["corp_proj_sch_deg_kwd"])):
- if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]
+ if cv.get("highest_degree_kwd"):
+     cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"]

try:
if not cv.get("work_exp_flt") and cv.get("work_start_time"):
cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y)
except Exception as e:
logging.exception("parse {} ==> {}".format(e, cv.get("work_start_time")))
- if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12.
+ if "work_exp_flt" not in cv and cv.get("work_experience", 0):
+     cv["work_exp_flt"] = int(cv["work_experience"]) / 12.

keys = list(cv.keys())
for k in keys:
- if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k]
+ if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k):
+     del cv[k]
for k in cv.keys():
- if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue
+ if not re.search("_(kwd|id)$", k) or not isinstance(cv[k], list):
+     continue
cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']]))
keys = [k for k in cv.keys() if re.search(r"_feas*$", k)]
for k in keys:
- if cv[k] <= 0: del cv[k]
+ if cv[k] <= 0:
+     del cv[k]

cv["tob_resume_id"] = str(cv["tob_resume_id"])
cv["id"] = cv["tob_resume_id"]
if isinstance(d, list):
d = [dealWithInt64(t) for t in d]

- if isinstance(d, np.integer): d = int(d)
+ if isinstance(d, np.integer):
+     d = int(d)
return d
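The `dealWithInt64` helper at the end exists because numpy integer scalars are not JSON serializable; the isinstance checks convert them, and nested lists of them, to plain Python ints. A minimal standalone sketch of the same idea:

```python
import numpy as np

def deal_with_int64(d):
    # Recursively convert numpy integer scalars to plain int so the
    # value can be passed to json.dumps without a TypeError.
    if isinstance(d, list):
        return [deal_with_int64(t) for t in d]
    if isinstance(d, np.integer):
        return int(d)
    return d

print(deal_with_int64([np.int64(3), 4]))  # [3, 4]
```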

+2 -1   deepdoc/parser/txt_parser.py

dels = [d for d in dels if d]
dels = "|".join(dels)
secs = re.split(r"(%s)" % dels, txt)
- for sec in secs: add_chunk(sec)
+ for sec in secs:
+     add_chunk(sec)

return [[c, ""] for c in cks]

+13 -4   deepdoc/vision/__init__.py

from .layout_recognizer import LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer

def init_in_out(args):
from PIL import Image
import os
try:
images.append(Image.open(fnm))
outputs.append(os.path.split(fnm)[-1])
- except Exception as e:
+ except Exception:
traceback.print_exc()

if os.path.isdir(args.inputs):
else:
images_and_outputs(args.inputs)

- for i in range(len(outputs)): outputs[i] = os.path.join(args.output_dir, outputs[i])
+ for i in range(len(outputs)):
+     outputs[i] = os.path.join(args.output_dir, outputs[i])

return images, outputs

+ __all__ = [
+     "OCR",
+     "Recognizer",
+     "LayoutRecognizer",
+     "TableStructureRecognizer",
+     "init_in_out",
+ ]
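Adding `__all__` is how this PR silences Ruff's F401 (imported but unused) for modules that exist only to re-export names. A minimal runnable sketch of the pattern, using a standard-library import as a stand-in for the submodules:

```python
# Re-export pattern: names imported only to be exposed again trigger F401
# unless they are listed in __all__ (or re-exported with "as").
from collections import OrderedDict  # stand-in for a submodule import

__all__ = [
    "OrderedDict",
]
```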

+2 -2   deepdoc/vision/layout_recognizer.py

get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
- except Exception as e:
+ except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)
"page_number": pn,
} for b in lts if float(b["score"]) >= 0.8 or b["type"] not in self.garbage_layouts]
lts = self.sort_Y_firstly(lts, np.mean(
- [l["bottom"] - l["top"] for l in lts]) / 2)
+ [lt["bottom"] - lt["top"] for lt in lts]) / 2)
lts = self.layouts_cleanup(bxs, lts)
page_layout.append(lts)



+3 -1   deepdoc/vision/ocr.py

from api.utils.file_utils import get_project_base_directory
from .operators import *
+ import math
import numpy as np
+ import cv2
import onnxruntime as ort

from .postprocess import build_post_process
"rag/res/deepdoc")
self.text_detector = TextDetector(model_dir)
self.text_recognizer = TextRecognizer(model_dir)
- except Exception as e:
+ except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
local_dir_use_symlinks=False)

+3 -3   deepdoc/vision/operators.py

"""
assert len(self.target_size) == 2
assert self.target_size[0] > 0 and self.target_size[1] > 0
- im_channel = im.shape[2]
+ _im_channel = im.shape[2]
im_scale_y, im_scale_x = self.generate_scale(im)
im = cv2.resize(
im,
im_scale_y: the resize ratio of Y
"""
origin_shape = im.shape[:2]
- im_c = im.shape[2]
+ _im_c = im.shape[2]
if self.keep_ratio:
im_size_min = np.min(origin_shape)
im_size_max = np.max(origin_shape)
return data

images_HR = data["image_hr"]
- label_strs = data["label"]
+ _label_strs = data["label"]
transform = ResizeNormalize((imgW, imgH))
images_HR = transform(images_HR)
data["img_hr"] = images_HR
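Prefixing `im_channel`, `im_c` and `label_strs` with an underscore is the conventional way to satisfy Ruff's F841 (local variable assigned but never used) while keeping the assignment visible for readers. A tiny sketch:

```python
import numpy as np

def resize_stats(im):
    # The channel count is computed but deliberately unused; the leading
    # underscore tells both readers and Ruff (F841) that this is intentional.
    _im_channel = im.shape[2]
    return im.shape[:2]

print(resize_stats(np.zeros((4, 4, 3))))  # (4, 4)
```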

+1 -1   deepdoc/vision/postprocess.py

outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
cv2.CHAIN_APPROX_SIMPLE)
if len(outs) == 3:
- img, contours, _ = outs[0], outs[1], outs[2]
+ _img, contours, _ = outs[0], outs[1], outs[2]
elif len(outs) == 2:
contours, _ = outs[0], outs[1]



+10 -4   deepdoc/vision/recognizer.py

import logging
import os
+ import math
+ import numpy as np
+ import cv2
from copy import deepcopy

import onnxruntime as ort
from huggingface_hub import snapshot_download

from api.utils.file_utils import get_project_base_directory
from .operators import *

class Recognizer(object):
def __init__(self, label_list, task_name, model_dir=None):
"""
return
min_dis, min_i = 1000000, None
for i,b in enumerate(boxes):
- if box.get("layoutno", "0") != b.get("layoutno", "0"): continue
+ if box.get("layoutno", "0") != b.get("layoutno", "0"):
+     continue
dis = min(abs(box["x0"] - b["x0"]), abs(box["x1"] - b["x1"]), abs(box["x0"]+box["x1"] - b["x1"] - b["x0"])/2)
if dis < min_dis:
min_i = i
scores = np.max(boxes[:, 4:], axis=1)
boxes = boxes[scores > thr, :]
scores = scores[scores > thr]
- if len(boxes) == 0: return []
+ if len(boxes) == 0:
+     return []

# Get the class with the highest confidence
class_ids = np.argmax(boxes[:, 4:], axis=1)
for i in range(len(image_list)):
if not isinstance(image_list[i], np.ndarray):
imgs.append(np.array(image_list[i]))
- else: imgs.append(image_list[i])
+ else:
+     imgs.append(image_list[i])

batch_loop_cnt = math.ceil(float(len(imgs)) / batch_size)
for i in range(batch_loop_cnt):

+4 -2   graphrag/community_reports_extractor.py

("findings", list),
("rating", float),
("rating_explanation", str),
- ]): continue
+ ]):
+     continue
response["weight"] = weight
response["entities"] = ents
except Exception as e:
res_str.append(self._get_text_output(response))
res_dict.append(response)
over += 1
- if callback: callback(msg=f"Communities: {over}/{total}, elapsed: {timer() - st}s, used tokens: {token_count}")
+ if callback:
+     callback(msg=f"Communities: {over}/{total}, elapsed: {timer() - st}s, used tokens: {token_count}")

return CommunityReportsResult(
structured_output=res_dict,

+1 -0   graphrag/entity_embedding.py

from typing import Any
import numpy as np
import networkx as nx
+ from dataclasses import dataclass
from graphrag.leiden import stable_largest_connected_component





+8 -4   graphrag/graph_extractor.py

source_doc_map[doc_index] = text
all_records[doc_index] = result
total_token_count += token_count
- if callback: callback(msg=f"{doc_index+1}/{total}, elapsed: {timer() - st}s, used tokens: {total_token_count}")
+ if callback:
+     callback(msg=f"{doc_index+1}/{total}, elapsed: {timer() - st}s, used tokens: {total_token_count}")
except Exception as e:
- if callback: callback(msg="Knowledge graph extraction error:{}".format(str(e)))
+ if callback:
+     callback(msg="Knowledge graph extraction error:{}".format(str(e)))
logging.exception("error extracting graph")
self._on_error(
e,
text = perform_variable_replacements(self._extraction_prompt, variables=variables)
gen_conf = {"temperature": 0.3}
response = self._llm.chat(text, [{"role": "user", "content": "Output:"}], gen_conf)
- if response.find("**ERROR**") >= 0: raise Exception(response)
+ if response.find("**ERROR**") >= 0:
+     raise Exception(response)
token_count = num_tokens_from_string(text + response)

results = response or ""
text = perform_variable_replacements(CONTINUE_PROMPT, history=history, variables=variables)
history.append({"role": "user", "content": text})
response = self._llm.chat("", history, gen_conf)
- if response.find("**ERROR**") >=0: raise Exception(response)
+ if response.find("**ERROR**") >=0:
+     raise Exception(response)
results += response or ""

# if this is the final glean, don't bother updating the continuation flag
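Both call sites now raise as soon as the model reply contains the `**ERROR**` sentinel instead of doing so in a compound statement. A reduced sketch of that guard; the `chat` function here is a stand-in for the project's LLM binding, not its real API:

```python
def chat(prompt):
    # Stand-in for the LLM call used in graph_extractor; returns an error marker here.
    return "**ERROR** rate limit exceeded"

try:
    response = chat("extract entities")
    if response.find("**ERROR**") >= 0:
        raise Exception(response)
except Exception as e:
    print(f"extraction aborted: {e}")
```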

+2 -1   graphrag/index.py

callback(0.75, "Extracting mind graph.")
mindmap = MindMapExtractor(llm_bdl)
mg = mindmap(_chunks).output
- if not len(mg.keys()): return chunks
+ if not len(mg.keys()):
+     return chunks

logging.debug(json.dumps(mg, ensure_ascii=False, indent=2))
chunks.append(

+8 -4   graphrag/leiden.py

) -> dict[int, dict[str, int]]:
"""Return Leiden root communities."""
results: dict[int, dict[str, int]] = {}
- if is_empty(graph): return results
+ if is_empty(graph):
+     return results
if use_lcc:
graph = stable_largest_connected_component(graph)

logging.debug(
"Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
)
- if not graph.nodes(): return {}
+ if not graph.nodes():
+     return {}

node_id_to_community_map = _compute_leiden_communities(
graph=graph,
result[community_id]["nodes"].append(node_id)
result[community_id]["weight"] += graph.nodes[node_id].get("rank", 0) * graph.nodes[node_id].get("weight", 1)
weights = [comm["weight"] for _, comm in result.items()]
- if not weights:continue
+ if not weights:
+     continue
max_weight = max(weights)
- for _, comm in result.items(): comm["weight"] /= max_weight
+ for _, comm in result.items():
+     comm["weight"] /= max_weight

return results_by_level



+5 -1   intergrations/chatgpt-on-wechat/plugins/__init__.py

- from .ragflow_chat import *
+ from .ragflow_chat import RAGFlowChat
+
+ __all__ = [
+     "RAGFlowChat"
+ ]
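Swapping `from .ragflow_chat import *` for an explicit import plus `__all__` addresses Ruff's F403/F401 pair: wildcard imports hide what a module actually exposes. The same shape on a standard-library module, just to illustrate:

```python
# Instead of: from json import *   (F403: unable to detect undefined names)
from json import dumps, loads

__all__ = [
    "dumps",
    "loads",
]

print(loads(dumps({"ok": True})))  # {'ok': True}
```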

+0 -1   intergrations/chatgpt-on-wechat/plugins/ragflow_chat.py

import requests
from bridge.context import ContextType  # Import Context, ContextType
from bridge.reply import Reply, ReplyType  # Import Reply, ReplyType
- from bridge import *
from plugins import Plugin, register  # Import Plugin and register
from plugins.event import Event, EventContext, EventAction  # Import event-related classes



+3 -3   rag/app/book.py

callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
- sections = [(l, "") for l in sections if l]
+ sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
- sections = [(l, "") for l in sections if l]
+ sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
- sections = [(l, "") for l in sections if l]
+ sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")

+1 -1   rag/app/email.py

_add_content(msg, msg.get_content_type())

sections = TxtParser.parser_txt("\n".join(text_txt)) + [
- (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+ (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
]

st = timer()

+2 -1   rag/app/knowledge_graph.py

chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
)
- for c in chunks: c["docnm_kwd"] = filename
+ for c in chunks:
+     c["docnm_kwd"] = filename

doc = {
"docnm_kwd": filename,

+ 11
- 8
rag/app/laws.py Ver fichero

continue continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
pn += 1 pn += 1
return [l for l in lines if l]
return [line for line in lines if line]


def __call__(self, filename, binary=None, from_page=0, to_page=100000): def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document( self.doc = Document(
if pn > to_page: if pn > to_page:
break break
question_level, p_text = docx_question_level(p, bull) question_level, p_text = docx_question_level(p, bull)
if not p_text.strip("\n"):continue
if not p_text.strip("\n"):
continue
lines.append((question_level, p_text)) lines.append((question_level, p_text))


for run in p.runs: for run in p.runs:
if lines[e][0] <= lines[s][0]: if lines[e][0] <= lines[s][0]:
break break
e += 1 e += 1
if e - s == 1 and visit[s]: continue
if e - s == 1 and visit[s]:
continue
sec = []
next_level = lines[s][0] + 1
while not sec and next_level < 22:
for i in range(s+1, e):
if lines[i][0] != next_level: continue
if lines[i][0] != next_level:
continue
sec.append(lines[i][1])
visit[i] = True
next_level += 1
sec.insert(0, lines[s][1])


sections.append("\n".join(sec))
return [l for l in sections if l]
return [s for s in sections if s]


def __str__(self) -> str:
return f'''
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")


elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")


elif re.search(r"\.doc$", filename, re.IGNORECASE):
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")


else:
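Most of the edits in this file split compound one-liners such as `if not p_text.strip("\n"):continue` onto two lines, which is what Ruff's E701 ("multiple statements on one line") asks for. A minimal, self-contained sketch of the pattern with made-up data:

```python
lines = [(1, "Chapter 1\n"), (2, "\n"), (2, "Some body text\n")]  # illustrative only

for question_level, p_text in lines:
    # Ruff E701: the statement after the colon moves to its own indented line.
    if not p_text.strip("\n"):
        continue
    print(question_level, p_text.strip())
```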

+ 4
- 3
rag/app/manual.py View file

sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3:
sections = [(t, l, [[0] * 5]) for t, l in sections]
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
# set pivot using the most frequent type of title,
# then merge between 2 pivot
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, l) for txt, l, poss in sections])
bull, [(txt, lvl) for txt, lvl, _ in sections])


assert len(sections) == len(levels)
sec_ids = []
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows: continue
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))



+ 3
- 2
rag/app/one.py View file

sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
for (img, rows), poss in tbls:
if not rows:continue
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
binary = BytesIO(binary)
doc_parsed = parser.from_buffer(binary)
sections = doc_parsed['content'].split('\n')
sections = [l for l in sections if l]
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")


else:

+ 17
- 13
rag/app/qa.py View file

tbl_bottom = tbls[tbl_index][1][0][4]
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
tbl_text = ''.join(tbls[tbl_index][0][1])
_tbl_text = ''.join(tbls[tbl_index][0][1])
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,




txt = get_text(filename, binary)
lines = txt.split("\n")
comma, tab = 0, 0
for l in lines:
if len(l.split(",")) == 2: comma += 1
if len(l.split("\t")) == 2: tab += 1
for line in lines:
if len(line.split(",")) == 2:
comma += 1
if len(line.split("\t")) == 2:
tab += 1
delimiter = "\t" if tab >= comma else ","


fails = []
while i < len(lines):
arr = lines[i].split(delimiter)
if len(arr) != 2:
if question: answer += "\n" + lines[i]
if question:
answer += "\n" + lines[i]
else:
fails.append(str(i+1))
elif len(arr) == 2:
if question and answer: res.append(beAdoc(deepcopy(doc), question, answer, eng))
if question and answer:
res.append(beAdoc(deepcopy(doc), question, answer, eng))
question, answer = arr
i += 1
if len(res) % 999 == 0:
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))


if question: res.append(beAdoc(deepcopy(doc), question, answer, eng))
if question:
res.append(beAdoc(deepcopy(doc), question, answer, eng))


callback(0.6, ("Extract Q&A: {}".format(len(res)) + (
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else "")))
callback(0.1, "Start to parse.")
txt = get_text(filename, binary)
lines = txt.split("\n")
last_question, last_answer = "", ""
_last_question, last_answer = "", ""
question_stack, level_stack = [], []
code_block = False
level_index = [-1] * 7
for index, l in enumerate(lines):
if l.strip().startswith('```'):
for index, line in enumerate(lines):
if line.strip().startswith('```'):
code_block = not code_block
question_level, question = 0, ''
if not code_block:
question_level, question = mdQuestionLevel(l)
question_level, question = mdQuestionLevel(line)


if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{l}'
last_answer = f'{last_answer}\n{line}'
else: # is a question
if last_answer.strip():
sum_question = '\n'.join(question_stack)
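`tbl_text` and `last_question` become `_tbl_text` and `_last_question` above: the values are still computed but never read afterwards, and the leading underscore is the usual way to tell Ruff (and readers) that this is deliberate rather than a forgotten variable. A small illustration with hypothetical names:

```python
def answer_of(row: str) -> str:
    # The question half is deliberately ignored here; the underscore prefix
    # documents that and keeps unused-variable warnings quiet.
    _question, answer = row.split("\t", 1)
    return answer.strip()

print(answer_of("What is RAG?\tRetrieval-augmented generation."))
```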

+ 5
- 4
rag/app/table.py View file

for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:continue
if not rows:
continue
headers = [cell.value for cell in rows[0]]
missed = set([i for i, h in enumerate(headers) if h is None])
headers = [
cell.value for i,
cell in enumerate(
rows[0]) if i not in missed]
if not headers:continue
if not headers:
continue
data = []
for i, r in enumerate(rows[1:]):
rn += 1


def column_data_type(arr):
arr = list(arr)
uni = len(set([a for a in arr if a is not None]))
counts = {"int": 0, "float": 0, "text": 0, "datetime": 0, "bool": 0}
trans = {t: f for f, t in
[(int, "int"), (float, "float"), (trans_datatime, "datetime"), (trans_bool, "bool"), (str, "text")]}
continue
if i >= to_page:
break
row = [l for l in line.split(kwargs.get("delimiter", "\t"))]
row = [field for field in line.split(kwargs.get("delimiter", "\t"))]
if len(row) != len(headers):
fails.append(str(i))
continue

+ 122
- 10
rag/llm/__init__.py View file

# See the License for the specific language governing permissions and
# limitations under the License.
#
from .embedding_model import *
from .chat_model import *
from .cv_model import *
from .rerank_model import *
from .sequence2txt_model import *
from .tts_model import *
from .embedding_model import (
OllamaEmbed,
LocalAIEmbed,
OpenAIEmbed,
AzureEmbed,
XinferenceEmbed,
QWenEmbed,
ZhipuEmbed,
FastEmbed,
YoudaoEmbed,
BaiChuanEmbed,
JinaEmbed,
DefaultEmbedding,
MistralEmbed,
BedrockEmbed,
GeminiEmbed,
NvidiaEmbed,
LmStudioEmbed,
OpenAI_APIEmbed,
CoHereEmbed,
TogetherAIEmbed,
PerfXCloudEmbed,
UpstageEmbed,
SILICONFLOWEmbed,
ReplicateEmbed,
BaiduYiyanEmbed,
VoyageEmbed,
HuggingFaceEmbed,
VolcEngineEmbed,
)
from .chat_model import (
GptTurbo,
AzureChat,
ZhipuChat,
QWenChat,
OllamaChat,
LocalAIChat,
XinferenceChat,
MoonshotChat,
DeepSeekChat,
VolcEngineChat,
BaiChuanChat,
MiniMaxChat,
MistralChat,
GeminiChat,
BedrockChat,
GroqChat,
OpenRouterChat,
StepFunChat,
NvidiaChat,
LmStudioChat,
OpenAI_APIChat,
CoHereChat,
LeptonAIChat,
TogetherAIChat,
PerfXCloudChat,
UpstageChat,
NovitaAIChat,
SILICONFLOWChat,
YiChat,
ReplicateChat,
HunyuanChat,
SparkChat,
BaiduYiyanChat,
AnthropicChat,
GoogleChat,
HuggingFaceChat,
)

from .cv_model import (
GptV4,
AzureGptV4,
OllamaCV,
XinferenceCV,
QWenCV,
Zhipu4V,
LocalCV,
GeminiCV,
OpenRouterCV,
LocalAICV,
NvidiaCV,
LmStudioCV,
StepFunCV,
OpenAI_APICV,
TogetherAICV,
YiCV,
HunyuanCV,
)
from .rerank_model import (
LocalAIRerank,
DefaultRerank,
JinaRerank,
YoudaoRerank,
XInferenceRerank,
NvidiaRerank,
LmStudioRerank,
OpenAI_APIRerank,
CoHereRerank,
TogetherAIRerank,
SILICONFLOWRerank,
BaiduYiyanRerank,
VoyageRerank,
QWenRerank,
)
from .sequence2txt_model import (
GPTSeq2txt,
QWenSeq2txt,
AzureSeq2txt,
XinferenceSeq2txt,
TencentCloudSeq2txt,
)
from .tts_model import (
FishAudioTTS,
QwenTTS,
OpenAITTS,
SparkTTS,
XinferenceTTS,
)


EmbeddingModel = {
"Ollama": OllamaEmbed,
"BaiduYiyan": BaiduYiyanEmbed,
"Voyage AI": VoyageEmbed,
"HuggingFace": HuggingFaceEmbed,
"VolcEngine":VolcEngineEmbed,
"VolcEngine": VolcEngineEmbed,
}


CvModel = {
"OpenAI-API-Compatible": OpenAI_APICV,
"TogetherAI": TogetherAICV,
"01.AI": YiCV,
"Tencent Hunyuan": HunyuanCV
"Tencent Hunyuan": HunyuanCV,
}


ChatModel = {
}


RerankModel = {
"LocalAI":LocalAIRerank,
"LocalAI": LocalAIRerank,
"BAAI": DefaultRerank, "BAAI": DefaultRerank,
"Jina": JinaRerank, "Jina": JinaRerank,
"Youdao": YoudaoRerank, "Youdao": YoudaoRerank,
"Tongyi-Qianwen": QWenSeq2txt, "Tongyi-Qianwen": QWenSeq2txt,
"Azure-OpenAI": AzureSeq2txt, "Azure-OpenAI": AzureSeq2txt,
"Xinference": XinferenceSeq2txt, "Xinference": XinferenceSeq2txt,
"Tencent Cloud": TencentCloudSeq2txt
"Tencent Cloud": TencentCloudSeq2txt,
}


TTSModel = {
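Replacing the `from .embedding_model import *` style imports with explicit name lists is what clears Ruff's F403/F405 warnings here, and it lets the registry dictionaries above be resolved by static analysis. The shape of the change, sketched with stdlib modules standing in for the real ones:

```python
# Before (F403/F405): the origin of each name is invisible to linters.
#   from collections import *
# After: every re-exported name is spelled out.
from collections import OrderedDict, defaultdict

# A registry like EmbeddingModel / ChatModel can then be checked by tools.
REGISTRY = {
    "ordered": OrderedDict,
    "default": defaultdict,
}
print(sorted(REGISTRY))
```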

+ 56
- 28
rag/llm/chat_model.py View file

stream=True,
**gen_conf)
for resp in response:
if not resp.choices: continue
if not resp.choices:
continue
if not resp.choices[0].delta.content:
resp.choices[0].delta.content = ""
ans += resp.choices[0].delta.content
)
elif isinstance(resp.usage, dict):
total_tokens = resp.usage.get("total_tokens", total_tokens)
else: total_tokens = resp.usage.total_tokens
else:
total_tokens = resp.usage.total_tokens


if resp.choices[0].finish_reason == "length":
if is_chinese(ans):


class GptTurbo(Base):
def __init__(self, key, model_name="gpt-3.5-turbo", base_url="https://api.openai.com/v1"):
if not base_url: base_url = "https://api.openai.com/v1"
if not base_url:
base_url = "https://api.openai.com/v1"
super().__init__(key, model_name, base_url)




class MoonshotChat(Base):
def __init__(self, key, model_name="moonshot-v1-8k", base_url="https://api.moonshot.cn/v1"):
if not base_url: base_url = "https://api.moonshot.cn/v1"
if not base_url:
base_url = "https://api.moonshot.cn/v1"
super().__init__(key, model_name, base_url)






class DeepSeekChat(Base):
def __init__(self, key, model_name="deepseek-chat", base_url="https://api.deepseek.com/v1"):
if not base_url: base_url = "https://api.deepseek.com/v1"
if not base_url:
base_url = "https://api.deepseek.com/v1"
super().__init__(key, model_name, base_url)




stream=True,
**self._format_params(gen_conf))
for resp in response:
if not resp.choices: continue
if not resp.choices:
continue
if not resp.choices[0].delta.content:
resp.choices[0].delta.content = ""
ans += resp.choices[0].delta.content
if system:
history.insert(0, {"role": "system", "content": system})
try:
if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]
if "presence_penalty" in gen_conf:
del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
del gen_conf["frequency_penalty"]
response = self.client.chat.completions.create(
model=self.model_name,
messages=history,
def chat_streamly(self, system, history, gen_conf):
if system:
history.insert(0, {"role": "system", "content": system})
if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]
if "presence_penalty" in gen_conf:
del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
del gen_conf["frequency_penalty"]
ans = ""
tk_count = 0
try:
**gen_conf
)
for resp in response:
if not resp.choices[0].delta.content: continue
if not resp.choices[0].delta.content:
continue
delta = resp.choices[0].delta.content
ans += delta
if resp.choices[0].finish_reason == "length":
else:
ans += LENGTH_NOTIFICATION_EN
tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop": tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop":
tk_count = resp.usage.total_tokens
yield ans
except Exception as e:
yield ans + "\n**ERROR**: " + str(e)
history.insert(0, {"role": "system", "content": system})
try:
options = {}
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf: options["top_p"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
if "temperature" in gen_conf:
options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf:
options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf:
options["top_p"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf:
options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
options["frequency_penalty"] = gen_conf["frequency_penalty"]
response = self.client.chat(
model=self.model_name,
messages=history,
if system:
history.insert(0, {"role": "system", "content": system})
options = {}
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf: options["top_p"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
if "temperature" in gen_conf:
options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf:
options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf:
options["top_p"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf:
options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
options["frequency_penalty"] = gen_conf["frequency_penalty"]
ans = ""
try:
response = self.client.chat(
messages=history,
**gen_conf)
for resp in response:
if not resp.choices or not resp.choices[0].delta.content: continue
if not resp.choices or not resp.choices[0].delta.content:
continue
ans += resp.choices[0].delta.content
total_tokens += 1
if resp.choices[0].finish_reason == "length":
assert model_name in model2version or model_name in version2model, f"The given model name is not supported yet. Support: {list(model2version.keys())}"
if model_name in model2version:
model_version = model2version[model_name]
else: model_version = model_name
else:
model_version = model_name
super().__init__(key, model_version, base_url)




self.system = system
if "max_tokens" not in gen_conf:
gen_conf["max_tokens"] = 4096
if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]
if "presence_penalty" in gen_conf:
del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
del gen_conf["frequency_penalty"]


ans = ""
try:
self.system = system
if "max_tokens" not in gen_conf:
gen_conf["max_tokens"] = 4096
if "presence_penalty" in gen_conf: del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: del gen_conf["frequency_penalty"]
if "presence_penalty" in gen_conf:
del gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
del gen_conf["frequency_penalty"]


ans = ""
total_tokens = 0
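Several backends strip `presence_penalty` and `frequency_penalty` out of `gen_conf` before calling the API, and the one-line `if ...: del ...` forms were expanded to satisfy E701. An equivalent alternative (not what this PR does, just a note) is `dict.pop` with a default, which is a no-op when the key is absent:

```python
gen_conf = {"temperature": 0.3, "presence_penalty": 0.1, "frequency_penalty": 0.2}

# Same effect as the expanded `if key in gen_conf: del gen_conf[key]` blocks.
for key in ("presence_penalty", "frequency_penalty"):
    gen_conf.pop(key, None)

print(gen_conf)  # {'temperature': 0.3}
```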

+ 43
- 24
rag/llm/cv_model.py View file

from io import BytesIO
import json
import requests
from transformers import GenerationConfig


from rag.nlp import is_english
from api.utils import get_uuid
stream=True
)
for resp in response:
if not resp.choices[0].delta.content: continue
if not resp.choices[0].delta.content:
continue
delta = resp.choices[0].delta.content
ans += delta
if resp.choices[0].finish_reason == "length":
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop": tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop":
tk_count = resp.usage.total_tokens
yield ans
except Exception as e:
yield ans + "\n**ERROR**: " + str(e)
buffered = BytesIO()
try:
image.save(buffered, format="JPEG")
except Exception as e:
except Exception:
image.save(buffered, format="PNG")
return base64.b64encode(buffered.getvalue()).decode("utf-8")




class GptV4(Base):
def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1"):
if not base_url: base_url="https://api.openai.com/v1"
if not base_url:
base_url="https://api.openai.com/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
self.lang = lang
prompt = self.prompt(b64)
for i in range(len(prompt)):
for c in prompt[i]["content"]:
if "text" in c: c["type"] = "text"
if "text" in c:
c["type"] = "text"


res = self.client.chat.completions.create(
model=self.model_name,
prompt = self.prompt(b64)
for i in range(len(prompt)):
for c in prompt[i]["content"]:
if "text" in c: c["type"] = "text"
if "text" in c:
c["type"] = "text"


res = self.client.chat.completions.create(
model=self.model_name,
stream=True
)
for resp in response:
if not resp.choices[0].delta.content: continue
if not resp.choices[0].delta.content:
continue
delta = resp.choices[0].delta.content
ans += delta
if resp.choices[0].finish_reason == "length":
ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
[ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop": tk_count = resp.usage.total_tokens
if resp.choices[0].finish_reason == "stop":
tk_count = resp.usage.total_tokens
yield ans
except Exception as e:
yield ans + "\n**ERROR**: " + str(e)
if his["role"] == "user":
his["images"] = [image]
options = {}
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
if "temperature" in gen_conf:
options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf:
options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf:
options["top_k"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf:
options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
options["frequency_penalty"] = gen_conf["frequency_penalty"]
response = self.client.chat(
model=self.model_name,
messages=history,
if his["role"] == "user":
his["images"] = [image]
options = {}
if "temperature" in gen_conf: options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf: options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf: options["top_k"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf: options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf: options["frequency_penalty"] = gen_conf["frequency_penalty"]
if "temperature" in gen_conf:
options["temperature"] = gen_conf["temperature"]
if "max_tokens" in gen_conf:
options["num_predict"] = gen_conf["max_tokens"]
if "top_p" in gen_conf:
options["top_k"] = gen_conf["top_p"]
if "presence_penalty" in gen_conf:
options["presence_penalty"] = gen_conf["presence_penalty"]
if "frequency_penalty" in gen_conf:
options["frequency_penalty"] = gen_conf["frequency_penalty"]
ans = ""
try:
response = self.client.chat(


class GeminiCV(Base):
def __init__(self, key, model_name="gemini-1.0-pro-vision-latest", lang="Chinese", **kwargs):
from google.generativeai import client, GenerativeModel, GenerationConfig
from google.generativeai import client, GenerativeModel
client.configure(api_key=key)
_client = client.get_default_generative_client()
self.model_name = model_name
if his["role"] == "user":
his["parts"] = [his["content"]]
his.pop("content")
history[-1]["parts"].append(f"data:image/jpeg;base64," + image)
history[-1]["parts"].append("data:image/jpeg;base64," + image)


response = self.model.generate_content(history, generation_config=GenerationConfig(
max_output_tokens=gen_conf.get("max_tokens", 1000), temperature=gen_conf.get("temperature", 0.3),
history[-1]["content"] = system + history[-1]["content"] + "user query: " + history[-1]["content"]


ans = ""
tk_count = 0
try:
for his in history:
if his["role"] == "assistant":
if his["role"] == "user":
his["parts"] = [his["content"]]
his.pop("content")
history[-1]["parts"].append(f"data:image/jpeg;base64," + image)
history[-1]["parts"].append("data:image/jpeg;base64," + image)


response = self.model.generate_content(history, generation_config=GenerationConfig(
max_output_tokens=gen_conf.get("max_tokens", 1000), temperature=gen_conf.get("temperature", 0.3),
top_p=gen_conf.get("top_p", 0.7)), stream=True)


for resp in response:
if not resp.text: continue
if not resp.text:
continue
ans += resp.text
yield ans
except Exception as e:


class StepFunCV(GptV4):
def __init__(self, key, model_name="step-1v-8k", lang="Chinese", base_url="https://api.stepfun.com/v1"):
if not base_url: base_url="https://api.stepfun.com/v1"
if not base_url:
base_url="https://api.stepfun.com/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name
self.lang = lang
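The switch from `f"data:image/jpeg;base64," + image` to a plain string literal addresses Ruff F541 (an f-string with no placeholders); either plain concatenation or a real f-string with the variable inside works. A quick check that both spellings are equivalent (the payload below is a dummy value):

```python
image = "iVBORw0KGgo="  # dummy base64 payload, for illustration only

data_url_concat = "data:image/jpeg;base64," + image     # plain literal, as in the PR
data_url_fstring = f"data:image/jpeg;base64,{image}"    # or interpolate directly

assert data_url_concat == data_url_fstring
print(data_url_concat)
```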

+ 2
- 4
rag/llm/sequence2txt_model.py View file

#
import requests
from openai.lib.azure import AzureOpenAI
from zhipuai import ZhipuAI
import io
from abc import ABC
from ollama import Client
from openai import OpenAI
import os
import json
from rag.utils import num_tokens_from_string
import base64


class GPTSeq2txt(Base):
def __init__(self, key, model_name="whisper-1", base_url="https://api.openai.com/v1"):
if not base_url: base_url = "https://api.openai.com/v1"
if not base_url:
base_url = "https://api.openai.com/v1"
self.client = OpenAI(api_key=key, base_url=base_url)
self.model_name = model_name



+ 2
- 2
rag/llm/tts_model.py View file



import _thread as thread
import base64
import datetime
import hashlib
import hmac
import json


class OpenAITTS(Base):
def __init__(self, key, model_name="tts-1", base_url="https://api.openai.com/v1"):
if not base_url: base_url = "https://api.openai.com/v1"
if not base_url:
base_url = "https://api.openai.com/v1"
self.api_key = key
self.model_name = model_name
self.base_url = base_url

+ 15
- 9
rag/nlp/__init__.py View file



def is_english(texts):
eng = 0
if not texts: return False
if not texts:
return False
for t in texts:
if re.match(r"[ `a-zA-Z.,':;/\"?<>!\(\)-]", t.strip()):
eng += 1
res = []
# wrap up as es documents
for ck in chunks:
if len(ck.strip()) == 0:continue
if len(ck.strip()) == 0:
continue
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
if pdf_parser:
res = []
# wrap up as es documents
for ck, image in zip(chunks, images):
if len(ck.strip()) == 0:continue
if len(ck.strip()) == 0:
continue
logging.debug("-- {}".format(ck))
d = copy.deepcopy(doc)
d["image"] = image
d = copy.deepcopy(doc)
tokenize(d, rows, eng)
d["content_with_weight"] = rows
if img: d["image"] = img
if poss: add_positions(d, poss)
if img:
d["image"] = img
if poss:
add_positions(d, poss)
res.append(d)
continue
de = "; " if eng else "; "
if re.search(r"(title|head)", layout) and not not_title(txt.split("@")[0]):
levels[i] = bullets_size
most_level = bullets_size+1
for l, c in sorted(Counter(levels).items(), key=lambda x:x[1]*-1):
if l <= bullets_size:
most_level = l
for level, c in sorted(Counter(levels).items(), key=lambda x:x[1]*-1):
if level <= bullets_size:
most_level = level
break
return most_level, levels


def add_chunk(t, pos):
nonlocal cks, tk_nums, delimiter
tnum = num_tokens_from_string(t)
if not pos: pos = ""
if not pos:
pos = ""
if tnum < 8:
pos = ""
# Ensure that the length of the merged chunk does not exceed chunk_token_num

+ 4
- 2
rag/nlp/query.py View file

keywords.append(tt)
twts = self.tw.weights([tt])
syns = self.syn.lookup(tt)
if syns and len(keywords) < 32: keywords.extend(syns)
if syns and len(keywords) < 32:
keywords.extend(syns)
logging.debug(json.dumps(twts, ensure_ascii=False))
tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1):


tk_syns = self.syn.lookup(tk)
tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
if len(keywords) < 32: keywords.extend([s for s in tk_syns if s])
if len(keywords) < 32:
keywords.extend([s for s in tk_syns if s])
tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]



+ 2
- 8
rag/nlp/rag_tokenizer.py View file

return HanziConv.toSimplified(line)


def dfs_(self, chars, s, preTks, tkslist):
MAX_L = 10
res = s
# if s > MAX_L or s>= len(chars):
if s >= len(chars):
return sorted(res, key=lambda x: x[1], reverse=True)


def merge_(self, tks):
patts = [
(r"[ ]+", " "),
(r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
]
# for p,s in patts: tks = re.sub(p, s, tks)

# if split chars is part of token
res = []
tks = re.sub(r"[ ]+", " ", tks).split()
same = 0
while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
same += 1
if same > 0: res.append(" ".join(tks[j: j + same]))
if same > 0:
res.append(" ".join(tks[j: j + same]))
_i = i + same
_j = j + same
j = _j + 1

+ 3
- 3
rag/nlp/term_weight.py View file

res = {}
f = open(fnm, "r")
while True:
l = f.readline()
if not l:
line = f.readline()
if not line:
break
arr = l.replace("\n", "").split("\t")
arr = line.replace("\n", "").split("\t")
if len(arr) < 2:
res[arr[0]] = 0
else:
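The `l` → `line` renames here and in most other files address Ruff E741, which flags `l`, `I` and `O` as ambiguous single-character names. A generic file-reading sketch in the renamed style (this uses a `with` block and `for line in f` for brevity; the original keeps its explicit `readline()` loop):

```python
def load_scores(fnm: str) -> dict:
    # E741: `l` is easy to confuse with the digit 1; `line` says what it is.
    res = {}
    with open(fnm, "r", encoding="utf-8") as f:
        for line in f:
            arr = line.rstrip("\n").split("\t")
            res[arr[0]] = int(arr[1]) if len(arr) > 1 else 0
    return res
```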

+ 4
- 2
rag/raptor.py View file

def __call__(self, chunks, random_state, callback=None):
layers = [(0, len(chunks))]
start, end = 0, len(chunks)
if len(chunks) <= 1: return
if len(chunks) <= 1:
return
chunks = [(s, a) for s, a in chunks if len(a) > 0]


def summarize(ck_idx, lock):
logging.debug(f"SUM: {cnt}")
embds, _ = self._embd_model.encode([cnt])
with lock:
if not len(embds[0]): return
if not len(embds[0]):
return
chunks.append((cnt, embds[0]))
except Exception as e:
logging.exception("summarize got exception")

+ 4
- 2
rag/svr/cache_file_svr.py View file



def main():
locations = collect()
if not locations:return
if not locations:
return
logging.info(f"TASKS: {len(locations)}")
for kb_id, loc in locations:
try:
if REDIS_CONN.is_alive():
try:
key = "{}/{}".format(kb_id, loc)
if REDIS_CONN.exist(key):continue
if REDIS_CONN.exist(key):
continue
file_bin = STORAGE_IMPL.get(kb_id, loc)
REDIS_CONN.transaction(key, file_bin, 12 * 60)
logging.info("CACHE: {}".format(loc))

+ 9
- 8
rag/svr/task_executor.py View file



from api.utils.log_utils import initRootLogger


CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1]
CONSUMER_NAME = "task_executor_" + CONSUMER_NO
LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
initRootLogger(CONSUMER_NAME, LOG_LEVELS)


from datetime import datetime
import json
import os
import hashlib
import copy
import re
import sys
import time
import threading
from functools import partial
from rag.utils.redis_conn import REDIS_CONN, Payload
from rag.utils.storage_factory import STORAGE_IMPL


CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1]
CONSUMER_NAME = "task_executor_" + CONSUMER_NO
LOG_LEVELS = os.environ.get("LOG_LEVELS", "")
initRootLogger(CONSUMER_NAME, LOG_LEVELS)

BATCH_SIZE = 64


FACTORY = {
"doc_id": task["doc_id"],
"kb_id": str(task["kb_id"])
}
if task["pagerank"]: doc["pagerank_fea"] = int(task["pagerank"])
if task["pagerank"]:
doc["pagerank_fea"] = int(task["pagerank"])
el = 0
for ck in cks:
d = copy.deepcopy(doc)
"docnm_kwd": row["name"],
"title_tks": rag_tokenizer.tokenize(row["name"])
}
if row["pagerank"]: doc["pagerank_fea"] = int(row["pagerank"])
if row["pagerank"]:
doc["pagerank_fea"] = int(row["pagerank"])
res = []
tk_count = 0
for content, vctr in chunks[original_length:]:
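The reshuffling of the logger-initialisation block relative to the import list in this file is driven by Ruff E402 (module-level import not at top of file), which fires when executable statements sit between imports. The layout E402 expects, in a generic form (names below are illustrative, not the file's real setup):

```python
import logging
import os
import sys

# Only after all imports: module-level statements such as logger setup.
CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1]
CONSUMER_NAME = "task_executor_" + CONSUMER_NO

logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
logging.getLogger(CONSUMER_NAME).info("starting %s", CONSUMER_NAME)
```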

+ 14
- 14
rag/utils/__init__.py View file

try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
line = f.readline()
if not line:
break
l = l.strip("\n")
if l == 'nan':
line = line.strip("\n")
if line == 'nan':
continue
if l > m:
m = l
except Exception as e:
if line > m:
m = line
except Exception:
pass
return m


try:
with open(fnm, "r") as f:
while True:
l = f.readline()
if not l:
line = f.readline()
if not line:
break
l = l.strip("\n")
if l == 'nan':
line = line.strip("\n")
if line == 'nan':
continue
if int(l) > m:
m = int(l)
except Exception as e:
if int(line) > m:
m = int(line)
except Exception:
pass
return m
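Both helpers now use a bare `except Exception:` because the bound exception object was never used (Ruff F841 flags the dangling `as e`). If the error were actually reported, keeping the binding would be correct; here the fallback simply swallows it. A compact illustration:

```python
def safe_int(value: str, default: int = 0) -> int:
    try:
        return int(value)
    except Exception:  # nothing reads the exception, so don't bind it (F841)
        return default

print(safe_int("42"), safe_int("not a number"))  # 42 0
```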



+ 1
- 1
rag/utils/azure_sas_conn.py View file

self.conn = None


def health(self):
bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
_bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
return self.conn.upload_blob(name=fnm, data=BytesIO(binary), length=len(binary))


def put(self, bucket, fnm, binary):

+ 1
- 1
rag/utils/azure_spn_conn.py View file

self.conn = None


def health(self):
bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
_bucket, fnm, binary = "txtxtxtxt1", "txtxtxtxt1", b"_t@@@1"
f = self.conn.create_file(fnm)
f.append_data(binary, offset=0, length=len(binary))
return f.flush_data(len(binary))

+ 2
- 1
rag/utils/es_conn.py View file

bqry.filter.append(
Q("bool", must_not=Q("range", available_int={"lt": 1})))
continue
if not v: continue
if not v:
continue
if isinstance(v, list):
bqry.filter.append(Q("terms", **{k: v}))
elif isinstance(v, str) or isinstance(v, int):

+ 13
- 6
sdk/python/ragflow_sdk/__init__.py View file

from beartype.claw import beartype_this_package
beartype_this_package() # <-- raise exceptions in your code

import importlib.metadata


__version__ = importlib.metadata.version("ragflow_sdk")

from .ragflow import RAGFlow
from .modules.dataset import DataSet
from .modules.chat import Chat
from .modules.session import Session
from .modules.document import Document
from .modules.chunk import Chunk
from .modules.agent import Agent
from .modules.agent import Agent

__version__ = importlib.metadata.version("ragflow_sdk")

__all__ = [
"RAGFlow",
"DataSet",
"Chat",
"Session",
"Document",
"Chunk",
"Agent"
]
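The new `__all__` list is the conventional way to tell Ruff (F401, "imported but unused") that the imports in a package `__init__.py` are deliberate re-exports, and it doubles as documentation of the public SDK surface. The pattern in isolation, with stdlib names standing in for the SDK modules:

```python
# __init__.py of a package that re-exports names from its submodules.
from json import dumps, loads  # stand-ins for the SDK's own modules

__all__ = [
    "dumps",
    "loads",
]
```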

+ 1
- 1
sdk/python/ragflow_sdk/modules/session.py View file

raise Exception(json_data["message"])
if line.startswith("data:"):
json_data = json.loads(line[5:])
if json_data["data"] != True:
if not json_data["data"]:
answer = json_data["data"]["answer"]
reference = json_data["data"]["reference"]
temp_dict = {
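`if json_data["data"] != True:` becomes `if not json_data["data"]:` here, in line with Ruff E712 (don't compare to `True`/`False` with `==`/`!=`); the test-suite change from `!= None` to `is not None` is the matching E711 fix. Note the truthiness form is only equivalent when the field is a plain boolean; an identity check (`is not True`) is the more literal rewrite. A quick look at the difference:

```python
for data in (True, False, {"answer": "hi"}, None):
    # `not data` treats None, False and empty containers alike as "no data";
    # `data is not True` only checks for the boolean True itself.
    print(repr(data), not data, data is not True)
```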

+ 0
- 2
sdk/python/test/conftest.py View file

import string
import random
import os
import pytest
import requests

+ 0
- 1
sdk/python/test/test_frontend_api/common.py View file

def upload_file(auth, dataset_id, path):
authorization = {"Authorization": auth}
url = f"{HOST_ADDRESS}/v1/document/upload"
base_name = os.path.basename(path)
json_req = {
"kb_id": dataset_id,
}

+ 1
- 1
sdk/python/test/test_frontend_api/get_email.py View file

def test_get_email(get_email):
print(f"\nEmail account:",flush=True)
print("\nEmail account:",flush=True)
print(f"{get_email}\n",flush=True)

+ 1
- 5
sdk/python/test/test_frontend_api/test_chunk.py View file

# See the License for the specific language governing permissions and
# limitations under the License.


from common import HOST_ADDRESS, create_dataset, list_dataset, rm_dataset, update_dataset, upload_file, DATASET_NAME_LIMIT
from common import create_dataset, list_dataset, rm_dataset, upload_file
from common import list_document, get_docs_info, parse_docs
from time import sleep
from timeit import default_timer as timer
import re
import pytest
import random
import string




def test_parse_txt_document(get_auth):

+ 2
- 5
sdk/python/test/test_frontend_api/test_dataset.py View file

from common import HOST_ADDRESS, create_dataset, list_dataset, rm_dataset, update_dataset, DATASET_NAME_LIMIT
from common import create_dataset, list_dataset, rm_dataset, update_dataset, DATASET_NAME_LIMIT
import re
import pytest
import random
import string




def test_dataset_1k_dataset(get_auth):
# create dataset
authorization = {"Authorization": get_auth}
url = f"{HOST_ADDRESS}/v1/kb/create"
for i in range(1000):
res = create_dataset(get_auth, f"test_create_dataset_{i}")
assert res.get("code") == 0, f"{res.get('message')}"
dataset_id = item.get("id")
dataset_list.append(dataset_id)
match = re.match(pattern, dataset_name)
assert match != None
assert match is not None


for dataset_id in dataset_list:
res = rm_dataset(get_auth, dataset_id)

+ 1
- 1
sdk/python/test/test_sdk_api/get_email.py View file

def test_get_email(get_email):
print(f"\nEmail account:",flush=True)
print("\nEmail account:",flush=True)
print(f"{get_email}\n",flush=True)

+ 1
- 1
sdk/python/test/test_sdk_api/t_agent.py View file

from ragflow_sdk import RAGFlow,Agent
from ragflow_sdk import RAGFlow
from common import HOST_ADDRESS
import pytest


