### What problem does this PR solve?

#3361

### Type of change

- [x] Performance Improvement
```diff
 outs = []
 for q in self._param.query:
     if q["component_id"]:
+        if q["component_id"].split("@")[0].lower().find("begin") > 0:
+            cpn_id, key = q["component_id"].split("@")
+            for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
+                if p["key"] == key:
+                    outs.append(pd.DataFrame([{"content": p["value"]}]))
+                    self._param.inputs.append({"component_id": q["component_id"],
+                                               "content": p["value"]})
+                    break
+            else:
+                assert False, f"Can't find parameter '{key}' for {cpn_id}"
+            continue
+
         outs.append(self._canvas.get_component(q["component_id"])["obj"].output(allow_partial=False)[1])
         self._param.inputs.append({"component_id": q["component_id"],
-                                   "content": "\n".join([str(d["content"]) for d in outs[-1].to_dict('records')])})
+                                   "content": "\n".join(
+                                       [str(d["content"]) for d in outs[-1].to_dict('records')])})
     elif q["value"]:
         self._param.inputs.append({"component_id": None, "content": q["value"]})
         outs.append(pd.DataFrame([{"content": q["value"]}]))
```
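The new branch lets a query input reference a Begin-component parameter as `<component_id>@<key>` instead of another component's output. A minimal sketch of that resolution, with a plain dict standing in for `self._canvas` (hypothetical data; the real code goes through `get_component(cpn_id)["obj"]._param.query`):

```python
# Hypothetical stand-in for self._canvas: maps component id -> declared params.
canvas = {
    "begin": {"query": [{"key": "user_id", "value": "u42"}]},
}

def resolve_begin_ref(component_id: str) -> str:
    """Resolve a 'begin@<key>' reference to the Begin component's value."""
    cpn_id, key = component_id.split("@")
    for p in canvas[cpn_id]["query"]:
        if p["key"] == key:
            return p["value"]
    # Mirrors the for/else + assert in the hunk: reached only without a break.
    raise AssertionError(f"Can't find parameter '{key}' for {cpn_id}")

print(resolve_begin_ref("begin@user_id"))  # -> u42
```

Note the `for ... else` in the hunk: the `else` body runs only when the loop finishes without `break`, so the assertion fires exactly when the key is missing.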
```diff
 retrieval_res = []
 self._param.inputs = []
 for para in self._param.parameters:
+    if para["component_id"].split("@")[0].lower().find("begin") > 0:
+        cpn_id, key = para["component_id"].split("@")
+        for p in self._canvas.get_component(cpn_id)["obj"]._param.query:
+            if p["key"] == key:
+                kwargs[para["key"]] = p["value"]
+                self._param.inputs.append(
+                    {"component_id": para["component_id"], "content": kwargs[para["key"]]})
+                break
+        else:
+            assert False, f"Can't find parameter '{key}' for {cpn_id}"
+        continue
+
     cpn = self._canvas.get_component(para["component_id"])["obj"]
     if cpn.component_name.lower() == "answer":
         kwargs[para["key"]] = self._canvas.get_history(1)[0]["content"]
```
```diff
 from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService, queue_tasks
 from api.db.services.user_service import UserTenantService
+from deepdoc.parser.html_parser import RAGFlowHtmlParser
 from rag.nlp import search
 from api.db.services import duplicate_name
 from api.db.services.knowledgebase_service import KnowledgebaseService
```
```diff
     doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
     return get_json_result(data=doc_ids)
+
+
+@manager.route('/parse', methods=['POST'])
+@login_required
+def parse():
+    url = request.json.get("url") if request.json else ""
+    if url:
+        if not is_valid_url(url):
+            return get_json_result(
+                data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
+        from selenium.webdriver import Chrome, ChromeOptions
+        options = ChromeOptions()
+        options.add_argument('--headless')
+        options.add_argument('--disable-gpu')
+        options.add_argument('--no-sandbox')
+        options.add_argument('--disable-dev-shm-usage')
+        driver = Chrome(options=options)
+        driver.get(url)
+        sections = RAGFlowHtmlParser()(driver.page_source)
+        return get_json_result(data="\n".join(sections))
+
+    if 'file' not in request.files:
+        return get_json_result(
+            data=False, message='No file part!', code=RetCode.ARGUMENT_ERROR)
+
+    file_objs = request.files.getlist('file')
+    txt = FileService.parse_docs(file_objs, current_user.id)
+    return get_json_result(data=txt)
```
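Example client calls against the new route (a sketch: the `/v1/document` prefix, port, and authentication are deployment-specific assumptions, and `report.pdf` is a placeholder file):

```python
import requests

BASE = "http://localhost:9380/v1/document"  # hypothetical prefix/port
session = requests.Session()
# ... log in first; the endpoint is guarded by @login_required ...

# 1) Parse a URL: headless Chrome renders the page, then RAGFlowHtmlParser
#    extracts the text sections from the resulting HTML.
r = session.post(f"{BASE}/parse", json={"url": "https://example.com"})
print(r.json()["data"])

# 2) Parse uploaded files instead of a URL.
with open("report.pdf", "rb") as f:
    r = session.post(f"{BASE}/parse", files={"file": f})
print(r.json()["data"])
```

One thing worth double-checking: the Selenium `driver` is never quit, so each URL parse leaks a Chrome process unless a `driver.quit()` is added (e.g. in a `try/finally`).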
```diff
 if not self.isChinese(txt):
     txt = FulltextQueryer.rmWWW(txt)
     tks = rag_tokenizer.tokenize(txt).split(" ")
-    tks_w = self.tw.weights(tks)
+    keywords = [t for t in tks if t]
+    tks_w = self.tw.weights(tks, preprocess=False)
     tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
     tks_w = [(re.sub(r"^[a-z0-9]$", "", tk), w) for tk, w in tks_w if tk]
     tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
-    q = ["{}^{:.4f}".format(tk, w) for tk, w in tks_w if tk]
+    syns = []
+    for tk, w in tks_w:
+        syn = self.syn.lookup(tk)
+        syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
+        keywords.extend(syn)
+        syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
+        syns.append(" ".join(syn))
+
+    q = ["({}^{:.4f}".format(tk, w) + " {})".format(syn) for (tk, w), syn in zip(tks_w, syns)]
     for i in range(1, len(tks_w)):
         q.append(
             '"%s %s"^%.4f'
@@ ... @@
     query = " ".join(q)
     return MatchTextExpr(
         self.query_fields, query, 100
-    ), tks
+    ), keywords

 def need_fine_grained_tokenize(tk):
     if len(tk) < 3:
```
```diff
 import os
 import time
 import re
+from nltk.corpus import wordnet
 from api.utils.file_utils import get_project_base_directory
 from api.utils.log_utils import logger
@@ ... @@
             logger.error("Fail to load synonym!" + str(e))

     def lookup(self, tk):
+        if re.match(r"[a-z]+$", tk):
+            res = list(set([re.sub("_", " ", syn.name().split(".")[0]) for syn in wordnet.synsets(tk)]) - set([tk]))
+            return [t for t in res if t]
         self.lookup_num += 1
         self.load()
         res = self.dictionary.get(re.sub(r"[ \t]+", " ", tk.lower()), [])
```
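A standalone check of the WordNet path added above; it only triggers for purely lowercase ASCII tokens (the `re.match(r"[a-z]+$", tk)` gate) and needs the corpus downloaded once. The printed synonyms are indicative, not exact:

```python
import re
from nltk.corpus import wordnet  # requires: python -m nltk.downloader wordnet

tk = "quick"
if re.match(r"[a-z]+$", tk):
    # Collect the headword of every synset containing `tk`, minus `tk` itself.
    res = set(re.sub("_", " ", syn.name().split(".")[0])
              for syn in wordnet.synsets(tk)) - {tk}
    print([t for t in res if t])  # e.g. ['agile', 'flying', 'immediate', 'fast']
```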