### What problem does this PR solve?

### Type of change

- [x] Refactoring

import random
from abc import ABC
from functools import partial
+from typing import Tuple, Union

import pandas as pd

    def set_exception(self, e):
        self.exception = e
+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial:
+            return super().output()
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+        return self._param.output_var_name, pd.DataFrame([])
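When `allow_partial` is false, `output` falls back to the most recent user turn in the canvas history and wraps it in a one-row DataFrame. A minimal sketch of that fallback, with a plain list of `(role, content)` pairs standing in for `self._canvas.history` and a hypothetical helper name:

```python
import pandas as pd

def last_user_turn(history) -> pd.DataFrame:
    # history: list of (role, content) pairs, oldest first, mirroring the
    # canvas history that the method above walks in reverse.
    for role, content in reversed(history):
        if role == "user":
            return pd.DataFrame([{"content": content}])
    return pd.DataFrame([])

turns = [("user", "summarize this doc"), ("assistant", "Sure."), ("user", "shorter, please")]
print(last_user_turn(turns))  # one-row frame containing "shorter, please"
```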
        canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
        canvas.history.append(("assistant", final_ans["content"]))
+       if not canvas.path[-1]:
+           canvas.path.pop(-1)
        if final_ans.get("reference"):
            canvas.reference.append(final_ans["reference"])
        cvs.dsl = json.loads(str(canvas))
        UserCanvasService.update_by_id(req["id"], cvs.to_dict())
    except Exception as e:
        cvs.dsl = json.loads(str(canvas))
+       if not canvas.path[-1]:
+           canvas.path.pop(-1)
        UserCanvasService.update_by_id(req["id"], cvs.to_dict())
        traceback.print_exc()
        yield "data:" + json.dumps({"code": 500, "message": str(e),
            }
            if not DialogService.save(**dia):
                return get_data_error_result(message="Fail to new a dialog!")
-           return get_json_result(data=dia)
+           e, dia = DialogService.get_by_id(dia["id"])
+           if not e:
+               return get_data_error_result(message="Fail to new a dialog!")
+           return get_json_result(data=dia.to_json())
        else:
            del req["dialog_id"]
            if "kb_names" in req:
            if not e:
                return get_data_error_result(message="Fail to update a dialog!")
            dia = dia.to_dict()
+           dia.update(req)
            dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
            return get_json_result(data=dia)
    except Exception as e:

            return get_data_error_result(
                message="Database error (Document removal)!")
        f2d = File2DocumentService.get_by_document_id(doc.id)
-       FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+       if f2d:
+           FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
        File2DocumentService.delete_by_document_id(doc.id)
        FileService.filter_delete(
            [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])

    if len(e.args) > 1:
        return get_json_result(
            code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+   if repr(e).find("index_not_found_exception") >= 0:
+       return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                              message="No chunk found, please upload file and parse it.")
    return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
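This handler also lets callers return structured data alongside an error by packing it into the exception's arguments, since `e.args[1]` becomes the response's `data` field. A small illustration under that assumption (the payload shape is hypothetical):

```python
# Illustrative only: packing partial results into an exception's args so a
# handler like the one above can surface them as "data".
def risky_op():
    raise Exception("downstream service timed out", {"answered": 3, "failed": 1})

try:
    risky_op()
except Exception as e:
    if len(e.args) > 1:
        print(repr(e.args[0]), e.args[1])  # 'downstream service timed out' {'answered': 3, 'failed': 1}
    else:
        print(repr(e))
```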
-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- - entity_name: Name of the entity, capitalized
+ - entity_name: Name of the entity, capitalized, in language of 'Text'
  - entity_type: One of the following types: [{entity_types}]
- - entity_description: Comprehensive description of the entity's attributes and activities
+ - entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
  - source_entity: name of the source entity, as identified in step 1
  - target_entity: name of the target entity, as identified in step 1
- - relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+ - relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
  - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)
- 3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+ 3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
4. When finished, output {completion_delimiter}
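The records produced by this prompt are plain delimited strings, so downstream code has to split on the delimiters to recover entities and relationships. A minimal parsing sketch, assuming typical GraphRAG-style delimiter values rather than this repository's configured ones:

```python
# Delimiter values below are assumptions (common GraphRAG-style defaults),
# not read from the repository's configuration.
TUPLE_DELIM = "<|>"
RECORD_DELIM = "##"
COMPLETION_DELIM = "<|COMPLETE|>"

def parse_extraction(output: str):
    entities, relationships = [], []
    for record in output.replace(COMPLETION_DELIM, "").split(RECORD_DELIM):
        record = record.strip().strip("()")
        if not record:
            continue
        fields = [f.strip().strip('"') for f in record.split(TUPLE_DELIM)]
        if fields[0] == "entity" and len(fields) == 4:
            entities.append({"name": fields[1], "type": fields[2], "description": fields[3]})
        elif fields[0] == "relationship" and len(fields) == 5:
            relationships.append({"source": fields[1], "target": fields[2],
                                  "description": fields[3], "strength": fields[4]})
    return entities, relationships

sample = ('("entity"<|>MARIE CURIE<|>person<|>Physicist who studied radioactivity)##'
          '("relationship"<|>MARIE CURIE<|>PIERRE CURIE<|>They collaborated on radioactivity research<|>9)<|COMPLETE|>')
print(parse_extraction(sample))
```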
    return bin

-def set_llm_cache(llmnm, txt, v: str, history, genconf):
+def set_llm_cache(llmnm, txt, v, history, genconf):
    hasher = xxhash.xxh64()
    hasher.update(str(llmnm).encode("utf-8"))
    hasher.update(str(txt).encode("utf-8"))
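The cache key is an xxhash64 digest of the request's components, so identical (model, prompt, history, generation config) tuples hit the same entry. A small sketch of that idea; hashing `history` and `genconf` as well is an assumption based on the signature, and the helper name is illustrative:

```python
import xxhash

def llm_cache_key(llmnm, txt, history, genconf) -> str:
    # Fold every request component into one xxhash64 digest, mirroring the
    # hashing pattern in the diff above. Hashing history/genconf is assumed.
    hasher = xxhash.xxh64()
    for part in (llmnm, txt, history, genconf):
        hasher.update(str(part).encode("utf-8"))
    return hasher.hexdigest()

# Identical inputs produce identical keys.
assert llm_cache_key("gpt", "hello", [], {}) == llm_cache_key("gpt", "hello", [], {})
```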
| if re.search(r"\.docx$", filename, re.IGNORECASE): | if re.search(r"\.docx$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") | ||||
| for txt in Docx()(filename, binary): | |||||
| sections.append(txt) | |||||
| callback(0.8, "Finish parsing.") | |||||
| chunks = sections | |||||
| return tokenize_chunks(chunks, doc, eng, pdf_parser) | |||||
| chunks = Docx()(filename, binary) | |||||
| callback(0.7, "Finish parsing.") | |||||
| return tokenize_chunks(chunks, doc, eng, None) | |||||
| elif re.search(r"\.pdf$", filename, re.IGNORECASE): | elif re.search(r"\.pdf$", filename, re.IGNORECASE): | ||||
| pdf_parser = Pdf() if kwargs.get( | pdf_parser = Pdf() if kwargs.get( | 
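The chunker dispatches on the filename suffix, one branch per supported format, reporting progress through a callback along the way. A stripped-down sketch of that dispatch pattern (the table entries are placeholders, not the repository's actual parser classes):

```python
import re

# Illustrative dispatch table keyed by filename suffix, mirroring the
# if/elif chain above.
PARSERS = [(r"\.docx$", "docx"), (r"\.pdf$", "pdf"), (r"\.(txt|md)$", "plain text")]

def pick_parser(filename: str) -> str:
    for pattern, kind in PARSERS:
        if re.search(pattern, filename, re.IGNORECASE):
            return kind
    raise NotImplementedError(f"file type not supported yet: {filename}")

print(pick_parser("contract.PDF"))  # -> pdf
```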
        sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
        # set pivot using the most frequent type of title,
        # then merge between 2 pivot
-       if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+       if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
            most_level = max(0, max_lvl - 1)
            levels = []

        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
        return res
-   if re.search(r"\.docx$", filename, re.IGNORECASE):
+   elif re.search(r"\.docx$", filename, re.IGNORECASE):
        docx_parser = Docx()
        ti_list, tbls = docx_parser(filename, binary,
                                    from_page=0, to_page=10000, callback=callback)
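The pivot logic earlier in this hunk picks a heading level from the PDF outline and folds everything between two pivots into one chunk; the lowered threshold simply lets outline-based merging kick in for documents with sparser outlines. A simplified sketch of the pivot-merge idea, assuming sections already tagged with heading levels (the real code also matches section titles against the outline):

```python
def merge_by_pivot(sections, pivot_level):
    # sections: list of (text, level) pairs, where a smaller level means a
    # more prominent heading. Anything deeper than pivot_level is folded into
    # the chunk opened by the nearest preceding pivot. Illustrative only.
    chunks = []
    for text, level in sections:
        if level <= pivot_level or not chunks:
            chunks.append(text)          # a pivot starts a new chunk
        else:
            chunks[-1] += "\n" + text    # deeper levels join the current chunk
    return chunks

sections = [("1 Overview", 0), ("intro text", 2), ("1.1 Scope", 1), ("details", 2)]
print(merge_by_pivot(sections, pivot_level=1))
# -> ['1 Overview\nintro text', '1.1 Scope\ndetails']
```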
| "datetime": "_dt", | "datetime": "_dt", | ||||
| "bool": "_kwd"} | "bool": "_kwd"} | ||||
| for df in dfs: | for df in dfs: | ||||
| for n in ["id", "index", "idx"]: | |||||
| for n in ["id", "_id", "index", "idx"]: | |||||
| if n in df.columns: | if n in df.columns: | ||||
| del df[n] | del df[n] | ||||
| clmns = df.columns.values | clmns = df.columns.values |
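Index-like columns are dropped first, then each remaining column is stored under a name suffixed by its inferred type, as the `_dt`/`_kwd` mapping above suggests. A hedged sketch of that convention; only the datetime and bool suffixes appear in the diff, the rest are assumptions:

```python
import pandas as pd

# Only "_dt" (datetime) and "_kwd" (bool) come from the diff above; the
# remaining suffixes and the helper name are assumptions for illustration.
SUFFIX = {"datetime": "_dt", "bool": "_kwd", "int": "_int", "float": "_flt", "text": "_tks"}

def index_fields(df: pd.DataFrame) -> dict:
    for n in ["id", "_id", "index", "idx"]:
        if n in df.columns:
            del df[n]
    fields = {}
    for col in df.columns:
        dtype = str(df[col].dtype)
        if "datetime" in dtype:
            kind = "datetime"
        elif dtype == "bool":
            kind = "bool"
        elif dtype.startswith("int"):
            kind = "int"
        elif dtype.startswith("float"):
            kind = "float"
        else:
            kind = "text"
        fields[col + SUFFIX[kind]] = df[col].tolist()
    return fields

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "ok": [True, False]})
print(index_fields(df))  # {'name_tks': ['a', 'b'], 'ok_kwd': [True, False]}
```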