
Code refactor. (#4291)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
tags/nightly · Kevin Hu · 10 months ago · commit 8fb18f37f6

agent/component/answer.py (+10, -0)

@@ -16,6 +16,7 @@
 import random
 from abc import ABC
 from functools import partial
+from typing import Tuple, Union

 import pandas as pd

@@ -76,4 +77,13 @@ class Answer(ComponentBase, ABC):
     def set_exception(self, e):
         self.exception = e

+    def output(self, allow_partial=True) -> Tuple[str, Union[pd.DataFrame, partial]]:
+        if allow_partial:
+            return super.output()
+
+        for r, c in self._canvas.history[::-1]:
+            if r == "user":
+                return self._param.output_var_name, pd.DataFrame([{"content": c}])
+
+        self._param.output_var_name, pd.DataFrame([])
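
Review note on the new `output` override: as written, `super.output()` references the built-in `super` type instead of calling it (`super().output()`), and the final `self._param.output_var_name, pd.DataFrame([])` builds a tuple without returning it, so that path returns `None`. Whether that is intended is not clear from this diff alone; a minimal, self-contained illustration of the `super()` detail:

```python
# Illustrative only -- not the committed code. The base implementation is
# only reached through a call: super() first, then the method.
class Base:
    def output(self):
        return "base output"

class Child(Base):
    def output(self):
        return super().output()

print(Child().output())   # "base output"
```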


api/apps/canvas_app.py (+4, -0)

@@ -146,12 +146,16 @@ def run():

                 canvas.messages.append({"role": "assistant", "content": final_ans["content"], "id": message_id})
                 canvas.history.append(("assistant", final_ans["content"]))
+                if not canvas.path[-1]:
+                    canvas.path.pop(-1)
                 if final_ans.get("reference"):
                     canvas.reference.append(final_ans["reference"])
                 cvs.dsl = json.loads(str(canvas))
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
             except Exception as e:
                 cvs.dsl = json.loads(str(canvas))
+                if not canvas.path[-1]:
+                    canvas.path.pop(-1)
                 UserCanvasService.update_by_id(req["id"], cvs.to_dict())
                 traceback.print_exc()
                 yield "data:" + json.dumps({"code": 500, "message": str(e),

api/apps/dialog_app.py (+2, -4)

@@ -103,10 +103,7 @@ def set_dialog():
             }
             if not DialogService.save(**dia):
                 return get_data_error_result(message="Fail to new a dialog!")
-            e, dia = DialogService.get_by_id(dia["id"])
-            if not e:
-                return get_data_error_result(message="Fail to new a dialog!")
-            return get_json_result(data=dia.to_json())
+            return get_json_result(data=dia)
         else:
             del req["dialog_id"]
             if "kb_names" in req:
@@ -117,6 +114,7 @@ def set_dialog():
             if not e:
                 return get_data_error_result(message="Fail to update a dialog!")
             dia = dia.to_dict()
+            dia.update(req)
             dia["kb_ids"], dia["kb_names"] = get_kb_names(dia["kb_ids"])
             return get_json_result(data=dia)
     except Exception as e:
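
Creating a dialog now returns the in-memory `dia` dict directly instead of re-fetching the row, and updating one overlays the request fields onto the stored record with `dia.update(req)` before responding. A minimal sketch of that overlay with plain dicts (field names are illustrative):

```python
# Plain dicts standing in for the stored Dialog record and the request body.
dia = {"id": "d1", "name": "Old name", "top_n": 6}
req = {"name": "New name", "top_n": 8}

dia.update(req)     # overlay the incoming fields on the stored record
print(dia)          # {'id': 'd1', 'name': 'New name', 'top_n': 8}
```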

api/apps/kb_app.py (+2, -1)

@@ -185,7 +185,8 @@ def rm():
                 return get_data_error_result(
                     message="Database error (Document removal)!")
             f2d = File2DocumentService.get_by_document_id(doc.id)
-            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
+            if f2d:
+                FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
             File2DocumentService.delete_by_document_id(doc.id)
         FileService.filter_delete(
             [File.source_type == FileSource.KNOWLEDGEBASE, File.type == "folder", File.name == kbs[0].name])
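
The new `if f2d:` guard skips the file deletion when the document has no linked file record, instead of raising `IndexError` on `f2d[0]`. A minimal sketch with a plain list standing in for the query result:

```python
# An empty list models File2DocumentService.get_by_document_id() finding no rows.
f2d = []

if f2d:
    print("delete linked file", f2d[0])   # only reached when a mapping exists
else:
    print("no linked file; skip the file deletion")
```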

api/utils/api_utils.py (+4, -0)

@@ -120,6 +120,10 @@ def server_error_response(e):
     if len(e.args) > 1:
         return get_json_result(
             code=settings.RetCode.EXCEPTION_ERROR, message=repr(e.args[0]), data=e.args[1])
+    if repr(e).find("index_not_found_exception") >= 0:
+        return get_json_result(code=settings.RetCode.EXCEPTION_ERROR,
+                               message="No chunk found, please upload file and parse it.")
+
     return get_json_result(code=settings.RetCode.EXCEPTION_ERROR, message=repr(e))
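
The added branch maps any exception whose `repr` contains `index_not_found_exception` to a friendlier message instead of echoing the raw search-engine error. A minimal sketch of the mapping (the exception class and text below are made up for illustration):

```python
class FakeSearchError(Exception):
    pass

e = FakeSearchError("index_not_found_exception: no such index [ragflow_xyz]")

if repr(e).find("index_not_found_exception") >= 0:
    message = "No chunk found, please upload file and parse it."
else:
    message = repr(e)

print(message)   # the friendly message, not the raw repr
```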



graphrag/graph_prompt.py (+4, -4)

@@ -11,20 +11,20 @@ Given a text document that is potentially relevant to this activity and a list o

 -Steps-
 1. Identify all entities. For each identified entity, extract the following information:
-- entity_name: Name of the entity, capitalized
+- entity_name: Name of the entity, capitalized, in language of 'Text'
 - entity_type: One of the following types: [{entity_types}]
-- entity_description: Comprehensive description of the entity's attributes and activities
+- entity_description: Comprehensive description of the entity's attributes and activities in language of 'Text'
 Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>

 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
-- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other in language of 'Text'
 - relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
 Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_strength>)

-3. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+3. Return output as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.

 4. When finished, output {completion_delimiter}
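
The prompt now asks for entity and relationship descriptions in the language of the input text and no longer forces English output. For reference, a sketch of how a template in this style gets its placeholders filled; the delimiter values below are assumptions, not necessarily the ones graphrag passes in:

```python
# A tiny excerpt in the same style as the prompt above (illustrative).
prompt_tmpl = (
    'Format each entity as ("entity"{tuple_delimiter}<entity_name>'
    "{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)\n"
    "Use **{record_delimiter}** as the list delimiter.\n"
    "When finished, output {completion_delimiter}"
)

prompt = prompt_tmpl.format(
    tuple_delimiter="<|>",          # assumed delimiter values
    record_delimiter="##",
    completion_delimiter="<|COMPLETE|>",
)
print(prompt)
```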


graphrag/utils.py (+1, -1)

@@ -81,7 +81,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
     return bin


-def set_llm_cache(llmnm, txt, v: str, history, genconf):
+def set_llm_cache(llmnm, txt, v, history, genconf):
     hasher = xxhash.xxh64()
     hasher.update(str(llmnm).encode("utf-8"))
     hasher.update(str(txt).encode("utf-8"))
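
Dropping the `v: str` annotation lets callers cache non-string values (dicts, lists); the surrounding code already coerces its inputs with `str()` before hashing. A minimal sketch of that key-derivation pattern (the helper name is illustrative, not the real function):

```python
import xxhash

def cache_key(llmnm, txt, history, genconf):
    # Coerce every part with str() before hashing, as the code above does.
    hasher = xxhash.xxh64()
    for part in (llmnm, txt, history, genconf):
        hasher.update(str(part).encode("utf-8"))
    return hasher.hexdigest()

print(cache_key("gpt-4o-mini", "hello", [], {"temperature": 0.1}))
```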

rag/app/laws.py (+3, -5)

@@ -153,11 +153,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        for txt in Docx()(filename, binary):
-            sections.append(txt)
-        callback(0.8, "Finish parsing.")
-        chunks = sections
-        return tokenize_chunks(chunks, doc, eng, pdf_parser)
+        chunks = Docx()(filename, binary)
+        callback(0.7, "Finish parsing.")
+        return tokenize_chunks(chunks, doc, eng, None)

     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(
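
The DOCX branch now binds the parser's output directly instead of appending section by section, reports progress at 0.7, and passes `None` where a PDF parser is not applicable. A minimal sketch showing the removed and the new shape are equivalent, assuming `Docx()(...)` returns a list of text sections (the stub parser is illustrative):

```python
# Stub standing in for Docx()(filename, binary).
def fake_docx_parser(filename, binary=None):
    return ["Section 1 ...", "Section 2 ..."]

# removed shape: build the list one append at a time
sections = []
for txt in fake_docx_parser("contract.docx"):
    sections.append(txt)

# new shape: bind the parser's return value directly
chunks = fake_docx_parser("contract.docx")

assert chunks == sections
```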

rag/app/manual.py (+2, -2)

@@ -193,7 +193,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(t, lvl, [[0] * 5]) for t, lvl in sections]
         # set pivot using the most frequent type of title,
         # then merge between 2 pivot
-        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
             max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
             most_level = max(0, max_lvl - 1)
             levels = []
@@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         return res

-    if re.search(r"\.docx$", filename, re.IGNORECASE):
+    elif re.search(r"\.docx$", filename, re.IGNORECASE):
         docx_parser = Docx()
         ti_list, tbls = docx_parser(filename, binary,
                                     from_page=0, to_page=10000, callback=callback)
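
Lowering the outline-to-section ratio threshold from 0.1 to 0.03 makes the outline-based title pivot kick in for PDFs with sparser bookmarks, and the change from `if` to `elif` simply chains the DOCX branch onto the PDF one. A minimal sketch of the threshold effect with illustrative numbers:

```python
# 5 outline entries over 120 parsed sections (illustrative counts).
outlines = 5
sections = 120
ratio = outlines / sections          # ~0.042

print(ratio > 0.1, ratio > 0.03)     # False before the change, True after it
```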

rag/app/table.py (+1, -1)

@@ -185,7 +185,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
"datetime": "_dt",
"bool": "_kwd"}
for df in dfs:
for n in ["id", "index", "idx"]:
for n in ["id", "_id", "index", "idx"]:
if n in df.columns:
del df[n]
clmns = df.columns.values
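
The column filter now also drops a `_id` column (e.g. from a MongoDB-style export) along with the other index-like columns before the table rows are turned into chunks. A minimal pandas sketch:

```python
import pandas as pd

# Illustrative frame with an index-like "_id" column.
df = pd.DataFrame({"_id": [1, 2], "name": ["a", "b"], "score": [0.9, 0.7]})

for n in ["id", "_id", "index", "idx"]:
    if n in df.columns:
        del df[n]

print(list(df.columns))   # ['name', 'score']
```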
