|
|
|
@@ -99,6 +99,7 @@ def docinfos(tenant_id): |
|
|
|
"chunk_num": "chunk_count", |
|
|
|
"kb_id": "knowledgebase_id", |
|
|
|
"token_num": "token_count", |
|
|
|
"parser_id":"parser_method", |
|
|
|
} |
|
|
|
renamed_doc = {} |
|
|
|
for key, value in doc.to_dict().items(): |
|
|
|
@@ -125,10 +126,14 @@ def save_doc(tenant_id): |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
#other value can't be changed |
|
|
|
if "chunk_num" in req: |
|
|
|
if req["chunk_num"] != doc.chunk_num: |
|
|
|
if "chunk_count" in req: |
|
|
|
if req["chunk_count"] != doc.chunk_num: |
|
|
|
return get_data_error_result( |
|
|
|
retmsg="Can't change chunk_count.") |
|
|
|
if "token_count" in req: |
|
|
|
if req["token_count"] != doc.token_num: |
|
|
|
return get_data_error_result( |
|
|
|
retmsg="Can't change token_count.") |
|
|
|
if "progress" in req: |
|
|
|
if req['progress'] != doc.progress: |
|
|
|
return get_data_error_result( |
|
|
|
@@ -158,9 +163,9 @@ def save_doc(tenant_id): |
|
|
|
FileService.update_by_id(file.id, {"name": req["name"]}) |
|
|
|
except Exception as e: |
|
|
|
return server_error_response(e) |
|
|
|
if "parser_id" in req: |
|
|
|
if "parser_method" in req: |
|
|
|
try: |
|
|
|
if doc.parser_id.lower() == req["parser_id"].lower(): |
|
|
|
if doc.parser_id.lower() == req["parser_method"].lower(): |
|
|
|
if "parser_config" in req: |
|
|
|
if req["parser_config"] == doc.parser_config: |
|
|
|
return get_json_result(data=True) |
|
|
|
@@ -172,7 +177,7 @@ def save_doc(tenant_id): |
|
|
|
return get_data_error_result(retmsg="Not supported yet!") |
|
|
|
|
|
|
|
e = DocumentService.update_by_id(doc.id, |
|
|
|
{"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", |
|
|
|
{"parser_id": req["parser_method"], "progress": 0, "progress_msg": "", |
|
|
|
"run": TaskStatus.UNSTART.value}) |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
@@ -183,7 +188,7 @@ def save_doc(tenant_id): |
|
|
|
doc.process_duation * -1) |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"]) |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["id"]) |
|
|
|
if not tenant_id: |
|
|
|
return get_data_error_result(retmsg="Tenant not found!") |
|
|
|
ELASTICSEARCH.deleteByQuery( |
|
|
|
@@ -272,7 +277,7 @@ def rename(): |
|
|
|
|
|
|
|
@manager.route("/<document_id>", methods=["GET"]) |
|
|
|
@token_required |
|
|
|
def download_document(dataset_id, document_id,tenant_id): |
|
|
|
def download_document(document_id,tenant_id): |
|
|
|
try: |
|
|
|
# Check whether there is this document |
|
|
|
exist, document = DocumentService.get_by_id(document_id) |
|
|
|
@@ -304,7 +309,7 @@ def download_document(dataset_id, document_id,tenant_id): |
|
|
|
@manager.route('/dataset/<dataset_id>/documents', methods=['GET']) |
|
|
|
@token_required |
|
|
|
def list_docs(dataset_id, tenant_id): |
|
|
|
kb_id = request.args.get("kb_id") |
|
|
|
kb_id = request.args.get("knowledgebase_id") |
|
|
|
if not kb_id: |
|
|
|
return get_json_result( |
|
|
|
data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR) |
|
|
|
@@ -334,6 +339,7 @@ def list_docs(dataset_id, tenant_id): |
|
|
|
"chunk_num": "chunk_count", |
|
|
|
"kb_id": "knowledgebase_id", |
|
|
|
"token_num": "token_count", |
|
|
|
"parser_id":"parser_method" |
|
|
|
} |
|
|
|
renamed_doc = {} |
|
|
|
for key, value in doc.items(): |
|
|
|
@@ -349,10 +355,10 @@ def list_docs(dataset_id, tenant_id): |
|
|
|
@token_required |
|
|
|
def rm(tenant_id): |
|
|
|
req = request.args |
|
|
|
if "doc_id" not in req: |
|
|
|
if "document_id" not in req: |
|
|
|
return get_data_error_result( |
|
|
|
retmsg="doc_id is required") |
|
|
|
doc_ids = req["doc_id"] |
|
|
|
doc_ids = req["document_id"] |
|
|
|
if isinstance(doc_ids, str): doc_ids = [doc_ids] |
|
|
|
root_folder = FileService.get_root_folder(tenant_id) |
|
|
|
pf_id = root_folder["id"] |
|
|
|
@@ -413,7 +419,7 @@ def show_parsing_status(tenant_id, document_id): |
|
|
|
def run(tenant_id): |
|
|
|
req = request.json |
|
|
|
try: |
|
|
|
for id in req["doc_ids"]: |
|
|
|
for id in req["document_ids"]: |
|
|
|
info = {"run": str(req["run"]), "progress": 0} |
|
|
|
if str(req["run"]) == TaskStatus.RUNNING.value: |
|
|
|
info["progress_msg"] = "" |
|
|
|
@@ -442,15 +448,15 @@ def run(tenant_id): |
|
|
|
|
|
|
|
@manager.route('/chunk/list', methods=['POST']) |
|
|
|
@token_required |
|
|
|
@validate_request("doc_id") |
|
|
|
@validate_request("document_id") |
|
|
|
def list_chunk(tenant_id): |
|
|
|
req = request.json |
|
|
|
doc_id = req["doc_id"] |
|
|
|
doc_id = req["document_id"] |
|
|
|
page = int(req.get("page", 1)) |
|
|
|
size = int(req.get("size", 30)) |
|
|
|
question = req.get("keywords", "") |
|
|
|
try: |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"]) |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["document_id"]) |
|
|
|
if not tenant_id: |
|
|
|
return get_data_error_result(retmsg="Tenant not found!") |
|
|
|
e, doc = DocumentService.get_by_id(doc_id) |
|
|
|
@@ -509,15 +515,15 @@ def list_chunk(tenant_id): |
|
|
|
|
|
|
|
@manager.route('/chunk/create', methods=['POST']) |
|
|
|
@token_required |
|
|
|
@validate_request("doc_id", "content_with_weight") |
|
|
|
@validate_request("document_id", "content") |
|
|
|
def create(tenant_id): |
|
|
|
req = request.json |
|
|
|
md5 = hashlib.md5() |
|
|
|
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) |
|
|
|
md5.update((req["content"] + req["document_id"]).encode("utf-8")) |
|
|
|
|
|
|
|
chunk_id = md5.hexdigest() |
|
|
|
d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), |
|
|
|
"content_with_weight": req["content_with_weight"]} |
|
|
|
d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]), |
|
|
|
"content_with_weight": req["content"]} |
|
|
|
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) |
|
|
|
d["important_kwd"] = req.get("important_kwd", []) |
|
|
|
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) |
|
|
|
@@ -525,22 +531,22 @@ def create(tenant_id): |
|
|
|
d["create_timestamp_flt"] = datetime.datetime.now().timestamp() |
|
|
|
|
|
|
|
try: |
|
|
|
e, doc = DocumentService.get_by_id(req["doc_id"]) |
|
|
|
e, doc = DocumentService.get_by_id(req["document_id"]) |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
d["kb_id"] = [doc.kb_id] |
|
|
|
d["docnm_kwd"] = doc.name |
|
|
|
d["doc_id"] = doc.id |
|
|
|
|
|
|
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"]) |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["document_id"]) |
|
|
|
if not tenant_id: |
|
|
|
return get_data_error_result(retmsg="Tenant not found!") |
|
|
|
|
|
|
|
embd_id = DocumentService.get_embd_id(req["doc_id"]) |
|
|
|
embd_id = DocumentService.get_embd_id(req["document_id"]) |
|
|
|
embd_mdl = TenantLLMService.model_instance( |
|
|
|
tenant_id, LLMType.EMBEDDING.value, embd_id) |
|
|
|
|
|
|
|
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) |
|
|
|
v, c = embd_mdl.encode([doc.name, req["content"]]) |
|
|
|
v = 0.1 * v[0] + 0.9 * v[1] |
|
|
|
d["q_%d_vec" % len(v)] = v.tolist() |
|
|
|
ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) |
|
|
|
@@ -568,14 +574,14 @@ def create(tenant_id): |
|
|
|
|
|
|
|
@manager.route('/chunk/rm', methods=['POST']) |
|
|
|
@token_required |
|
|
|
@validate_request("chunk_ids", "doc_id") |
|
|
|
@validate_request("chunk_ids", "document_id") |
|
|
|
def rm_chunk(tenant_id): |
|
|
|
req = request.json |
|
|
|
try: |
|
|
|
if not ELASTICSEARCH.deleteByQuery( |
|
|
|
Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)): |
|
|
|
return get_data_error_result(retmsg="Index updating failure") |
|
|
|
e, doc = DocumentService.get_by_id(req["doc_id"]) |
|
|
|
e, doc = DocumentService.get_by_id(req["document_id"]) |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
deleted_chunk_ids = req["chunk_ids"] |
|
|
|
@@ -587,30 +593,30 @@ def rm_chunk(tenant_id): |
|
|
|
|
|
|
|
@manager.route('/chunk/set', methods=['POST']) |
|
|
|
@token_required |
|
|
|
@validate_request("doc_id", "chunk_id", "content_with_weight", |
|
|
|
"important_kwd") |
|
|
|
@validate_request("document_id", "chunk_id", "content", |
|
|
|
"important_keywords") |
|
|
|
def set(tenant_id): |
|
|
|
req = request.json |
|
|
|
d = { |
|
|
|
"id": req["chunk_id"], |
|
|
|
"content_with_weight": req["content_with_weight"]} |
|
|
|
d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) |
|
|
|
"content_with_weight": req["content"]} |
|
|
|
d["content_ltks"] = rag_tokenizer.tokenize(req["content"]) |
|
|
|
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) |
|
|
|
d["important_kwd"] = req["important_kwd"] |
|
|
|
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) |
|
|
|
d["important_kwd"] = req["important_keywords"] |
|
|
|
d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"])) |
|
|
|
if "available_int" in req: |
|
|
|
d["available_int"] = req["available_int"] |
|
|
|
|
|
|
|
try: |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["doc_id"]) |
|
|
|
tenant_id = DocumentService.get_tenant_id(req["document_id"]) |
|
|
|
if not tenant_id: |
|
|
|
return get_data_error_result(retmsg="Tenant not found!") |
|
|
|
|
|
|
|
embd_id = DocumentService.get_embd_id(req["doc_id"]) |
|
|
|
embd_id = DocumentService.get_embd_id(req["document_id"]) |
|
|
|
embd_mdl = TenantLLMService.model_instance( |
|
|
|
tenant_id, LLMType.EMBEDDING.value, embd_id) |
|
|
|
|
|
|
|
e, doc = DocumentService.get_by_id(req["doc_id"]) |
|
|
|
e, doc = DocumentService.get_by_id(req["document_id"]) |
|
|
|
if not e: |
|
|
|
return get_data_error_result(retmsg="Document not found!") |
|
|
|
|
|
|
|
@@ -618,7 +624,7 @@ def set(tenant_id): |
|
|
|
arr = [ |
|
|
|
t for t in re.split( |
|
|
|
r"[\n\t]", |
|
|
|
req["content_with_weight"]) if len(t) > 1] |
|
|
|
req["content"]) if len(t) > 1] |
|
|
|
if len(arr) != 2: |
|
|
|
return get_data_error_result( |
|
|
|
retmsg="Q&A must be separated by TAB/ENTER key.") |
|
|
|
@@ -626,7 +632,7 @@ def set(tenant_id): |
|
|
|
d = beAdoc(d, arr[0], arr[1], not any( |
|
|
|
[rag_tokenizer.is_chinese(t) for t in q + a])) |
|
|
|
|
|
|
|
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) |
|
|
|
v, c = embd_mdl.encode([doc.name, req["content"]]) |
|
|
|
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] |
|
|
|
d["q_%d_vec" % len(v)] = v.tolist() |
|
|
|
ELASTICSEARCH.upsert([d], search.index_name(tenant_id)) |
|
|
|
@@ -636,13 +642,13 @@ def set(tenant_id): |
|
|
|
|
|
|
|
@manager.route('/retrieval_test', methods=['POST']) |
|
|
|
@token_required |
|
|
|
@validate_request("kb_id", "question") |
|
|
|
@validate_request("knowledgebase_id", "question") |
|
|
|
def retrieval_test(tenant_id): |
|
|
|
req = request.json |
|
|
|
page = int(req.get("page", 1)) |
|
|
|
size = int(req.get("size", 30)) |
|
|
|
question = req["question"] |
|
|
|
kb_id = req["kb_id"] |
|
|
|
kb_id = req["knowledgebase_id"] |
|
|
|
if isinstance(kb_id, str): kb_id = [kb_id] |
|
|
|
doc_ids = req.get("doc_ids", []) |
|
|
|
similarity_threshold = float(req.get("similarity_threshold", 0.2)) |
|
|
|
@@ -693,6 +699,7 @@ def retrieval_test(tenant_id): |
|
|
|
"content_with_weight": "content", |
|
|
|
"doc_id": "document_id", |
|
|
|
"important_kwd": "important_keywords", |
|
|
|
"docnm_kwd":"document_keyword" |
|
|
|
} |
|
|
|
rename_chunk={} |
|
|
|
for key, value in chunk.items(): |