Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553
  1. import pathlib
  2. import re
  3. import datetime
  4. import json
  5. import traceback
  6. from botocore.docs.method import document_model_driven_method
  7. from flask import request
  8. from flask_login import login_required, current_user
  9. from elasticsearch_dsl import Q
  10. from sphinx.addnodes import document
  11. from rag.app.qa import rmPrefix, beAdoc
  12. from rag.nlp import search, rag_tokenizer, keyword_extraction
  13. from rag.utils.es_conn import ELASTICSEARCH
  14. from rag.utils import rmSpace
  15. from api.db import LLMType, ParserType
  16. from api.db.services.knowledgebase_service import KnowledgebaseService
  17. from api.db.services.llm_service import TenantLLMService
  18. from api.db.services.user_service import UserTenantService
  19. from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
  20. from api.db.services.document_service import DocumentService
  21. from api.settings import RetCode, retrievaler, kg_retrievaler
  22. from api.utils.api_utils import get_result
  23. import hashlib
  24. import re
  25. from api.utils.api_utils import get_result, token_required, get_error_data_result
  26. from api.db.db_models import Task, File
  27. from api.db.services.task_service import TaskService, queue_tasks
  28. from api.db.services.user_service import TenantService, UserTenantService
  29. from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
  30. from api.utils.api_utils import get_result, get_result, get_error_data_result
  31. from functools import partial
  32. from io import BytesIO
  33. from elasticsearch_dsl import Q
  34. from flask import request, send_file
  35. from flask_login import login_required
  36. from api.db import FileSource, TaskStatus, FileType
  37. from api.db.db_models import File
  38. from api.db.services.document_service import DocumentService
  39. from api.db.services.file2document_service import File2DocumentService
  40. from api.db.services.file_service import FileService
  41. from api.db.services.knowledgebase_service import KnowledgebaseService
  42. from api.settings import RetCode, retrievaler
  43. from api.utils.api_utils import construct_json_result, construct_error_response
  44. from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
  45. from rag.nlp import search
  46. from rag.utils import rmSpace
  47. from rag.utils.es_conn import ELASTICSEARCH
  48. from rag.utils.storage_factory import STORAGE_IMPL
  49. MAXIMUM_OF_UPLOADING_FILES = 256
  50. MAXIMUM_OF_UPLOADING_FILES = 256
  51. @manager.route('/dataset/<dataset_id>/document', methods=['POST'])
  52. @token_required
  53. def upload(dataset_id, tenant_id):
  54. if 'file' not in request.files:
  55. return get_error_data_result(
  56. retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
  57. file_objs = request.files.getlist('file')
  58. for file_obj in file_objs:
  59. if file_obj.filename == '':
  60. return get_result(
  61. retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
  62. e, kb = KnowledgebaseService.get_by_id(dataset_id)
  63. if not e:
  64. raise LookupError(f"Can't find the knowledgebase with ID {dataset_id}!")
  65. err, _ = FileService.upload_document(kb, file_objs, tenant_id)
  66. if err:
  67. return get_result(
  68. retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
  69. return get_result()
  70. @manager.route('/dataset/<dataset_id>/info/<document_id>', methods=['PUT'])
  71. @token_required
  72. def update_doc(tenant_id, dataset_id, document_id):
  73. req = request.json
  74. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  75. return get_error_data_result(retmsg='You do not own the dataset.')
  76. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  77. if not doc:
  78. return get_error_data_result(retmsg='The dataset not own the document.')
  79. doc = doc[0]
  80. if "chunk_count" in req:
  81. if req["chunk_count"] != doc.chunk_num:
  82. return get_error_data_result(retmsg="Can't change chunk_count.")
  83. if "token_count" in req:
  84. if req["token_count"] != doc.token_num:
  85. return get_error_data_result(retmsg="Can't change token_count.")
  86. if "progress" in req:
  87. if req['progress'] != doc.progress:
  88. return get_error_data_result(retmsg="Can't change progress.")
  89. if "name" in req and req["name"] != doc.name:
  90. if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
  91. return get_result(retmsg="The extension of file can't be changed", retcode=RetCode.ARGUMENT_ERROR)
  92. for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
  93. if d.name == req["name"]:
  94. return get_error_data_result(
  95. retmsg="Duplicated document name in the same knowledgebase.")
  96. if not DocumentService.update_by_id(
  97. document_id, {"name": req["name"]}):
  98. return get_error_data_result(
  99. retmsg="Database error (Document rename)!")
  100. informs = File2DocumentService.get_by_document_id(document_id)
  101. if informs:
  102. e, file = FileService.get_by_id(informs[0].file_id)
  103. FileService.update_by_id(file.id, {"name": req["name"]})
  104. if "parser_method" in req:
  105. if doc.parser_id.lower() == req["parser_method"].lower():
  106. if "parser_config" in req:
  107. if req["parser_config"] == doc.parser_config:
  108. return get_result(retcode=RetCode.SUCCESS)
  109. else:
  110. return get_result(retcode=RetCode.SUCCESS)
  111. if doc.type == FileType.VISUAL or re.search(
  112. r"\.(ppt|pptx|pages)$", doc.name):
  113. return get_error_data_result(retmsg="Not supported yet!")
  114. e = DocumentService.update_by_id(doc.id,
  115. {"parser_id": req["parser_method"], "progress": 0, "progress_msg": "",
  116. "run": TaskStatus.UNSTART.value})
  117. if not e:
  118. return get_error_data_result(retmsg="Document not found!")
  119. if doc.token_num > 0:
  120. e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
  121. doc.process_duation * -1)
  122. if not e:
  123. return get_error_data_result(retmsg="Document not found!")
  124. tenant_id = DocumentService.get_tenant_id(req["id"])
  125. if not tenant_id:
  126. return get_error_data_result(retmsg="Tenant not found!")
  127. ELASTICSEARCH.deleteByQuery(
  128. Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
  129. if "parser_config" in req:
  130. DocumentService.update_parser_config(doc.id, req["parser_config"])
  131. return get_result()
  132. @manager.route('/dataset/<dataset_id>/document/<document_id>', methods=['GET'])
  133. @token_required
  134. def download(tenant_id, dataset_id, document_id):
  135. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  136. return get_error_data_result(retmsg=f'You do not own the dataset {dataset_id}.')
  137. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  138. if not doc:
  139. return get_error_data_result(retmsg=f'The dataset not own the document {doc.id}.')
  140. # The process of downloading
  141. doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address
  142. file_stream = STORAGE_IMPL.get(doc_id, doc_location)
  143. if not file_stream:
  144. return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
  145. file = BytesIO(file_stream)
  146. # Use send_file with a proper filename and MIME type
  147. return send_file(
  148. file,
  149. as_attachment=True,
  150. download_name=doc[0].name,
  151. mimetype='application/octet-stream' # Set a default MIME type
  152. )
  153. @manager.route('/dataset/<dataset_id>/info', methods=['GET'])
  154. @token_required
  155. def list_docs(dataset_id, tenant_id):
  156. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  157. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
  158. id = request.args.get("id")
  159. if not DocumentService.query(id=id,kb_id=dataset_id):
  160. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  161. offset = int(request.args.get("offset", 1))
  162. keywords = request.args.get("keywords","")
  163. limit = int(request.args.get("limit", 1024))
  164. orderby = request.args.get("orderby", "create_time")
  165. if request.args.get("desc") == "False":
  166. desc = False
  167. else:
  168. desc = True
  169. docs, tol = DocumentService.get_list(dataset_id, offset, limit, orderby, desc, keywords, id)
  170. # rename key's name
  171. renamed_doc_list = []
  172. for doc in docs:
  173. key_mapping = {
  174. "chunk_num": "chunk_count",
  175. "kb_id": "knowledgebase_id",
  176. "token_num": "token_count",
  177. "parser_id": "parser_method"
  178. }
  179. renamed_doc = {}
  180. for key, value in doc.items():
  181. new_key = key_mapping.get(key, key)
  182. renamed_doc[new_key] = value
  183. renamed_doc_list.append(renamed_doc)
  184. return get_result(data={"total": tol, "docs": renamed_doc_list})
@manager.route('/dataset/<dataset_id>/document', methods=['DELETE'])
@token_required
def delete(tenant_id,dataset_id):
    # Delete a batch of documents (req["ids"]) from the dataset: DB rows,
    # file/document link records, and the stored blob. Failures raised inside
    # the loop are accumulated into `errors` and reported at the end.
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
    req = request.json
    if not req.get("ids"):
        return get_error_data_result(retmsg="ids is required")
    doc_ids = req["ids"]
    # Ensure the tenant's KB folder structure exists before touching files.
    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)
    errors = ""
    for doc_id in doc_ids:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_error_data_result(retmsg="Document not found!")
            # NOTE: this rebinds the `tenant_id` parameter to the document
            # owner's tenant for the rest of the loop.
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                return get_error_data_result(retmsg="Tenant not found!")
            # Resolve the storage bucket/name before the DB rows are removed.
            b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
            if not DocumentService.remove_document(doc, tenant_id):
                return get_error_data_result(
                    retmsg="Database error (Document removal)!")
            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
            File2DocumentService.delete_by_document_id(doc_id)
            # Remove the raw object from storage last.
            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            # Best-effort: record the failure and continue with remaining ids.
            errors += str(e)
    if errors:
        return get_result(retmsg=errors, retcode=RetCode.SERVER_ERROR)
    return get_result()
  219. @manager.route('/dataset/<dataset_id>/chunk', methods=['POST'])
  220. @token_required
  221. def parse(tenant_id,dataset_id):
  222. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  223. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  224. req = request.json
  225. for id in req["document_ids"]:
  226. if not DocumentService.query(id=id,kb_id=dataset_id):
  227. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  228. info = {"run": "1", "progress": 0}
  229. info["progress_msg"] = ""
  230. info["chunk_num"] = 0
  231. info["token_num"] = 0
  232. DocumentService.update_by_id(id, info)
  233. # if str(req["run"]) == TaskStatus.CANCEL.value:
  234. ELASTICSEARCH.deleteByQuery(
  235. Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
  236. TaskService.filter_delete([Task.doc_id == id])
  237. e, doc = DocumentService.get_by_id(id)
  238. doc = doc.to_dict()
  239. doc["tenant_id"] = tenant_id
  240. bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
  241. queue_tasks(doc, bucket, name)
  242. return get_result()
  243. @manager.route('/dataset/<dataset_id>/chunk', methods=['DELETE'])
  244. @token_required
  245. def stop_parsing(tenant_id,dataset_id):
  246. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  247. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  248. req = request.json
  249. for id in req["document_ids"]:
  250. if not DocumentService.query(id=id,kb_id=dataset_id):
  251. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  252. info = {"run": "2", "progress": 0}
  253. DocumentService.update_by_id(id, info)
  254. # if str(req["run"]) == TaskStatus.CANCEL.value:
  255. tenant_id = DocumentService.get_tenant_id(id)
  256. ELASTICSEARCH.deleteByQuery(
  257. Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
  258. return get_result()
  259. @manager.route('/dataset/{dataset_id}/document/{document_id}/chunk', methods=['GET'])
  260. @token_required
  261. def list_chunk(tenant_id,dataset_id,document_id):
  262. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  263. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  264. doc=DocumentService.query(id=document_id, kb_id=dataset_id)
  265. if not doc:
  266. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  267. doc=doc[0]
  268. req = request.args
  269. doc_id = document_id
  270. page = int(req.get("offset", 1))
  271. size = int(req.get("limit", 30))
  272. question = req.get("keywords", "")
  273. try:
  274. query = {
  275. "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
  276. }
  277. if "available_int" in req:
  278. query["available_int"] = int(req["available_int"])
  279. sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
  280. res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
  281. origin_chunks = []
  282. for id in sres.ids:
  283. d = {
  284. "chunk_id": id,
  285. "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
  286. id].get(
  287. "content_with_weight", ""),
  288. "doc_id": sres.field[id]["doc_id"],
  289. "docnm_kwd": sres.field[id]["docnm_kwd"],
  290. "important_kwd": sres.field[id].get("important_kwd", []),
  291. "img_id": sres.field[id].get("img_id", ""),
  292. "available_int": sres.field[id].get("available_int", 1),
  293. "positions": sres.field[id].get("position_int", "").split("\t")
  294. }
  295. if len(d["positions"]) % 5 == 0:
  296. poss = []
  297. for i in range(0, len(d["positions"]), 5):
  298. poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
  299. float(d["positions"][i + 3]), float(d["positions"][i + 4])])
  300. d["positions"] = poss
  301. origin_chunks.append(d)
  302. ##rename keys
  303. for chunk in origin_chunks:
  304. key_mapping = {
  305. "chunk_id": "id",
  306. "content_with_weight": "content",
  307. "doc_id": "document_id",
  308. "important_kwd": "important_keywords",
  309. "img_id": "image_id",
  310. }
  311. renamed_chunk = {}
  312. for key, value in chunk.items():
  313. new_key = key_mapping.get(key, key)
  314. renamed_chunk[new_key] = value
  315. res["chunks"].append(renamed_chunk)
  316. return get_result(data=res)
  317. except Exception as e:
  318. if str(e).find("not_found") > 0:
  319. return get_result(retmsg=f'No chunk found!',
  320. retcode=RetCode.DATA_ERROR)
  321. return server_error_response(e)
  322. @manager.route('/dataset/{dataset_id}/document/{document_id}/chunk', methods=['POST'])
  323. @token_required
  324. def create(tenant_id,dataset_id,document_id):
  325. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  326. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  327. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  328. if not doc:
  329. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  330. req = request.json
  331. if not req.get("content"):
  332. return get_error_data_result(retmsg="`content` is required")
  333. md5 = hashlib.md5()
  334. md5.update((req["content"] + document_id).encode("utf-8"))
  335. chunk_id = md5.hexdigest()
  336. d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
  337. "content_with_weight": req["content"]}
  338. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  339. d["important_kwd"] = req.get("important_kwd", [])
  340. d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
  341. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  342. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  343. d["kb_id"] = [doc.kb_id]
  344. d["docnm_kwd"] = doc.name
  345. d["doc_id"] = doc.id
  346. embd_id = DocumentService.get_embd_id(document_id)
  347. embd_mdl = TenantLLMService.model_instance(
  348. tenant_id, LLMType.EMBEDDING.value, embd_id)
  349. v, c = embd_mdl.encode([doc.name, req["content"]])
  350. v = 0.1 * v[0] + 0.9 * v[1]
  351. d["q_%d_vec" % len(v)] = v.tolist()
  352. ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
  353. DocumentService.increment_chunk_num(
  354. doc.id, doc.kb_id, c, 1, 0)
  355. d["chunk_id"] = chunk_id
  356. # rename keys
  357. key_mapping = {
  358. "chunk_id": "id",
  359. "content_with_weight": "content",
  360. "doc_id": "document_id",
  361. "important_kwd": "important_keywords",
  362. "kb_id": "dataset_id",
  363. "create_timestamp_flt": "create_timestamp",
  364. "create_time": "create_time",
  365. "document_keyword": "document",
  366. }
  367. renamed_chunk = {}
  368. for key, value in d.items():
  369. if key in key_mapping:
  370. new_key = key_mapping.get(key, key)
  371. renamed_chunk[new_key] = value
  372. return get_result(data={"chunk": renamed_chunk})
  373. # return get_result(data={"chunk_id": chunk_id})
  374. @manager.route('dataset/{dataset_id}/document/{document_id}/chunk', methods=['DELETE'])
  375. @token_required
  376. def rm_chunk(tenant_id,dataset_id,document_id):
  377. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  378. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  379. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  380. if not doc:
  381. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  382. req = request.json
  383. if not req.get("chunk_ids"):
  384. return get_error_data_result("`chunk_ids` is required")
  385. if not ELASTICSEARCH.deleteByQuery(
  386. Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
  387. return get_error_data_result(retmsg="Index updating failure")
  388. deleted_chunk_ids = req["chunk_ids"]
  389. chunk_number = len(deleted_chunk_ids)
  390. DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
  391. return get_result()
  392. @manager.route('/dataset/{dataset_id}/document/{document_id}/chunk/{chunk_id}', methods=['PUT'])
  393. @token_required
  394. def set(tenant_id,dataset_id,document_id,chunk_id):
  395. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  396. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  397. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  398. if not doc:
  399. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  400. req = request.json
  401. if not req.get("content"):
  402. return get_error_data_result("`content` is required")
  403. if not req.get("important_keywords"):
  404. return get_error_data_result("`important_keywords` is required")
  405. d = {
  406. "id": chunk_id,
  407. "content_with_weight": req["content"]}
  408. d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
  409. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  410. d["important_kwd"] = req["important_keywords"]
  411. d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
  412. if "available" in req:
  413. d["available_int"] = req["available"]
  414. embd_id = DocumentService.get_embd_id(document_id)
  415. embd_mdl = TenantLLMService.model_instance(
  416. tenant_id, LLMType.EMBEDDING.value, embd_id)
  417. if doc.parser_id == ParserType.QA:
  418. arr = [
  419. t for t in re.split(
  420. r"[\n\t]",
  421. req["content"]) if len(t) > 1]
  422. if len(arr) != 2:
  423. return get_error_data_result(
  424. retmsg="Q&A must be separated by TAB/ENTER key.")
  425. q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
  426. d = beAdoc(d, arr[0], arr[1], not any(
  427. [rag_tokenizer.is_chinese(t) for t in q + a]))
  428. v, c = embd_mdl.encode([doc.name, req["content"]])
  429. v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
  430. d["q_%d_vec" % len(v)] = v.tolist()
  431. ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
  432. return get_result()
@manager.route('/retrieval', methods=['GET'])
@token_required
def retrieval_test(tenant_id):
    # Run a retrieval query over one or more datasets and return the matching
    # chunks with public API field names.
    req = request.args
    if not req.get("datasets"):
        return get_error_data_result("`datasets` is required.")
    # NOTE(review): request.args.get("datasets") returns a single string, so
    # this loop iterates its characters rather than dataset ids — presumably
    # getlist("datasets") was intended; confirm against the client SDK.
    for id in req.get("datasets"):
        if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
            return get_error_data_result(f"You don't own the dataset {id}.")
    if not req.get("question"):
        return get_error_data_result("`question` is required.")
    page = int(req.get("offset", 1))
    size = int(req.get("limit", 30))
    question = req["question"]
    kb_id = req["datasets"]
    if isinstance(kb_id, str): kb_id = [kb_id]
    doc_ids = req.get("documents", [])
    similarity_threshold = float(req.get("similarity_threshold", 0.2))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = int(req.get("top_k", 1024))
    try:
        # The embedding model is taken from the first dataset in the list.
        e, kb = KnowledgebaseService.get_by_id(kb_id[0])
        if not e:
            return get_error_data_result(retmsg="Knowledgebase not found!")
        embd_mdl = TenantLLMService.model_instance(
            kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
        rerank_mdl = None
        if req.get("rerank_id"):
            rerank_mdl = TenantLLMService.model_instance(
                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])
        if req.get("keyword", False):
            # Optionally expand the question with LLM-extracted keywords.
            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
            question += keyword_extraction(chat_mdl, question)
        # Knowledge-graph datasets use the dedicated KG retriever.
        retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
        ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
                               similarity_threshold, vector_similarity_weight, top,
                               doc_ids, rerank_mdl=rerank_mdl, highlight=req.get("highlight"))
        # Raw embedding vectors are internal; strip them from the response.
        for c in ranks["chunks"]:
            if "vector" in c:
                del c["vector"]
        # Map internal field names to the public API field names.
        renamed_chunks = []
        for chunk in ranks["chunks"]:
            key_mapping = {
                "chunk_id": "id",
                "content_with_weight": "content",
                "doc_id": "document_id",
                "important_kwd": "important_keywords",
                "docnm_kwd": "document_keyword"
            }
            rename_chunk = {}
            for key, value in chunk.items():
                new_key = key_mapping.get(key, key)
                rename_chunk[new_key] = value
            renamed_chunks.append(rename_chunk)
        ranks["chunks"] = renamed_chunks
        return get_result(data=ranks)
    except Exception as e:
        if str(e).find("not_found") > 0:
            return get_result(retmsg=f'No chunk found! Check the chunk status please!',
                              retcode=RetCode.DATA_ERROR)
        return server_error_response(e)