Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567
  1. import pathlib
  2. import re
  3. import datetime
  4. import json
  5. import traceback
  6. from botocore.docs.method import document_model_driven_method
  7. from flask import request
  8. from flask_login import login_required, current_user
  9. from elasticsearch_dsl import Q
  10. from pygments import highlight
  11. from sphinx.addnodes import document
  12. from rag.app.qa import rmPrefix, beAdoc
  13. from rag.nlp import search, rag_tokenizer, keyword_extraction
  14. from rag.utils.es_conn import ELASTICSEARCH
  15. from rag.utils import rmSpace
  16. from api.db import LLMType, ParserType
  17. from api.db.services.knowledgebase_service import KnowledgebaseService
  18. from api.db.services.llm_service import TenantLLMService
  19. from api.db.services.user_service import UserTenantService
  20. from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
  21. from api.db.services.document_service import DocumentService
  22. from api.settings import RetCode, retrievaler, kg_retrievaler
  23. from api.utils.api_utils import get_result
  24. import hashlib
  25. import re
  26. from api.utils.api_utils import get_result, token_required, get_error_data_result
  27. from api.db.db_models import Task, File
  28. from api.db.services.task_service import TaskService, queue_tasks
  29. from api.db.services.user_service import TenantService, UserTenantService
  30. from api.utils.api_utils import server_error_response, get_error_data_result, validate_request
  31. from api.utils.api_utils import get_result, get_result, get_error_data_result
  32. from functools import partial
  33. from io import BytesIO
  34. from elasticsearch_dsl import Q
  35. from flask import request, send_file
  36. from flask_login import login_required
  37. from api.db import FileSource, TaskStatus, FileType
  38. from api.db.db_models import File
  39. from api.db.services.document_service import DocumentService
  40. from api.db.services.file2document_service import File2DocumentService
  41. from api.db.services.file_service import FileService
  42. from api.db.services.knowledgebase_service import KnowledgebaseService
  43. from api.settings import RetCode, retrievaler
  44. from api.utils.api_utils import construct_json_result, construct_error_response
  45. from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
  46. from rag.nlp import search
  47. from rag.utils import rmSpace
  48. from rag.utils.es_conn import ELASTICSEARCH
  49. from rag.utils.storage_factory import STORAGE_IMPL
  50. MAXIMUM_OF_UPLOADING_FILES = 256
  51. MAXIMUM_OF_UPLOADING_FILES = 256
  52. @manager.route('/dataset/<dataset_id>/document', methods=['POST'])
  53. @token_required
  54. def upload(dataset_id, tenant_id):
  55. if 'file' not in request.files:
  56. return get_error_data_result(
  57. retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
  58. file_objs = request.files.getlist('file')
  59. for file_obj in file_objs:
  60. if file_obj.filename == '':
  61. return get_result(
  62. retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
  63. e, kb = KnowledgebaseService.get_by_id(dataset_id)
  64. if not e:
  65. raise LookupError(f"Can't find the knowledgebase with ID {dataset_id}!")
  66. err, _ = FileService.upload_document(kb, file_objs, tenant_id)
  67. if err:
  68. return get_result(
  69. retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
  70. return get_result()
  71. @manager.route('/dataset/<dataset_id>/info/<document_id>', methods=['PUT'])
  72. @token_required
  73. def update_doc(tenant_id, dataset_id, document_id):
  74. req = request.json
  75. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  76. return get_error_data_result(retmsg='You do not own the dataset.')
  77. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  78. if not doc:
  79. return get_error_data_result(retmsg='The dataset not own the document.')
  80. doc = doc[0]
  81. if "chunk_count" in req:
  82. if req["chunk_count"] != doc.chunk_num:
  83. return get_error_data_result(retmsg="Can't change chunk_count.")
  84. if "token_count" in req:
  85. if req["token_count"] != doc.token_num:
  86. return get_error_data_result(retmsg="Can't change token_count.")
  87. if "progress" in req:
  88. if req['progress'] != doc.progress:
  89. return get_error_data_result(retmsg="Can't change progress.")
  90. if "name" in req and req["name"] != doc.name:
  91. if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(doc.name.lower()).suffix:
  92. return get_result(retmsg="The extension of file can't be changed", retcode=RetCode.ARGUMENT_ERROR)
  93. for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
  94. if d.name == req["name"]:
  95. return get_error_data_result(
  96. retmsg="Duplicated document name in the same knowledgebase.")
  97. if not DocumentService.update_by_id(
  98. document_id, {"name": req["name"]}):
  99. return get_error_data_result(
  100. retmsg="Database error (Document rename)!")
  101. informs = File2DocumentService.get_by_document_id(document_id)
  102. if informs:
  103. e, file = FileService.get_by_id(informs[0].file_id)
  104. FileService.update_by_id(file.id, {"name": req["name"]})
  105. if "parser_method" in req:
  106. if doc.parser_id.lower() == req["parser_method"].lower():
  107. if "parser_config" in req:
  108. if req["parser_config"] == doc.parser_config:
  109. return get_result(retcode=RetCode.SUCCESS)
  110. else:
  111. return get_result(retcode=RetCode.SUCCESS)
  112. if doc.type == FileType.VISUAL or re.search(
  113. r"\.(ppt|pptx|pages)$", doc.name):
  114. return get_error_data_result(retmsg="Not supported yet!")
  115. e = DocumentService.update_by_id(doc.id,
  116. {"parser_id": req["parser_method"], "progress": 0, "progress_msg": "",
  117. "run": TaskStatus.UNSTART.value})
  118. if not e:
  119. return get_error_data_result(retmsg="Document not found!")
  120. if doc.token_num > 0:
  121. e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
  122. doc.process_duation * -1)
  123. if not e:
  124. return get_error_data_result(retmsg="Document not found!")
  125. tenant_id = DocumentService.get_tenant_id(req["id"])
  126. if not tenant_id:
  127. return get_error_data_result(retmsg="Tenant not found!")
  128. ELASTICSEARCH.deleteByQuery(
  129. Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
  130. if "parser_config" in req:
  131. DocumentService.update_parser_config(doc.id, req["parser_config"])
  132. return get_result()
  133. @manager.route('/dataset/<dataset_id>/document/<document_id>', methods=['GET'])
  134. @token_required
  135. def download(tenant_id, dataset_id, document_id):
  136. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  137. return get_error_data_result(retmsg=f'You do not own the dataset {dataset_id}.')
  138. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  139. if not doc:
  140. return get_error_data_result(retmsg=f'The dataset not own the document {document_id}.')
  141. # The process of downloading
  142. doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address
  143. file_stream = STORAGE_IMPL.get(doc_id, doc_location)
  144. if not file_stream:
  145. return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)
  146. file = BytesIO(file_stream)
  147. # Use send_file with a proper filename and MIME type
  148. return send_file(
  149. file,
  150. as_attachment=True,
  151. download_name=doc[0].name,
  152. mimetype='application/octet-stream' # Set a default MIME type
  153. )
  154. @manager.route('/dataset/<dataset_id>/info', methods=['GET'])
  155. @token_required
  156. def list_docs(dataset_id, tenant_id):
  157. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  158. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
  159. id = request.args.get("id")
  160. if not DocumentService.query(id=id,kb_id=dataset_id):
  161. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  162. offset = int(request.args.get("offset", 1))
  163. keywords = request.args.get("keywords","")
  164. limit = int(request.args.get("limit", 1024))
  165. orderby = request.args.get("orderby", "create_time")
  166. if request.args.get("desc") == "False":
  167. desc = False
  168. else:
  169. desc = True
  170. docs, tol = DocumentService.get_list(dataset_id, offset, limit, orderby, desc, keywords, id)
  171. # rename key's name
  172. renamed_doc_list = []
  173. for doc in docs:
  174. key_mapping = {
  175. "chunk_num": "chunk_count",
  176. "kb_id": "knowledgebase_id",
  177. "token_num": "token_count",
  178. "parser_id": "parser_method"
  179. }
  180. renamed_doc = {}
  181. for key, value in doc.items():
  182. new_key = key_mapping.get(key, key)
  183. renamed_doc[new_key] = value
  184. renamed_doc_list.append(renamed_doc)
  185. return get_result(data={"total": tol, "docs": renamed_doc_list})
  186. @manager.route('/dataset/<dataset_id>/document', methods=['DELETE'])
  187. @token_required
  188. def delete(tenant_id,dataset_id):
  189. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  190. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}. ")
  191. req = request.json
  192. if not req.get("ids"):
  193. return get_error_data_result(retmsg="ids is required")
  194. doc_ids = req["ids"]
  195. root_folder = FileService.get_root_folder(tenant_id)
  196. pf_id = root_folder["id"]
  197. FileService.init_knowledgebase_docs(pf_id, tenant_id)
  198. errors = ""
  199. for doc_id in doc_ids:
  200. try:
  201. e, doc = DocumentService.get_by_id(doc_id)
  202. if not e:
  203. return get_error_data_result(retmsg="Document not found!")
  204. tenant_id = DocumentService.get_tenant_id(doc_id)
  205. if not tenant_id:
  206. return get_error_data_result(retmsg="Tenant not found!")
  207. b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
  208. if not DocumentService.remove_document(doc, tenant_id):
  209. return get_error_data_result(
  210. retmsg="Database error (Document removal)!")
  211. f2d = File2DocumentService.get_by_document_id(doc_id)
  212. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
  213. File2DocumentService.delete_by_document_id(doc_id)
  214. STORAGE_IMPL.rm(b, n)
  215. except Exception as e:
  216. errors += str(e)
  217. if errors:
  218. return get_result(retmsg=errors, retcode=RetCode.SERVER_ERROR)
  219. return get_result()
  220. @manager.route('/dataset/<dataset_id>/chunk', methods=['POST'])
  221. @token_required
  222. def parse(tenant_id,dataset_id):
  223. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  224. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  225. req = request.json
  226. for id in req["document_ids"]:
  227. if not DocumentService.query(id=id,kb_id=dataset_id):
  228. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  229. info = {"run": "1", "progress": 0}
  230. info["progress_msg"] = ""
  231. info["chunk_num"] = 0
  232. info["token_num"] = 0
  233. DocumentService.update_by_id(id, info)
  234. # if str(req["run"]) == TaskStatus.CANCEL.value:
  235. ELASTICSEARCH.deleteByQuery(
  236. Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
  237. TaskService.filter_delete([Task.doc_id == id])
  238. e, doc = DocumentService.get_by_id(id)
  239. doc = doc.to_dict()
  240. doc["tenant_id"] = tenant_id
  241. bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
  242. queue_tasks(doc, bucket, name)
  243. return get_result()
  244. @manager.route('/dataset/<dataset_id>/chunk', methods=['DELETE'])
  245. @token_required
  246. def stop_parsing(tenant_id,dataset_id):
  247. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  248. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  249. req = request.json
  250. for id in req["document_ids"]:
  251. if not DocumentService.query(id=id,kb_id=dataset_id):
  252. return get_error_data_result(retmsg=f"You don't own the document {id}.")
  253. info = {"run": "2", "progress": 0}
  254. DocumentService.update_by_id(id, info)
  255. # if str(req["run"]) == TaskStatus.CANCEL.value:
  256. tenant_id = DocumentService.get_tenant_id(id)
  257. ELASTICSEARCH.deleteByQuery(
  258. Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
  259. return get_result()
  260. @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['GET'])
  261. @token_required
  262. def list_chunk(tenant_id,dataset_id,document_id):
  263. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  264. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  265. doc=DocumentService.query(id=document_id, kb_id=dataset_id)
  266. if not doc:
  267. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  268. doc=doc[0]
  269. req = request.args
  270. doc_id = document_id
  271. page = int(req.get("offset", 1))
  272. size = int(req.get("limit", 30))
  273. question = req.get("keywords", "")
  274. try:
  275. query = {
  276. "doc_ids": [doc_id], "page": page, "size": size, "question": question, "sort": True
  277. }
  278. if "available_int" in req:
  279. query["available_int"] = int(req["available_int"])
  280. sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
  281. res = {"total": sres.total, "chunks": [], "doc": doc.to_dict()}
  282. origin_chunks = []
  283. for id in sres.ids:
  284. d = {
  285. "chunk_id": id,
  286. "content_with_weight": rmSpace(sres.highlight[id]) if question and id in sres.highlight else sres.field[
  287. id].get(
  288. "content_with_weight", ""),
  289. "doc_id": sres.field[id]["doc_id"],
  290. "docnm_kwd": sres.field[id]["docnm_kwd"],
  291. "important_kwd": sres.field[id].get("important_kwd", []),
  292. "img_id": sres.field[id].get("img_id", ""),
  293. "available_int": sres.field[id].get("available_int", 1),
  294. "positions": sres.field[id].get("position_int", "").split("\t")
  295. }
  296. if len(d["positions"]) % 5 == 0:
  297. poss = []
  298. for i in range(0, len(d["positions"]), 5):
  299. poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
  300. float(d["positions"][i + 3]), float(d["positions"][i + 4])])
  301. d["positions"] = poss
  302. origin_chunks.append(d)
  303. ##rename keys
  304. for chunk in origin_chunks:
  305. key_mapping = {
  306. "chunk_id": "id",
  307. "content_with_weight": "content",
  308. "doc_id": "document_id",
  309. "important_kwd": "important_keywords",
  310. "img_id": "image_id",
  311. }
  312. renamed_chunk = {}
  313. for key, value in chunk.items():
  314. new_key = key_mapping.get(key, key)
  315. renamed_chunk[new_key] = value
  316. res["chunks"].append(renamed_chunk)
  317. return get_result(data=res)
  318. except Exception as e:
  319. if str(e).find("not_found") > 0:
  320. return get_result(retmsg=f'No chunk found!',
  321. retcode=RetCode.DATA_ERROR)
  322. return server_error_response(e)
  323. @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk', methods=['POST'])
  324. @token_required
  325. def create(tenant_id,dataset_id,document_id):
  326. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  327. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  328. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  329. if not doc:
  330. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  331. doc = doc[0]
  332. req = request.json
  333. if not req.get("content"):
  334. return get_error_data_result(retmsg="`content` is required")
  335. md5 = hashlib.md5()
  336. md5.update((req["content"] + document_id).encode("utf-8"))
  337. chunk_id = md5.hexdigest()
  338. d = {"id": chunk_id, "content_ltks": rag_tokenizer.tokenize(req["content"]),
  339. "content_with_weight": req["content"]}
  340. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  341. d["important_kwd"] = req.get("important_kwd", [])
  342. d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
  343. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  344. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  345. d["kb_id"] = [doc.kb_id]
  346. d["docnm_kwd"] = doc.name
  347. d["doc_id"] = doc.id
  348. embd_id = DocumentService.get_embd_id(document_id)
  349. embd_mdl = TenantLLMService.model_instance(
  350. tenant_id, LLMType.EMBEDDING.value, embd_id)
  351. v, c = embd_mdl.encode([doc.name, req["content"]])
  352. v = 0.1 * v[0] + 0.9 * v[1]
  353. d["q_%d_vec" % len(v)] = v.tolist()
  354. ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
  355. DocumentService.increment_chunk_num(
  356. doc.id, doc.kb_id, c, 1, 0)
  357. d["chunk_id"] = chunk_id
  358. # rename keys
  359. key_mapping = {
  360. "chunk_id": "id",
  361. "content_with_weight": "content",
  362. "doc_id": "document_id",
  363. "important_kwd": "important_keywords",
  364. "kb_id": "dataset_id",
  365. "create_timestamp_flt": "create_timestamp",
  366. "create_time": "create_time",
  367. "document_keyword": "document",
  368. }
  369. renamed_chunk = {}
  370. for key, value in d.items():
  371. if key in key_mapping:
  372. new_key = key_mapping.get(key, key)
  373. renamed_chunk[new_key] = value
  374. return get_result(data={"chunk": renamed_chunk})
  375. # return get_result(data={"chunk_id": chunk_id})
  376. @manager.route('dataset/<dataset_id>/document/<document_id>/chunk', methods=['DELETE'])
  377. @token_required
  378. def rm_chunk(tenant_id,dataset_id,document_id):
  379. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  380. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  381. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  382. if not doc:
  383. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  384. doc = doc[0]
  385. req = request.json
  386. if not req.get("chunk_ids"):
  387. return get_error_data_result("`chunk_ids` is required")
  388. for chunk_id in req.get("chunk_ids"):
  389. res = ELASTICSEARCH.get(
  390. chunk_id, search.index_name(
  391. tenant_id))
  392. if not res.get("found"):
  393. return server_error_response(f"Chunk {chunk_id} not found")
  394. if not ELASTICSEARCH.deleteByQuery(
  395. Q("ids", values=req["chunk_ids"]), search.index_name(tenant_id)):
  396. return get_error_data_result(retmsg="Index updating failure")
  397. deleted_chunk_ids = req["chunk_ids"]
  398. chunk_number = len(deleted_chunk_ids)
  399. DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
  400. return get_result()
  401. @manager.route('/dataset/<dataset_id>/document/<document_id>/chunk/<chunk_id>', methods=['PUT'])
  402. @token_required
  403. def set(tenant_id,dataset_id,document_id,chunk_id):
  404. res = ELASTICSEARCH.get(
  405. chunk_id, search.index_name(
  406. tenant_id))
  407. if not res.get("found"):
  408. return get_error_data_result(f"Chunk {chunk_id} not found")
  409. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  410. return get_error_data_result(retmsg=f"You don't own the dataset {dataset_id}.")
  411. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  412. if not doc:
  413. return get_error_data_result(retmsg=f"You don't own the document {document_id}.")
  414. req = request.json
  415. d = {
  416. "id": chunk_id,
  417. "content_with_weight": req.get("content",res.get["content_with_weight"])}
  418. d["content_ltks"] = rag_tokenizer.tokenize(req["content"])
  419. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  420. d["important_kwd"] = req.get("important_keywords",[])
  421. d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
  422. if "available" in req:
  423. d["available_int"] = req["available"]
  424. embd_id = DocumentService.get_embd_id(document_id)
  425. embd_mdl = TenantLLMService.model_instance(
  426. tenant_id, LLMType.EMBEDDING.value, embd_id)
  427. if doc.parser_id == ParserType.QA:
  428. arr = [
  429. t for t in re.split(
  430. r"[\n\t]",
  431. req["content"]) if len(t) > 1]
  432. if len(arr) != 2:
  433. return get_error_data_result(
  434. retmsg="Q&A must be separated by TAB/ENTER key.")
  435. q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
  436. d = beAdoc(d, arr[0], arr[1], not any(
  437. [rag_tokenizer.is_chinese(t) for t in q + a]))
  438. v, c = embd_mdl.encode([doc.name, req["content"]])
  439. v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
  440. d["q_%d_vec" % len(v)] = v.tolist()
  441. ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
  442. return get_result()
  443. @manager.route('/retrieval', methods=['GET'])
  444. @token_required
  445. def retrieval_test(tenant_id):
  446. req = request.args
  447. req_json = request.json
  448. if not req_json.get("datasets"):
  449. return get_error_data_result("`datasets` is required.")
  450. for id in req_json.get("datasets"):
  451. if not KnowledgebaseService.query(id=id,tenant_id=tenant_id):
  452. return get_error_data_result(f"You don't own the dataset {id}.")
  453. if "question" not in req_json:
  454. return get_error_data_result("`question` is required.")
  455. page = int(req.get("offset", 1))
  456. size = int(req.get("limit", 30))
  457. question = req_json["question"]
  458. kb_id = req_json["datasets"]
  459. if isinstance(kb_id, str): kb_id = [kb_id]
  460. doc_ids = req_json.get("documents", [])
  461. similarity_threshold = float(req.get("similarity_threshold", 0.0))
  462. vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
  463. top = int(req.get("top_k", 1024))
  464. if req.get("highlight")=="False" or req.get("highlight")=="false":
  465. highlight = False
  466. else:
  467. highlight = True
  468. try:
  469. e, kb = KnowledgebaseService.get_by_id(kb_id[0])
  470. if not e:
  471. return get_error_data_result(retmsg="Knowledgebase not found!")
  472. embd_mdl = TenantLLMService.model_instance(
  473. kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
  474. rerank_mdl = None
  475. if req.get("rerank_id"):
  476. rerank_mdl = TenantLLMService.model_instance(
  477. kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])
  478. if req.get("keyword", False):
  479. chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
  480. question += keyword_extraction(chat_mdl, question)
  481. retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
  482. ranks = retr.retrieval(question, embd_mdl, kb.tenant_id, kb_id, page, size,
  483. similarity_threshold, vector_similarity_weight, top,
  484. doc_ids, rerank_mdl=rerank_mdl, highlight=highlight)
  485. for c in ranks["chunks"]:
  486. if "vector" in c:
  487. del c["vector"]
  488. ##rename keys
  489. renamed_chunks = []
  490. for chunk in ranks["chunks"]:
  491. key_mapping = {
  492. "chunk_id": "id",
  493. "content_with_weight": "content",
  494. "doc_id": "document_id",
  495. "important_kwd": "important_keywords",
  496. "docnm_kwd": "document_keyword"
  497. }
  498. rename_chunk = {}
  499. for key, value in chunk.items():
  500. new_key = key_mapping.get(key, key)
  501. rename_chunk[new_key] = value
  502. renamed_chunks.append(rename_chunk)
  503. ranks["chunks"] = renamed_chunks
  504. return get_result(data=ranks)
  505. except Exception as e:
  506. if str(e).find("not_found") > 0:
  507. return get_result(retmsg=f'No chunk found! Check the chunk statu s please!',
  508. retcode=RetCode.DATA_ERROR)
  509. return server_error_response(e)