#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pathlib
import datetime

from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import rag_tokenizer
from api.db import LLMType, ParserType
from api.db.services.llm_service import TenantLLMService
from api.settings import kg_retrievaler
import hashlib
import re
from api.utils.api_utils import token_required
from api.db.db_models import Task
from api.db.services.task_service import TaskService, queue_tasks
from api.utils.api_utils import server_error_response
from api.utils.api_utils import get_result, get_error_data_result
from io import BytesIO
from elasticsearch_dsl import Q
from flask import request, send_file
from api.db import FileSource, TaskStatus, FileType
from api.db.db_models import File
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import RetCode, retrievaler
from api.utils.api_utils import construct_json_result, get_parser_config
from rag.nlp import search
from rag.utils import rmSpace
from rag.utils.es_conn import ELASTICSEARCH
from rag.utils.storage_factory import STORAGE_IMPL
import os

MAXIMUM_OF_UPLOADING_FILES = 256

@manager.route("/datasets/<dataset_id>/documents", methods=["POST"])
@token_required
def upload(dataset_id, tenant_id):
    """
    Upload documents to a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: formData
        name: file
        type: file
        required: true
        description: Document files to upload.
    responses:
      200:
        description: Successfully uploaded documents.
        schema:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if "file" not in request.files:
        return get_error_data_result(
            message="No file part!", code=RetCode.ARGUMENT_ERROR
        )
    file_objs = request.files.getlist("file")
    for file_obj in file_objs:
        if file_obj.filename == "":
            return get_result(
                message="No file selected!", code=RetCode.ARGUMENT_ERROR
            )
    # total size
    total_size = 0
    for file_obj in file_objs:
        file_obj.seek(0, os.SEEK_END)
        total_size += file_obj.tell()
        file_obj.seek(0)
    MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
    if total_size > MAX_TOTAL_FILE_SIZE:
        return get_result(
            message=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
            code=RetCode.ARGUMENT_ERROR,
        )
    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
    err, files = FileService.upload_document(kb, file_objs, tenant_id)
    if err:
        return get_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
    # rename key's name
    renamed_doc_list = []
    for file in files:
        doc = file[0]
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
        renamed_doc["run"] = "UNSTART"
        renamed_doc_list.append(renamed_doc)
    return get_result(data=renamed_doc_list)
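
# Usage sketch (illustrative comment only, not executed by this module): uploading
# files to this endpoint with the `requests` library. The host, port, and "/api/v1"
# prefix are assumptions about how the `manager` blueprint is mounted; replace
# <API_KEY> and <dataset_id> with real values.
#
#   import requests
#   resp = requests.post(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       files=[("file", open("example.pdf", "rb"))],
#   )
#   print(resp.json())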

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"])
@token_required
def update_doc(tenant_id, dataset_id, document_id):
    """
    Update a document within a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to update.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Document update parameters.
        required: true
        schema:
          type: object
          properties:
            name:
              type: string
              description: New name of the document.
            parser_config:
              type: object
              description: Parser configuration.
            chunk_method:
              type: string
              description: Chunking method.
    responses:
      200:
        description: Document updated successfully.
        schema:
          type: object
    """
    req = request.json
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message="You don't own the dataset.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(message="The dataset doesn't own the document.")
    doc = doc[0]
    if "chunk_count" in req:
        if req["chunk_count"] != doc.chunk_num:
            return get_error_data_result(message="Can't change `chunk_count`.")
    if "token_count" in req:
        if req["token_count"] != doc.token_num:
            return get_error_data_result(message="Can't change `token_count`.")
    if "progress" in req:
        if req["progress"] != doc.progress:
            return get_error_data_result(message="Can't change `progress`.")
    if "name" in req and req["name"] != doc.name:
        if (
            pathlib.Path(req["name"].lower()).suffix
            != pathlib.Path(doc.name.lower()).suffix
        ):
            return get_result(
                message="The extension of file can't be changed",
                code=RetCode.ARGUMENT_ERROR,
            )
        for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
            if d.name == req["name"]:
                return get_error_data_result(
                    message="Duplicated document name in the same dataset."
                )
        if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
            return get_error_data_result(message="Database error (Document rename)!")
        informs = File2DocumentService.get_by_document_id(document_id)
        if informs:
            e, file = FileService.get_by_id(informs[0].file_id)
            FileService.update_by_id(file.id, {"name": req["name"]})
    if "parser_config" in req:
        DocumentService.update_parser_config(doc.id, req["parser_config"])
    if "chunk_method" in req:
        valid_chunk_method = {
            "naive",
            "manual",
            "qa",
            "table",
            "paper",
            "book",
            "laws",
            "presentation",
            "picture",
            "one",
            "knowledge_graph",
            "email",
        }
        if req.get("chunk_method") not in valid_chunk_method:
            return get_error_data_result(
                f"`chunk_method` {req['chunk_method']} doesn't exist"
            )
        if doc.parser_id.lower() == req["chunk_method"].lower():
            return get_result()
        if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
            return get_error_data_result(message="Not supported yet!")
        e = DocumentService.update_by_id(
            doc.id,
            {
                "parser_id": req["chunk_method"],
                "progress": 0,
                "progress_msg": "",
                "run": TaskStatus.UNSTART.value,
            },
        )
        if not e:
            return get_error_data_result(message="Document not found!")
        req["parser_config"] = get_parser_config(
            req["chunk_method"], req.get("parser_config")
        )
        DocumentService.update_parser_config(doc.id, req["parser_config"])
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(
                doc.id,
                doc.kb_id,
                doc.token_num * -1,
                doc.chunk_num * -1,
                doc.process_duation * -1,
            )
            if not e:
                return get_error_data_result(message="Document not found!")
            ELASTICSEARCH.deleteByQuery(
                Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id)
            )
    return get_result()
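
# Usage sketch (same base-URL/API-key assumptions as above): renaming a document and
# switching its chunking method. Only fields present in the JSON body are updated;
# `chunk_count`, `token_count`, and `progress` are read-only and rejected if changed.
#
#   requests.put(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={"name": "renamed.pdf", "chunk_method": "naive"},
#   )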

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])
@token_required
def download(tenant_id, dataset_id, document_id):
    """
    Download a document from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    produces:
      - application/octet-stream
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to download.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Document file stream.
        schema:
          type: file
      400:
        description: Error message.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(
            message=f"The dataset doesn't own the document {document_id}."
        )
    # The process of downloading
    doc_id, doc_location = File2DocumentService.get_storage_address(
        doc_id=document_id
    )  # minio address
    file_stream = STORAGE_IMPL.get(doc_id, doc_location)
    if not file_stream:
        return construct_json_result(
            message="This file is empty.", code=RetCode.DATA_ERROR
        )
    file = BytesIO(file_stream)
    # Use send_file with a proper filename and MIME type
    return send_file(
        file,
        as_attachment=True,
        download_name=doc[0].name,
        mimetype="application/octet-stream",  # Set a default MIME type
    )
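
# Usage sketch (assumptions as above): the response body is the raw file stream, so
# write it to disk instead of decoding it as JSON.
#
#   resp = requests.get(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>",
#       headers={"Authorization": "Bearer <API_KEY>"},
#   )
#   with open("downloaded_file", "wb") as f:
#       f.write(resp.content)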

@manager.route("/datasets/<dataset_id>/documents", methods=["GET"])
@token_required
def list_docs(dataset_id, tenant_id):
    """
    List documents in a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: query
        name: id
        type: string
        required: false
        description: Filter by document ID.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: query
        name: orderby
        type: string
        required: false
        default: "create_time"
        description: Field to order by.
      - in: query
        name: desc
        type: boolean
        required: false
        default: true
        description: Order in descending.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of documents.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of documents.
            docs:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
    id = request.args.get("id")
    name = request.args.get("name")
    if not DocumentService.query(id=id, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {id}.")
    if not DocumentService.query(name=name, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {name}.")
    page = int(request.args.get("page", 1))
    keywords = request.args.get("keywords", "")
    page_size = int(request.args.get("page_size", 30))
    orderby = request.args.get("orderby", "create_time")
    if request.args.get("desc") == "False":
        desc = False
    else:
        desc = True
    docs, tol = DocumentService.get_list(
        dataset_id, page, page_size, orderby, desc, keywords, id, name
    )
    # rename key's name
    renamed_doc_list = []
    for doc in docs:
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        run_mapping = {
            "0": "UNSTART",
            "1": "RUNNING",
            "2": "CANCEL",
            "3": "DONE",
            "4": "FAIL",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
            if key == "run":
                renamed_doc["run"] = run_mapping.get(str(value))
        renamed_doc_list.append(renamed_doc)
    return get_result(data={"total": tol, "docs": renamed_doc_list})
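
# Usage sketch (assumptions as above): paging through documents, newest first.
# `keywords`, `id`, and `name` are optional filters handled by DocumentService.get_list.
#
#   resp = requests.get(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       params={"page": 1, "page_size": 30, "orderby": "create_time", "desc": "True"},
#   )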

@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])
@token_required
def delete(tenant_id, dataset_id):
    """
    Delete documents from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Document deletion parameters.
        required: true
        schema:
          type: object
          properties:
            ids:
              type: array
              items:
                type: string
              description: List of document IDs to delete.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Documents deleted successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
    req = request.json
    if not req:
        doc_ids = None
    else:
        doc_ids = req.get("ids")
    if not doc_ids:
        doc_list = []
        docs = DocumentService.query(kb_id=dataset_id)
        for doc in docs:
            doc_list.append(doc.id)
    else:
        doc_list = doc_ids
    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)
    errors = ""
    for doc_id in doc_list:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_error_data_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                return get_error_data_result(message="Tenant not found!")
            b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
            if not DocumentService.remove_document(doc, tenant_id):
                return get_error_data_result(
                    message="Database error (Document removal)!"
                )
            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete(
                [
                    File.source_type == FileSource.KNOWLEDGEBASE,
                    File.id == f2d[0].file_id,
                ]
            )
            File2DocumentService.delete_by_document_id(doc_id)
            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            errors += str(e)
    if errors:
        return get_result(message=errors, code=RetCode.SERVER_ERROR)
    return get_result()
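
# Usage sketch (assumptions as above): deleting specific documents by ID. Omitting
# "ids" (or sending an empty body) deletes every document in the dataset.
#
#   requests.delete(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={"ids": ["<document_id_1>", "<document_id_2>"]},
#   )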

@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])
@token_required
def parse(tenant_id, dataset_id):
    """
    Start parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to parse.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing started successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if doc[0].progress != 0.0:
            return get_error_data_result(
                "Can't parse a document whose progress is not 0."
            )
        info = {"run": "1", "progress": 0}
        info["progress_msg"] = ""
        info["chunk_num"] = 0
        info["token_num"] = 0
        DocumentService.update_by_id(id, info)
        ELASTICSEARCH.deleteByQuery(
            Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
        )
        TaskService.filter_delete([Task.doc_id == id])
        e, doc = DocumentService.get_by_id(id)
        doc = doc.to_dict()
        doc["tenant_id"] = tenant_id
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
        queue_tasks(doc, bucket, name)
    return get_result()
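
# Usage sketch (assumptions as above): queueing documents for parsing. Each document
# is reset (progress, chunk and token counts), its old chunks are removed from the
# index, and a new parsing task is queued via queue_tasks().
#
#   requests.post(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/chunks",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={"document_ids": ["<document_id>"]},
#   )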

@manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])
@token_required
def stop_parsing(tenant_id, dataset_id):
    """
    Stop parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Stop parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to stop parsing.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing stopped successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
            return get_error_data_result(
                "Can't stop parsing document with progress at 0 or 1"
            )
        info = {"run": "2", "progress": 0, "chunk_num": 0}
        DocumentService.update_by_id(id, info)
        ELASTICSEARCH.deleteByQuery(
            Q("match", doc_id=id), idxnm=search.index_name(tenant_id)
        )
    return get_result()

@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])
@token_required
def list_chunks(tenant_id, dataset_id, document_id):
    """
    List chunks of a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of chunks.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of chunks.
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  important_keywords:
                    type: array
                    items:
                      type: string
                    description: Important keywords.
                  image_id:
                    type: string
                    description: Image ID associated with the chunk.
            doc:
              type: object
              description: Document details.
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.args
    doc_id = document_id
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req.get("keywords", "")
    query = {
        "doc_ids": [doc_id],
        "page": page,
        "size": size,
        "question": question,
        "sort": True,
    }
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    key_mapping = {
        "chunk_num": "chunk_count",
        "kb_id": "dataset_id",
        "token_num": "token_count",
        "parser_id": "chunk_method",
    }
    run_mapping = {
        "0": "UNSTART",
        "1": "RUNNING",
        "2": "CANCEL",
        "3": "DONE",
        "4": "FAIL",
    }
    doc = doc.to_dict()
    renamed_doc = {}
    for key, value in doc.items():
        new_key = key_mapping.get(key, key)
        renamed_doc[new_key] = value
        if key == "run":
            renamed_doc["run"] = run_mapping.get(str(value))
    res = {"total": sres.total, "chunks": [], "doc": renamed_doc}
    origin_chunks = []
    sign = 0
    for id in sres.ids:
        d = {
            "chunk_id": id,
            "content_with_weight": (
                rmSpace(sres.highlight[id])
                if question and id in sres.highlight
                else sres.field[id].get("content_with_weight", "")
            ),
            "doc_id": sres.field[id]["doc_id"],
            "docnm_kwd": sres.field[id]["docnm_kwd"],
            "important_kwd": sres.field[id].get("important_kwd", []),
            "img_id": sres.field[id].get("img_id", ""),
            "available_int": sres.field[id].get("available_int", 1),
            "positions": sres.field[id].get("position_int", "").split("\t"),
        }
        if len(d["positions"]) % 5 == 0:
            poss = []
            for i in range(0, len(d["positions"]), 5):
                poss.append(
                    [
                        float(d["positions"][i]),
                        float(d["positions"][i + 1]),
                        float(d["positions"][i + 2]),
                        float(d["positions"][i + 3]),
                        float(d["positions"][i + 4]),
                    ]
                )
            d["positions"] = poss
        origin_chunks.append(d)
        if req.get("id"):
            if req.get("id") == id:
                origin_chunks.clear()
                origin_chunks.append(d)
                sign = 1
                break
    if req.get("id"):
        if sign == 0:
            return get_error_data_result(f"Can't find this chunk {req.get('id')}")
    for chunk in origin_chunks:
        key_mapping = {
            "chunk_id": "id",
            "content_with_weight": "content",
            "doc_id": "document_id",
            "important_kwd": "important_keywords",
            "img_id": "image_id",
            "available_int": "available",
        }
        renamed_chunk = {}
        for key, value in chunk.items():
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
        if renamed_chunk["available"] == 0:
            renamed_chunk["available"] = False
        if renamed_chunk["available"] == 1:
            renamed_chunk["available"] = True
        res["chunks"].append(renamed_chunk)
    return get_result(data=res)
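
# Usage sketch (assumptions as above): listing chunks of a parsed document. Passing
# "keywords" turns the listing into a highlighted full-text search over that document;
# passing "id" returns only that single chunk.
#
#   resp = requests.get(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       params={"page": 1, "page_size": 30},
#   )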

@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
)
@token_required
def add_chunk(tenant_id, dataset_id, document_id):
    """
    Add a chunk to a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk data.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              required: true
              description: Content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Important keywords.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk added successfully.
        schema:
          type: object
          properties:
            chunk:
              type: object
              properties:
                id:
                  type: string
                  description: Chunk ID.
                content:
                  type: string
                  description: Chunk content.
                document_id:
                  type: string
                  description: ID of the document.
                important_keywords:
                  type: array
                  items:
                    type: string
                  description: Important keywords.
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if not req.get("content"):
        return get_error_data_result(message="`content` is required")
    if "important_keywords" in req:
        if type(req["important_keywords"]) != list:
            return get_error_data_result(
                "`important_keywords` is required to be a list"
            )
    md5 = hashlib.md5()
    md5.update((req["content"] + document_id).encode("utf-8"))
    chunk_id = md5.hexdigest()
    d = {
        "id": chunk_id,
        "content_ltks": rag_tokenizer.tokenize(req["content"]),
        "content_with_weight": req["content"],
    }
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["important_kwd"] = req.get("important_keywords", [])
    d["important_tks"] = rag_tokenizer.tokenize(
        " ".join(req.get("important_keywords", []))
    )
    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
    d["kb_id"] = [doc.kb_id]
    d["docnm_kwd"] = doc.name
    d["doc_id"] = doc.id
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    v, c = embd_mdl.encode([doc.name, req["content"]])
    v = 0.1 * v[0] + 0.9 * v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
    DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
    d["chunk_id"] = chunk_id
    d["kb_id"] = doc.kb_id
    # rename keys
    key_mapping = {
        "chunk_id": "id",
        "content_with_weight": "content",
        "doc_id": "document_id",
        "important_kwd": "important_keywords",
        "kb_id": "dataset_id",
        "create_timestamp_flt": "create_timestamp",
        "create_time": "create_time",
        "document_keyword": "document",
    }
    renamed_chunk = {}
    for key, value in d.items():
        if key in key_mapping:
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
    return get_result(data={"chunk": renamed_chunk})
    # return get_result(data={"chunk_id": chunk_id})
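
# Note on the stored vector above: the chunk embedding is a weighted blend of the
# document-title vector and the content vector (0.1 * title + 0.9 * content), so
# retrieval is driven mostly by the chunk text while keeping a hint of its document.
#
# Usage sketch (assumptions as above): adding a chunk manually.
#
#   requests.post(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={"content": "RAGFlow is a RAG engine.", "important_keywords": ["RAGFlow"]},
#   )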

@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
)
@token_required
def rm_chunk(tenant_id, dataset_id, document_id):
    """
    Remove chunks from a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk removal parameters.
        required: true
        schema:
          type: object
          properties:
            chunk_ids:
              type: array
              items:
                type: string
              description: List of chunk IDs to remove.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunks removed successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if not req.get("chunk_ids"):
        return get_error_data_result("`chunk_ids` is required")
    query = {"doc_ids": [doc.id], "page": 1, "size": 1024, "question": "", "sort": True}
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    if not req:
        chunk_ids = None
    else:
        chunk_ids = req.get("chunk_ids")
    if not chunk_ids:
        chunk_list = sres.ids
    else:
        chunk_list = chunk_ids
    for chunk_id in chunk_list:
        if chunk_id not in sres.ids:
            return get_error_data_result(f"Chunk {chunk_id} not found")
    if not ELASTICSEARCH.deleteByQuery(
        Q("ids", values=chunk_list), search.index_name(tenant_id)
    ):
        return get_error_data_result(message="Index updating failure")
    deleted_chunk_ids = chunk_list
    chunk_number = len(deleted_chunk_ids)
    DocumentService.decrement_chunk_num(doc.id, doc.kb_id, 1, chunk_number, 0)
    return get_result()

@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
)
@token_required
def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
    """
    Update a chunk within a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: path
        name: chunk_id
        type: string
        required: true
        description: ID of the chunk to update.
      - in: body
        name: body
        description: Chunk update parameters.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              description: Updated content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Updated important keywords.
            available:
              type: boolean
              description: Availability status of the chunk.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk updated successfully.
        schema:
          type: object
    """
    try:
        res = ELASTICSEARCH.get(chunk_id, search.index_name(tenant_id))
    except Exception:
        return get_error_data_result(f"Can't find this chunk {chunk_id}")
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    query = {
        "doc_ids": [document_id],
        "page": 1,
        "size": 1024,
        "question": "",
        "sort": True,
    }
    sres = retrievaler.search(query, search.index_name(tenant_id), highlight=True)
    if chunk_id not in sres.ids:
        return get_error_data_result(f"You don't own the chunk {chunk_id}")
    req = request.json
    content = res["_source"].get("content_with_weight")
    d = {"id": chunk_id, "content_with_weight": req.get("content", content)}
    d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if "important_keywords" in req:
        if not isinstance(req["important_keywords"], list):
            return get_error_data_result("`important_keywords` should be a list")
        d["important_kwd"] = req.get("important_keywords")
        d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
    if "available" in req:
        d["available_int"] = int(req["available"])
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    if doc.parser_id == ParserType.QA:
        arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_error_data_result(
                message="Q&A must be separated by TAB/ENTER key."
            )
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(
            d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
        )
    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    ELASTICSEARCH.upsert([d], search.index_name(tenant_id))
    return get_result()
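
# Usage sketch (assumptions as above): editing a chunk's content and toggling its
# availability. For documents parsed with the "qa" method, the new content must contain
# a question and an answer separated by a TAB or newline.
#
#   requests.put(
#       "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={"content": "Q: What is RAGFlow?\tA: A RAG engine.", "available": True},
#   )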

@manager.route("/retrieval", methods=["POST"])
@token_required
def retrieval_test(tenant_id):
    """
    Retrieve chunks based on a query.
    ---
    tags:
      - Retrieval
    security:
      - ApiKeyAuth: []
    parameters:
      - in: body
        name: body
        description: Retrieval parameters.
        required: true
        schema:
          type: object
          properties:
            dataset_ids:
              type: array
              items:
                type: string
              required: true
              description: List of dataset IDs to search in.
            question:
              type: string
              required: true
              description: Query string.
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to filter.
            similarity_threshold:
              type: number
              format: float
              description: Similarity threshold.
            vector_similarity_weight:
              type: number
              format: float
              description: Vector similarity weight.
            top_k:
              type: integer
              description: Maximum number of chunks to return.
            highlight:
              type: boolean
              description: Whether to highlight matched content.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Retrieval results.
        schema:
          type: object
          properties:
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  similarity:
                    type: number
                    format: float
                    description: Similarity score.
    """
    req = request.json
    if not req.get("dataset_ids"):
        return get_error_data_result("`dataset_ids` is required.")
    kb_ids = req["dataset_ids"]
    if not isinstance(kb_ids, list):
        return get_error_data_result("`dataset_ids` should be a list")
    kbs = KnowledgebaseService.get_by_ids(kb_ids)
    for id in kb_ids:
        if not KnowledgebaseService.query(id=id, tenant_id=tenant_id):
            return get_error_data_result(f"You don't own the dataset {id}.")
    embd_nms = list(set([kb.embd_id for kb in kbs]))
    if len(embd_nms) != 1:
        return get_result(
            message="Datasets use different embedding models.",
            code=RetCode.AUTHENTICATION_ERROR,
        )
    if "question" not in req:
        return get_error_data_result("`question` is required.")
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req["question"]
    doc_ids = req.get("document_ids", [])
    if not isinstance(doc_ids, list):
        return get_error_data_result("`document_ids` should be a list")
    doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
    for doc_id in doc_ids:
        if doc_id not in doc_ids_list:
            return get_error_data_result(
                f"The datasets don't own the document {doc_id}"
            )
    similarity_threshold = float(req.get("similarity_threshold", 0.2))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = int(req.get("top_k", 1024))
    if req.get("highlight") == "False" or req.get("highlight") == "false":
        highlight = False
    else:
        highlight = True
    try:
        e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
        if not e:
            return get_error_data_result(message="Dataset not found!")
        embd_mdl = TenantLLMService.model_instance(
            kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
        )
        rerank_mdl = None
        if req.get("rerank_id"):
            rerank_mdl = TenantLLMService.model_instance(
                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
            )
        if req.get("keyword", False):
            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
            question += keyword_extraction(chat_mdl, question)
        retr = retrievaler if kb.parser_id != ParserType.KG else kg_retrievaler
        ranks = retr.retrieval(
            question,
            embd_mdl,
            kb.tenant_id,
            kb_ids,
            page,
            size,
            similarity_threshold,
            vector_similarity_weight,
            top,
            doc_ids,
            rerank_mdl=rerank_mdl,
            highlight=highlight,
        )
        for c in ranks["chunks"]:
            if "vector" in c:
                del c["vector"]
        # rename keys
        renamed_chunks = []
        for chunk in ranks["chunks"]:
            key_mapping = {
                "chunk_id": "id",
                "content_with_weight": "content",
                "doc_id": "document_id",
                "important_kwd": "important_keywords",
                "docnm_kwd": "document_keyword",
            }
            rename_chunk = {}
            for key, value in chunk.items():
                new_key = key_mapping.get(key, key)
                rename_chunk[new_key] = value
            renamed_chunks.append(rename_chunk)
        ranks["chunks"] = renamed_chunks
        return get_result(data=ranks)
    except Exception as e:
        if str(e).find("not_found") > 0:
            return get_result(
                message="No chunk found! Check the chunk status please!",
                code=RetCode.DATA_ERROR,
            )
        return server_error_response(e)
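
# Usage sketch (assumptions as above): cross-dataset retrieval. All listed datasets
# must share one embedding model; "rerank_id" and "keyword" optionally enable a rerank
# model and LLM-based keyword expansion of the question.
#
#   resp = requests.post(
#       "http://localhost:9380/api/v1/retrieval",
#       headers={"Authorization": "Bearer <API_KEY>"},
#       json={
#           "dataset_ids": ["<dataset_id>"],
#           "question": "What is RAGFlow?",
#           "similarity_threshold": 0.2,
#           "vector_similarity_weight": 0.3,
#           "top_k": 1024,
#       },
#   )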