
doc.py 46KB

  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import pathlib
  17. import datetime
  18. from api.db.services.dialog_service import keyword_extraction, label_question
  19. from rag.app.qa import rmPrefix, beAdoc
  20. from rag.nlp import rag_tokenizer
  21. from api.db import LLMType, ParserType
  22. from api.db.services.llm_service import TenantLLMService, LLMBundle
  23. from api import settings
  24. import xxhash
  25. import re
  26. from api.utils.api_utils import token_required
  27. from api.db.db_models import Task
  28. from api.db.services.task_service import TaskService, queue_tasks
  29. from api.utils.api_utils import server_error_response
  30. from api.utils.api_utils import get_result, get_error_data_result
  31. from io import BytesIO
  32. from flask import request, send_file
  33. from api.db import FileSource, TaskStatus, FileType
  34. from api.db.db_models import File
  35. from api.db.services.document_service import DocumentService
  36. from api.db.services.file2document_service import File2DocumentService
  37. from api.db.services.file_service import FileService
  38. from api.db.services.knowledgebase_service import KnowledgebaseService
  39. from api.utils.api_utils import construct_json_result, get_parser_config
  40. from rag.nlp import search
  41. from rag.utils import rmSpace
  42. from rag.utils.storage_factory import STORAGE_IMPL
  43. from pydantic import BaseModel, Field, validator
  44. MAXIMUM_OF_UPLOADING_FILES = 256
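# Pydantic model used by the chunk endpoints below (list_chunks, add_chunk) to
# validate chunk payloads before they are returned to the caller.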
  45. class Chunk(BaseModel):
  46. id: str = ""
  47. content: str = ""
  48. document_id: str = ""
  49. docnm_kwd: str = ""
  50. important_keywords: list = Field(default_factory=list)
  51. questions: list = Field(default_factory=list)
  52. question_tks: str = ""
  53. image_id: str = ""
  54. available: bool = True
  55. positions: list[list[int]] = Field(default_factory=list)
  56. @validator('positions')
  57. def validate_positions(cls, value):
  58. for sublist in value:
  59. if len(sublist) != 5:
  60. raise ValueError("Each sublist in positions must have a length of 5")
  61. return value
  62. @manager.route("/datasets/<dataset_id>/documents", methods=["POST"]) # noqa: F821
  63. @token_required
  64. def upload(dataset_id, tenant_id):
  65. """
  66. Upload documents to a dataset.
  67. ---
  68. tags:
  69. - Documents
  70. security:
  71. - ApiKeyAuth: []
  72. parameters:
  73. - in: path
  74. name: dataset_id
  75. type: string
  76. required: true
  77. description: ID of the dataset.
  78. - in: header
  79. name: Authorization
  80. type: string
  81. required: true
  82. description: Bearer token for authentication.
  83. - in: formData
  84. name: file
  85. type: file
  86. required: true
  87. description: Document files to upload.
  88. responses:
  89. 200:
  90. description: Successfully uploaded documents.
  91. schema:
  92. type: object
  93. properties:
  94. data:
  95. type: array
  96. items:
  97. type: object
  98. properties:
  99. id:
  100. type: string
  101. description: Document ID.
  102. name:
  103. type: string
  104. description: Document name.
  105. chunk_count:
  106. type: integer
  107. description: Number of chunks.
  108. token_count:
  109. type: integer
  110. description: Number of tokens.
  111. dataset_id:
  112. type: string
  113. description: ID of the dataset.
  114. chunk_method:
  115. type: string
  116. description: Chunking method used.
  117. run:
  118. type: string
  119. description: Processing status.
  120. """
  121. if "file" not in request.files:
  122. return get_error_data_result(
  123. message="No file part!", code=settings.RetCode.ARGUMENT_ERROR
  124. )
  125. file_objs = request.files.getlist("file")
  126. for file_obj in file_objs:
  127. if file_obj.filename == "":
  128. return get_result(
  129. message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR
  130. )
  131. '''
  132. # total size
  133. total_size = 0
  134. for file_obj in file_objs:
  135. file_obj.seek(0, os.SEEK_END)
  136. total_size += file_obj.tell()
  137. file_obj.seek(0)
  138. MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
  139. if total_size > MAX_TOTAL_FILE_SIZE:
  140. return get_result(
  141. message=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
  142. code=settings.RetCode.ARGUMENT_ERROR,
  143. )
  144. '''
  145. e, kb = KnowledgebaseService.get_by_id(dataset_id)
  146. if not e:
  147. raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
  148. err, files = FileService.upload_document(kb, file_objs, tenant_id)
  149. if err:
  150. return get_result(message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
  151. # rename key's name
  152. renamed_doc_list = []
  153. for file in files:
  154. doc = file[0]
  155. key_mapping = {
  156. "chunk_num": "chunk_count",
  157. "kb_id": "dataset_id",
  158. "token_num": "token_count",
  159. "parser_id": "chunk_method",
  160. }
  161. renamed_doc = {}
  162. for key, value in doc.items():
  163. new_key = key_mapping.get(key, key)
  164. renamed_doc[new_key] = value
  165. renamed_doc["run"] = "UNSTART"
  166. renamed_doc_list.append(renamed_doc)
  167. return get_result(data=renamed_doc_list)
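# Example request for the upload endpoint above (a sketch with hypothetical values,
# assuming the blueprint is mounted under /api/v1 and the server listens on the
# default port 9380):
#   POST http://localhost:9380/api/v1/datasets/<dataset_id>/documents
#   Authorization: Bearer <API_KEY>
#   multipart/form-data with one or more "file" fields holding the documents to upload.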
  168. @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"]) # noqa: F821
  169. @token_required
  170. def update_doc(tenant_id, dataset_id, document_id):
  171. """
  172. Update a document within a dataset.
  173. ---
  174. tags:
  175. - Documents
  176. security:
  177. - ApiKeyAuth: []
  178. parameters:
  179. - in: path
  180. name: dataset_id
  181. type: string
  182. required: true
  183. description: ID of the dataset.
  184. - in: path
  185. name: document_id
  186. type: string
  187. required: true
  188. description: ID of the document to update.
  189. - in: header
  190. name: Authorization
  191. type: string
  192. required: true
  193. description: Bearer token for authentication.
  194. - in: body
  195. name: body
  196. description: Document update parameters.
  197. required: true
  198. schema:
  199. type: object
  200. properties:
  201. name:
  202. type: string
  203. description: New name of the document.
  204. parser_config:
  205. type: object
  206. description: Parser configuration.
  207. chunk_method:
  208. type: string
  209. description: Chunking method.
  210. responses:
  211. 200:
  212. description: Document updated successfully.
  213. schema:
  214. type: object
  215. """
  216. req = request.json
  217. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  218. return get_error_data_result(message="You don't own the dataset.")
  219. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  220. if not doc:
  221. return get_error_data_result(message="The dataset doesn't own the document.")
  222. doc = doc[0]
  223. if "chunk_count" in req:
  224. if req["chunk_count"] != doc.chunk_num:
  225. return get_error_data_result(message="Can't change `chunk_count`.")
  226. if "token_count" in req:
  227. if req["token_count"] != doc.token_num:
  228. return get_error_data_result(message="Can't change `token_count`.")
  229. if "progress" in req:
  230. if req["progress"] != doc.progress:
  231. return get_error_data_result(message="Can't change `progress`.")
  232. if "name" in req and req["name"] != doc.name:
  233. if (
  234. pathlib.Path(req["name"].lower()).suffix
  235. != pathlib.Path(doc.name.lower()).suffix
  236. ):
  237. return get_result(
  238. message="The extension of file can't be changed",
  239. code=settings.RetCode.ARGUMENT_ERROR,
  240. )
  241. for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
  242. if d.name == req["name"]:
  243. return get_error_data_result(
  244. message="Duplicated document name in the same dataset."
  245. )
  246. if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
  247. return get_error_data_result(message="Database error (Document rename)!")
  248. informs = File2DocumentService.get_by_document_id(document_id)
  249. if informs:
  250. e, file = FileService.get_by_id(informs[0].file_id)
  251. FileService.update_by_id(file.id, {"name": req["name"]})
  252. if "parser_config" in req:
  253. DocumentService.update_parser_config(doc.id, req["parser_config"])
  254. if "chunk_method" in req:
  255. valid_chunk_method = {
  256. "naive",
  257. "manual",
  258. "qa",
  259. "table",
  260. "paper",
  261. "book",
  262. "laws",
  263. "presentation",
  264. "picture",
  265. "one",
  266. "knowledge_graph",
  267. "email",
  268. "tag"
  269. }
  270. if req.get("chunk_method") not in valid_chunk_method:
  271. return get_error_data_result(
  272. f"`chunk_method` {req['chunk_method']} doesn't exist"
  273. )
  274. if doc.parser_id.lower() == req["chunk_method"].lower():
  275. return get_result()
  276. if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
  277. return get_error_data_result(message="Not supported yet!")
  278. e = DocumentService.update_by_id(
  279. doc.id,
  280. {
  281. "parser_id": req["chunk_method"],
  282. "progress": 0,
  283. "progress_msg": "",
  284. "run": TaskStatus.UNSTART.value,
  285. },
  286. )
  287. if not e:
  288. return get_error_data_result(message="Document not found!")
  289. req["parser_config"] = get_parser_config(
  290. req["chunk_method"], req.get("parser_config")
  291. )
  292. DocumentService.update_parser_config(doc.id, req["parser_config"])
  293. if doc.token_num > 0:
  294. e = DocumentService.increment_chunk_num(
  295. doc.id,
  296. doc.kb_id,
  297. doc.token_num * -1,
  298. doc.chunk_num * -1,
  299. doc.process_duation * -1,
  300. )
  301. if not e:
  302. return get_error_data_result(message="Document not found!")
  303. settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
  304. return get_result()
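# Example request for the update endpoint above (hypothetical values; same /api/v1
# prefix and port assumptions as the upload example):
#   PUT http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>
#   Authorization: Bearer <API_KEY>
#   {"name": "renamed.pdf", "chunk_method": "naive"}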
  305. @manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"]) # noqa: F821
  306. @token_required
  307. def download(tenant_id, dataset_id, document_id):
  308. """
  309. Download a document from a dataset.
  310. ---
  311. tags:
  312. - Documents
  313. security:
  314. - ApiKeyAuth: []
  315. produces:
  316. - application/octet-stream
  317. parameters:
  318. - in: path
  319. name: dataset_id
  320. type: string
  321. required: true
  322. description: ID of the dataset.
  323. - in: path
  324. name: document_id
  325. type: string
  326. required: true
  327. description: ID of the document to download.
  328. - in: header
  329. name: Authorization
  330. type: string
  331. required: true
  332. description: Bearer token for authentication.
  333. responses:
  334. 200:
  335. description: Document file stream.
  336. schema:
  337. type: file
  338. 400:
  339. description: Error message.
  340. schema:
  341. type: object
  342. """
  343. if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
  344. return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")
  345. doc = DocumentService.query(kb_id=dataset_id, id=document_id)
  346. if not doc:
  347. return get_error_data_result(
  348. message=f"The dataset not own the document {document_id}."
  349. )
  350. # The process of downloading
  351. doc_id, doc_location = File2DocumentService.get_storage_address(
  352. doc_id=document_id
  353. ) # minio address
  354. file_stream = STORAGE_IMPL.get(doc_id, doc_location)
  355. if not file_stream:
  356. return construct_json_result(
  357. message="This file is empty.", code=settings.RetCode.DATA_ERROR
  358. )
  359. file = BytesIO(file_stream)
  360. # Use send_file with a proper filename and MIME type
  361. return send_file(
  362. file,
  363. as_attachment=True,
  364. download_name=doc[0].name,
  365. mimetype="application/octet-stream", # Set a default MIME type
  366. )
  367. @manager.route("/datasets/<dataset_id>/documents", methods=["GET"]) # noqa: F821
  368. @token_required
  369. def list_docs(dataset_id, tenant_id):
  370. """
  371. List documents in a dataset.
  372. ---
  373. tags:
  374. - Documents
  375. security:
  376. - ApiKeyAuth: []
  377. parameters:
  378. - in: path
  379. name: dataset_id
  380. type: string
  381. required: true
  382. description: ID of the dataset.
  383. - in: query
  384. name: id
  385. type: string
  386. required: false
  387. description: Filter by document ID.
  388. - in: query
  389. name: page
  390. type: integer
  391. required: false
  392. default: 1
  393. description: Page number.
  394. - in: query
  395. name: page_size
  396. type: integer
  397. required: false
  398. default: 30
  399. description: Number of items per page.
  400. - in: query
  401. name: orderby
  402. type: string
  403. required: false
  404. default: "create_time"
  405. description: Field to order by.
  406. - in: query
  407. name: desc
  408. type: boolean
  409. required: false
  410. default: true
  411. description: Order in descending.
  412. - in: header
  413. name: Authorization
  414. type: string
  415. required: true
  416. description: Bearer token for authentication.
  417. responses:
  418. 200:
  419. description: List of documents.
  420. schema:
  421. type: object
  422. properties:
  423. total:
  424. type: integer
  425. description: Total number of documents.
  426. docs:
  427. type: array
  428. items:
  429. type: object
  430. properties:
  431. id:
  432. type: string
  433. description: Document ID.
  434. name:
  435. type: string
  436. description: Document name.
  437. chunk_count:
  438. type: integer
  439. description: Number of chunks.
  440. token_count:
  441. type: integer
  442. description: Number of tokens.
  443. dataset_id:
  444. type: string
  445. description: ID of the dataset.
  446. chunk_method:
  447. type: string
  448. description: Chunking method used.
  449. run:
  450. type: string
  451. description: Processing status.
  452. """
  453. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  454. return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
  455. id = request.args.get("id")
  456. name = request.args.get("name")
457. if id and not DocumentService.query(id=id, kb_id=dataset_id):
458. return get_error_data_result(message=f"You don't own the document {id}.")
459. if name and not DocumentService.query(name=name, kb_id=dataset_id):
460. return get_error_data_result(message=f"You don't own the document {name}.")
  461. page = int(request.args.get("page", 1))
  462. keywords = request.args.get("keywords", "")
  463. page_size = int(request.args.get("page_size", 30))
  464. orderby = request.args.get("orderby", "create_time")
  465. if request.args.get("desc") == "False":
  466. desc = False
  467. else:
  468. desc = True
  469. docs, tol = DocumentService.get_list(
  470. dataset_id, page, page_size, orderby, desc, keywords, id, name
  471. )
  472. # rename key's name
  473. renamed_doc_list = []
  474. for doc in docs:
  475. key_mapping = {
  476. "chunk_num": "chunk_count",
  477. "kb_id": "dataset_id",
  478. "token_num": "token_count",
  479. "parser_id": "chunk_method",
  480. }
  481. run_mapping = {
  482. "0": "UNSTART",
  483. "1": "RUNNING",
  484. "2": "CANCEL",
  485. "3": "DONE",
  486. "4": "FAIL",
  487. }
  488. renamed_doc = {}
  489. for key, value in doc.items():
  490. if key == "run":
  491. renamed_doc["run"] = run_mapping.get(str(value))
  492. new_key = key_mapping.get(key, key)
  493. renamed_doc[new_key] = value
  494. if key == "run":
  495. renamed_doc["run"] = run_mapping.get(value)
  496. renamed_doc_list.append(renamed_doc)
  497. return get_result(data={"total": tol, "docs": renamed_doc_list})
  498. @manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"]) # noqa: F821
  499. @token_required
  500. def delete(tenant_id, dataset_id):
  501. """
  502. Delete documents from a dataset.
  503. ---
  504. tags:
  505. - Documents
  506. security:
  507. - ApiKeyAuth: []
  508. parameters:
  509. - in: path
  510. name: dataset_id
  511. type: string
  512. required: true
  513. description: ID of the dataset.
  514. - in: body
  515. name: body
  516. description: Document deletion parameters.
  517. required: true
  518. schema:
  519. type: object
  520. properties:
  521. ids:
  522. type: array
  523. items:
  524. type: string
  525. description: List of document IDs to delete.
  526. - in: header
  527. name: Authorization
  528. type: string
  529. required: true
  530. description: Bearer token for authentication.
  531. responses:
  532. 200:
  533. description: Documents deleted successfully.
  534. schema:
  535. type: object
  536. """
  537. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  538. return get_error_data_result(message=f"You don't own the dataset {dataset_id}. ")
  539. req = request.json
  540. if not req:
  541. doc_ids = None
  542. else:
  543. doc_ids = req.get("ids")
  544. if not doc_ids:
  545. doc_list = []
  546. docs = DocumentService.query(kb_id=dataset_id)
  547. for doc in docs:
  548. doc_list.append(doc.id)
  549. else:
  550. doc_list = doc_ids
  551. root_folder = FileService.get_root_folder(tenant_id)
  552. pf_id = root_folder["id"]
  553. FileService.init_knowledgebase_docs(pf_id, tenant_id)
  554. errors = ""
  555. for doc_id in doc_list:
  556. try:
  557. e, doc = DocumentService.get_by_id(doc_id)
  558. if not e:
  559. return get_error_data_result(message="Document not found!")
  560. tenant_id = DocumentService.get_tenant_id(doc_id)
  561. if not tenant_id:
  562. return get_error_data_result(message="Tenant not found!")
  563. b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
  564. if not DocumentService.remove_document(doc, tenant_id):
  565. return get_error_data_result(
  566. message="Database error (Document removal)!"
  567. )
  568. f2d = File2DocumentService.get_by_document_id(doc_id)
  569. FileService.filter_delete(
  570. [
  571. File.source_type == FileSource.KNOWLEDGEBASE,
  572. File.id == f2d[0].file_id,
  573. ]
  574. )
  575. File2DocumentService.delete_by_document_id(doc_id)
  576. STORAGE_IMPL.rm(b, n)
  577. except Exception as e:
  578. errors += str(e)
  579. if errors:
  580. return get_result(message=errors, code=settings.RetCode.SERVER_ERROR)
  581. return get_result()
  582. @manager.route("/datasets/<dataset_id>/chunks", methods=["POST"]) # noqa: F821
  583. @token_required
  584. def parse(tenant_id, dataset_id):
  585. """
  586. Start parsing documents into chunks.
  587. ---
  588. tags:
  589. - Chunks
  590. security:
  591. - ApiKeyAuth: []
  592. parameters:
  593. - in: path
  594. name: dataset_id
  595. type: string
  596. required: true
  597. description: ID of the dataset.
  598. - in: body
  599. name: body
  600. description: Parsing parameters.
  601. required: true
  602. schema:
  603. type: object
  604. properties:
  605. document_ids:
  606. type: array
  607. items:
  608. type: string
  609. description: List of document IDs to parse.
  610. - in: header
  611. name: Authorization
  612. type: string
  613. required: true
  614. description: Bearer token for authentication.
  615. responses:
  616. 200:
  617. description: Parsing started successfully.
  618. schema:
  619. type: object
  620. """
  621. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  622. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  623. req = request.json
  624. if not req.get("document_ids"):
  625. return get_error_data_result("`document_ids` is required")
  626. for id in req["document_ids"]:
  627. doc = DocumentService.query(id=id, kb_id=dataset_id)
  628. if not doc:
  629. return get_error_data_result(message=f"You don't own the document {id}.")
  630. if doc[0].progress != 0.0:
  631. return get_error_data_result(
  632. "Can't stop parsing document with progress at 0 or 100"
  633. )
  634. info = {"run": "1", "progress": 0}
  635. info["progress_msg"] = ""
  636. info["chunk_num"] = 0
  637. info["token_num"] = 0
  638. DocumentService.update_by_id(id, info)
  639. settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
  640. TaskService.filter_delete([Task.doc_id == id])
  641. e, doc = DocumentService.get_by_id(id)
  642. doc = doc.to_dict()
  643. doc["tenant_id"] = tenant_id
  644. bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
  645. queue_tasks(doc, bucket, name)
  646. return get_result()
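# Example request for the parse endpoint above (hypothetical values; same /api/v1
# prefix and port assumptions as the upload example):
#   POST http://localhost:9380/api/v1/datasets/<dataset_id>/chunks
#   Authorization: Bearer <API_KEY>
#   {"document_ids": ["<document_id>"]}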
  647. @manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"]) # noqa: F821
  648. @token_required
  649. def stop_parsing(tenant_id, dataset_id):
  650. """
  651. Stop parsing documents into chunks.
  652. ---
  653. tags:
  654. - Chunks
  655. security:
  656. - ApiKeyAuth: []
  657. parameters:
  658. - in: path
  659. name: dataset_id
  660. type: string
  661. required: true
  662. description: ID of the dataset.
  663. - in: body
  664. name: body
  665. description: Stop parsing parameters.
  666. required: true
  667. schema:
  668. type: object
  669. properties:
  670. document_ids:
  671. type: array
  672. items:
  673. type: string
  674. description: List of document IDs to stop parsing.
  675. - in: header
  676. name: Authorization
  677. type: string
  678. required: true
  679. description: Bearer token for authentication.
  680. responses:
  681. 200:
  682. description: Parsing stopped successfully.
  683. schema:
  684. type: object
  685. """
  686. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  687. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  688. req = request.json
  689. if not req.get("document_ids"):
  690. return get_error_data_result("`document_ids` is required")
  691. for id in req["document_ids"]:
  692. doc = DocumentService.query(id=id, kb_id=dataset_id)
  693. if not doc:
  694. return get_error_data_result(message=f"You don't own the document {id}.")
  695. if int(doc[0].progress) == 1 or doc[0].progress == 0:
  696. return get_error_data_result(
  697. "Can't stop parsing document with progress at 0 or 1"
  698. )
  699. info = {"run": "2", "progress": 0, "chunk_num": 0}
  700. DocumentService.update_by_id(id, info)
701. settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
  702. return get_result()
  703. @manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"]) # noqa: F821
  704. @token_required
  705. def list_chunks(tenant_id, dataset_id, document_id):
  706. """
  707. List chunks of a document.
  708. ---
  709. tags:
  710. - Chunks
  711. security:
  712. - ApiKeyAuth: []
  713. parameters:
  714. - in: path
  715. name: dataset_id
  716. type: string
  717. required: true
  718. description: ID of the dataset.
  719. - in: path
  720. name: document_id
  721. type: string
  722. required: true
  723. description: ID of the document.
  724. - in: query
  725. name: page
  726. type: integer
  727. required: false
  728. default: 1
  729. description: Page number.
  730. - in: query
  731. name: page_size
  732. type: integer
  733. required: false
  734. default: 30
  735. description: Number of items per page.
  736. - in: header
  737. name: Authorization
  738. type: string
  739. required: true
  740. description: Bearer token for authentication.
  741. responses:
  742. 200:
  743. description: List of chunks.
  744. schema:
  745. type: object
  746. properties:
  747. total:
  748. type: integer
  749. description: Total number of chunks.
  750. chunks:
  751. type: array
  752. items:
  753. type: object
  754. properties:
  755. id:
  756. type: string
  757. description: Chunk ID.
  758. content:
  759. type: string
  760. description: Chunk content.
  761. document_id:
  762. type: string
  763. description: ID of the document.
  764. important_keywords:
  765. type: array
  766. items:
  767. type: string
  768. description: Important keywords.
  769. image_id:
  770. type: string
  771. description: Image ID associated with the chunk.
  772. doc:
  773. type: object
  774. description: Document details.
  775. """
  776. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  777. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  778. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  779. if not doc:
  780. return get_error_data_result(
  781. message=f"You don't own the document {document_id}."
  782. )
  783. doc = doc[0]
  784. req = request.args
  785. doc_id = document_id
  786. page = int(req.get("page", 1))
  787. size = int(req.get("page_size", 30))
  788. question = req.get("keywords", "")
  789. query = {
  790. "doc_ids": [doc_id],
  791. "page": page,
  792. "size": size,
  793. "question": question,
  794. "sort": True,
  795. }
  796. key_mapping = {
  797. "chunk_num": "chunk_count",
  798. "kb_id": "dataset_id",
  799. "token_num": "token_count",
  800. "parser_id": "chunk_method",
  801. }
  802. run_mapping = {
  803. "0": "UNSTART",
  804. "1": "RUNNING",
  805. "2": "CANCEL",
  806. "3": "DONE",
  807. "4": "FAIL",
  808. }
  809. doc = doc.to_dict()
  810. renamed_doc = {}
  811. for key, value in doc.items():
  812. new_key = key_mapping.get(key, key)
  813. renamed_doc[new_key] = value
  814. if key == "run":
  815. renamed_doc["run"] = run_mapping.get(str(value))
  816. res = {"total": 0, "chunks": [], "doc": renamed_doc}
  817. if req.get("id"):
  818. chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
  819. k = []
  820. for n in chunk.keys():
  821. if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
  822. k.append(n)
  823. for n in k:
  824. del chunk[n]
  825. if not chunk:
  826. return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
  827. res['total'] = 1
  828. final_chunk = {
  829. "id":chunk.get("id",chunk.get("chunk_id")),
  830. "content":chunk["content_with_weight"],
  831. "document_id":chunk.get("doc_id",chunk.get("document_id")),
  832. "docnm_kwd":chunk["docnm_kwd"],
  833. "important_keywords":chunk.get("important_kwd",[]),
  834. "questions":chunk.get("question_kwd",[]),
  835. "dataset_id":chunk.get("kb_id",chunk.get("dataset_id")),
  836. "image_id":chunk["img_id"],
  837. "available":bool(chunk.get("available_int",1)),
  838. "positions":chunk.get("position_int",[]),
  839. }
  840. res["chunks"].append(final_chunk)
  841. _ = Chunk(**final_chunk)
  842. elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
  843. sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
  844. highlight=True)
  845. res["total"] = sres.total
  846. for id in sres.ids:
  847. d = {
  848. "id": id,
  849. "content": (
  850. rmSpace(sres.highlight[id])
  851. if question and id in sres.highlight
  852. else sres.field[id].get("content_with_weight", "")
  853. ),
  854. "document_id": sres.field[id]["doc_id"],
  855. "docnm_kwd": sres.field[id]["docnm_kwd"],
  856. "important_keywords": sres.field[id].get("important_kwd", []),
  857. "questions": sres.field[id].get("question_kwd", []),
  858. "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
  859. "image_id": sres.field[id].get("img_id", ""),
  860. "available": bool(sres.field[id].get("available_int", 1)),
  861. "positions": sres.field[id].get("position_int",[]),
  862. }
  863. res["chunks"].append(d)
  864. _ = Chunk(**d) # validate the chunk
  865. return get_result(data=res)
  866. @manager.route( # noqa: F821
  867. "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
  868. )
  869. @token_required
  870. def add_chunk(tenant_id, dataset_id, document_id):
  871. """
  872. Add a chunk to a document.
  873. ---
  874. tags:
  875. - Chunks
  876. security:
  877. - ApiKeyAuth: []
  878. parameters:
  879. - in: path
  880. name: dataset_id
  881. type: string
  882. required: true
  883. description: ID of the dataset.
  884. - in: path
  885. name: document_id
  886. type: string
  887. required: true
  888. description: ID of the document.
  889. - in: body
  890. name: body
  891. description: Chunk data.
  892. required: true
  893. schema:
  894. type: object
  895. properties:
  896. content:
  897. type: string
  898. required: true
  899. description: Content of the chunk.
  900. important_keywords:
  901. type: array
  902. items:
  903. type: string
  904. description: Important keywords.
  905. - in: header
  906. name: Authorization
  907. type: string
  908. required: true
  909. description: Bearer token for authentication.
  910. responses:
  911. 200:
  912. description: Chunk added successfully.
  913. schema:
  914. type: object
  915. properties:
  916. chunk:
  917. type: object
  918. properties:
  919. id:
  920. type: string
  921. description: Chunk ID.
  922. content:
  923. type: string
  924. description: Chunk content.
  925. document_id:
  926. type: string
  927. description: ID of the document.
  928. important_keywords:
  929. type: array
  930. items:
  931. type: string
  932. description: Important keywords.
  933. """
  934. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  935. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  936. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  937. if not doc:
  938. return get_error_data_result(
  939. message=f"You don't own the document {document_id}."
  940. )
  941. doc = doc[0]
  942. req = request.json
  943. if not req.get("content"):
  944. return get_error_data_result(message="`content` is required")
  945. if "important_keywords" in req:
  946. if not isinstance(req["important_keywords"], list):
  947. return get_error_data_result(
  948. "`important_keywords` is required to be a list"
  949. )
  950. if "questions" in req:
  951. if not isinstance(req["questions"], list):
  952. return get_error_data_result(
  953. "`questions` is required to be a list"
  954. )
  955. chunk_id = xxhash.xxh64((req["content"] + document_id).encode("utf-8")).hexdigest()
  956. d = {
  957. "id": chunk_id,
  958. "content_ltks": rag_tokenizer.tokenize(req["content"]),
  959. "content_with_weight": req["content"],
  960. }
  961. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  962. d["important_kwd"] = req.get("important_keywords", [])
  963. d["important_tks"] = rag_tokenizer.tokenize(
  964. " ".join(req.get("important_keywords", []))
  965. )
  966. d["question_kwd"] = req.get("questions", [])
  967. d["question_tks"] = rag_tokenizer.tokenize(
  968. "\n".join(req.get("questions", []))
  969. )
  970. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  971. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  972. d["kb_id"] = dataset_id
  973. d["docnm_kwd"] = doc.name
  974. d["doc_id"] = document_id
  975. embd_id = DocumentService.get_embd_id(document_id)
  976. embd_mdl = TenantLLMService.model_instance(
  977. tenant_id, LLMType.EMBEDDING.value, embd_id
  978. )
  979. v, c = embd_mdl.encode([doc.name, req["content"] if not d["question_kwd"] else "\n".join(d["question_kwd"])])
  980. v = 0.1 * v[0] + 0.9 * v[1]
  981. d["q_%d_vec" % len(v)] = v.tolist()
  982. settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
  983. DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
  984. # rename keys
  985. key_mapping = {
  986. "id": "id",
  987. "content_with_weight": "content",
  988. "doc_id": "document_id",
  989. "important_kwd": "important_keywords",
  990. "question_kwd": "questions",
  991. "kb_id": "dataset_id",
  992. "create_timestamp_flt": "create_timestamp",
  993. "create_time": "create_time",
  994. "document_keyword": "document",
  995. }
  996. renamed_chunk = {}
  997. for key, value in d.items():
  998. if key in key_mapping:
  999. new_key = key_mapping.get(key, key)
  1000. renamed_chunk[new_key] = value
  1001. _ = Chunk(**renamed_chunk) # validate the chunk
  1002. return get_result(data={"chunk": renamed_chunk})
  1003. # return get_result(data={"chunk_id": chunk_id})
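# Example request for the add_chunk endpoint above (hypothetical values; same /api/v1
# prefix and port assumptions as the upload example):
#   POST http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks
#   Authorization: Bearer <API_KEY>
#   {"content": "RAGFlow is an open-source RAG engine.", "important_keywords": ["RAGFlow"]}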
  1004. @manager.route( # noqa: F821
  1005. "datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
  1006. )
  1007. @token_required
  1008. def rm_chunk(tenant_id, dataset_id, document_id):
  1009. """
  1010. Remove chunks from a document.
  1011. ---
  1012. tags:
  1013. - Chunks
  1014. security:
  1015. - ApiKeyAuth: []
  1016. parameters:
  1017. - in: path
  1018. name: dataset_id
  1019. type: string
  1020. required: true
  1021. description: ID of the dataset.
  1022. - in: path
  1023. name: document_id
  1024. type: string
  1025. required: true
  1026. description: ID of the document.
  1027. - in: body
  1028. name: body
  1029. description: Chunk removal parameters.
  1030. required: true
  1031. schema:
  1032. type: object
  1033. properties:
  1034. chunk_ids:
  1035. type: array
  1036. items:
  1037. type: string
  1038. description: List of chunk IDs to remove.
  1039. - in: header
  1040. name: Authorization
  1041. type: string
  1042. required: true
  1043. description: Bearer token for authentication.
  1044. responses:
  1045. 200:
  1046. description: Chunks removed successfully.
  1047. schema:
  1048. type: object
  1049. """
  1050. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  1051. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  1052. req = request.json
  1053. condition = {"doc_id": document_id}
  1054. if "chunk_ids" in req:
  1055. condition["id"] = req["chunk_ids"]
  1056. chunk_number = settings.docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id)
  1057. if chunk_number != 0:
  1058. DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
  1059. if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
  1060. return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req['chunk_ids'])}")
  1061. return get_result(message=f"deleted {chunk_number} chunks")
  1062. @manager.route( # noqa: F821
  1063. "/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
  1064. )
  1065. @token_required
  1066. def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
  1067. """
  1068. Update a chunk within a document.
  1069. ---
  1070. tags:
  1071. - Chunks
  1072. security:
  1073. - ApiKeyAuth: []
  1074. parameters:
  1075. - in: path
  1076. name: dataset_id
  1077. type: string
  1078. required: true
  1079. description: ID of the dataset.
  1080. - in: path
  1081. name: document_id
  1082. type: string
  1083. required: true
  1084. description: ID of the document.
  1085. - in: path
  1086. name: chunk_id
  1087. type: string
  1088. required: true
  1089. description: ID of the chunk to update.
  1090. - in: body
  1091. name: body
  1092. description: Chunk update parameters.
  1093. required: true
  1094. schema:
  1095. type: object
  1096. properties:
  1097. content:
  1098. type: string
  1099. description: Updated content of the chunk.
  1100. important_keywords:
  1101. type: array
  1102. items:
  1103. type: string
  1104. description: Updated important keywords.
  1105. available:
  1106. type: boolean
  1107. description: Availability status of the chunk.
  1108. - in: header
  1109. name: Authorization
  1110. type: string
  1111. required: true
  1112. description: Bearer token for authentication.
  1113. responses:
  1114. 200:
  1115. description: Chunk updated successfully.
  1116. schema:
  1117. type: object
  1118. """
  1119. chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
  1120. if chunk is None:
  1121. return get_error_data_result(f"Can't find this chunk {chunk_id}")
  1122. if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
  1123. return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
  1124. doc = DocumentService.query(id=document_id, kb_id=dataset_id)
  1125. if not doc:
  1126. return get_error_data_result(
  1127. message=f"You don't own the document {document_id}."
  1128. )
  1129. doc = doc[0]
  1130. req = request.json
  1131. if "content" in req:
  1132. content = req["content"]
  1133. else:
  1134. content = chunk.get("content_with_weight", "")
  1135. d = {"id": chunk_id, "content_with_weight": content}
  1136. d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
  1137. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  1138. if "important_keywords" in req:
  1139. if not isinstance(req["important_keywords"], list):
  1140. return get_error_data_result("`important_keywords` should be a list")
  1141. d["important_kwd"] = req.get("important_keywords", [])
  1142. d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
  1143. if "questions" in req:
  1144. if not isinstance(req["questions"], list):
  1145. return get_error_data_result("`questions` should be a list")
  1146. d["question_kwd"] = req.get("questions")
  1147. d["question_tks"] = rag_tokenizer.tokenize("\n".join(req["questions"]))
  1148. if "available" in req:
  1149. d["available_int"] = int(req["available"])
  1150. embd_id = DocumentService.get_embd_id(document_id)
  1151. embd_mdl = TenantLLMService.model_instance(
  1152. tenant_id, LLMType.EMBEDDING.value, embd_id
  1153. )
  1154. if doc.parser_id == ParserType.QA:
  1155. arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
  1156. if len(arr) != 2:
  1157. return get_error_data_result(
  1158. message="Q&A must be separated by TAB/ENTER key."
  1159. )
  1160. q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
  1161. d = beAdoc(
  1162. d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
  1163. )
  1164. v, c = embd_mdl.encode([doc.name, d["content_with_weight"] if not d.get("question_kwd") else "\n".join(d["question_kwd"])])
  1165. v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
  1166. d["q_%d_vec" % len(v)] = v.tolist()
  1167. settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
  1168. return get_result()
  1169. @manager.route("/retrieval", methods=["POST"]) # noqa: F821
  1170. @token_required
  1171. def retrieval_test(tenant_id):
  1172. """
  1173. Retrieve chunks based on a query.
  1174. ---
  1175. tags:
  1176. - Retrieval
  1177. security:
  1178. - ApiKeyAuth: []
  1179. parameters:
  1180. - in: body
  1181. name: body
  1182. description: Retrieval parameters.
  1183. required: true
  1184. schema:
  1185. type: object
  1186. properties:
  1187. dataset_ids:
  1188. type: array
  1189. items:
  1190. type: string
  1191. required: true
  1192. description: List of dataset IDs to search in.
  1193. question:
  1194. type: string
  1195. required: true
  1196. description: Query string.
  1197. document_ids:
  1198. type: array
  1199. items:
  1200. type: string
  1201. description: List of document IDs to filter.
  1202. similarity_threshold:
  1203. type: number
  1204. format: float
  1205. description: Similarity threshold.
  1206. vector_similarity_weight:
  1207. type: number
  1208. format: float
  1209. description: Vector similarity weight.
  1210. top_k:
  1211. type: integer
  1212. description: Maximum number of chunks to return.
  1213. highlight:
  1214. type: boolean
  1215. description: Whether to highlight matched content.
  1216. - in: header
  1217. name: Authorization
  1218. type: string
  1219. required: true
  1220. description: Bearer token for authentication.
  1221. responses:
  1222. 200:
  1223. description: Retrieval results.
  1224. schema:
  1225. type: object
  1226. properties:
  1227. chunks:
  1228. type: array
  1229. items:
  1230. type: object
  1231. properties:
  1232. id:
  1233. type: string
  1234. description: Chunk ID.
  1235. content:
  1236. type: string
  1237. description: Chunk content.
  1238. document_id:
  1239. type: string
  1240. description: ID of the document.
  1241. dataset_id:
  1242. type: string
  1243. description: ID of the dataset.
  1244. similarity:
  1245. type: number
  1246. format: float
  1247. description: Similarity score.
  1248. """
  1249. req = request.json
  1250. if not req.get("dataset_ids"):
  1251. return get_error_data_result("`dataset_ids` is required.")
  1252. kb_ids = req["dataset_ids"]
  1253. if not isinstance(kb_ids, list):
  1254. return get_error_data_result("`dataset_ids` should be a list")
  1255. for id in kb_ids:
  1256. if not KnowledgebaseService.accessible(kb_id=id, user_id=tenant_id):
  1257. return get_error_data_result(f"You don't own the dataset {id}.")
  1258. kbs = KnowledgebaseService.get_by_ids(kb_ids)
  1259. embd_nms = list(set([kb.embd_id for kb in kbs]))
  1260. if len(embd_nms) != 1:
  1261. return get_result(
1262. message="Datasets use different embedding models.",
  1263. code=settings.RetCode.DATA_ERROR,
  1264. )
  1265. if "question" not in req:
  1266. return get_error_data_result("`question` is required.")
  1267. page = int(req.get("page", 1))
  1268. size = int(req.get("page_size", 30))
  1269. question = req["question"]
  1270. doc_ids = req.get("document_ids", [])
  1271. use_kg = req.get("use_kg", False)
  1272. if not isinstance(doc_ids, list):
  1273. return get_error_data_result("`documents` should be a list")
  1274. doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
  1275. for doc_id in doc_ids:
  1276. if doc_id not in doc_ids_list:
  1277. return get_error_data_result(
  1278. f"The datasets don't own the document {doc_id}"
  1279. )
  1280. similarity_threshold = float(req.get("similarity_threshold", 0.2))
  1281. vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
  1282. top = int(req.get("top_k", 1024))
  1283. if req.get("highlight") == "False" or req.get("highlight") == "false":
  1284. highlight = False
  1285. else:
  1286. highlight = True
  1287. try:
  1288. e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
  1289. if not e:
  1290. return get_error_data_result(message="Dataset not found!")
  1291. embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING, llm_name=kb.embd_id)
  1292. rerank_mdl = None
  1293. if req.get("rerank_id"):
  1294. rerank_mdl = LLMBundle(kb.tenant_id, LLMType.RERANK, llm_name=req["rerank_id"])
  1295. if req.get("keyword", False):
  1296. chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT)
  1297. question += keyword_extraction(chat_mdl, question)
  1298. ranks = settings.retrievaler.retrieval(
  1299. question,
  1300. embd_mdl,
  1301. kb.tenant_id,
  1302. kb_ids,
  1303. page,
  1304. size,
  1305. similarity_threshold,
  1306. vector_similarity_weight,
  1307. top,
  1308. doc_ids,
  1309. rerank_mdl=rerank_mdl,
  1310. highlight=highlight,
  1311. rank_feature=label_question(question, kbs)
  1312. )
  1313. if use_kg:
  1314. ck = settings.kg_retrievaler.retrieval(question,
  1315. [k.tenant_id for k in kbs],
  1316. kb_ids,
  1317. embd_mdl,
  1318. LLMBundle(kb.tenant_id, LLMType.CHAT))
  1319. if ck["content_with_weight"]:
  1320. ranks["chunks"].insert(0, ck)
  1321. for c in ranks["chunks"]:
  1322. c.pop("vector", None)
1323. # rename keys
  1324. renamed_chunks = []
  1325. for chunk in ranks["chunks"]:
  1326. key_mapping = {
  1327. "chunk_id": "id",
  1328. "content_with_weight": "content",
  1329. "doc_id": "document_id",
  1330. "important_kwd": "important_keywords",
  1331. "question_kwd": "questions",
  1332. "docnm_kwd": "document_keyword",
  1333. "kb_id":"dataset_id"
  1334. }
  1335. rename_chunk = {}
  1336. for key, value in chunk.items():
  1337. new_key = key_mapping.get(key, key)
  1338. rename_chunk[new_key] = value
  1339. renamed_chunks.append(rename_chunk)
  1340. ranks["chunks"] = renamed_chunks
  1341. return get_result(data=ranks)
  1342. except Exception as e:
  1343. if str(e).find("not_found") > 0:
  1344. return get_result(
  1345. message="No chunk found! Check the chunk status please!",
  1346. code=settings.RetCode.DATA_ERROR,
  1347. )
  1348. return server_error_response(e)
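# Example request for the retrieval endpoint above (hypothetical values; same /api/v1
# prefix and port assumptions as the upload example):
#   POST http://localhost:9380/api/v1/retrieval
#   Authorization: Bearer <API_KEY>
#   {"question": "How is a document parsed?", "dataset_ids": ["<dataset_id>"],
#    "similarity_threshold": 0.2, "vector_similarity_weight": 0.3, "top_k": 1024, "highlight": true}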