#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime
import hashlib
import os
import pathlib
import re
from io import BytesIO

from flask import request, send_file

from api import settings
from api.db import FileSource, FileType, LLMType, ParserType, TaskStatus
from api.db.db_models import File, Task
from api.db.services.dialog_service import keyword_extraction
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import TenantLLMService
from api.db.services.task_service import TaskService, queue_tasks
from api.utils.api_utils import (
    construct_json_result,
    get_error_data_result,
    get_parser_config,
    get_result,
    server_error_response,
    token_required,
)
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import rag_tokenizer, search
from rag.utils import rmSpace
from rag.utils.storage_factory import STORAGE_IMPL

MAXIMUM_OF_UPLOADING_FILES = 256
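
# The routes below make up the REST API for documents and chunks inside a
# dataset (knowledge base): uploading, listing, updating, downloading and
# deleting documents, parsing them into chunks, managing individual chunks,
# and a retrieval endpoint for querying chunks across datasets.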

@manager.route("/datasets/<dataset_id>/documents", methods=["POST"])
@token_required
def upload(dataset_id, tenant_id):
    """
    Upload documents to a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: formData
        name: file
        type: file
        required: true
        description: Document files to upload.
    responses:
      200:
        description: Successfully uploaded documents.
        schema:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if "file" not in request.files:
        return get_error_data_result(
            message="No file part!", code=settings.RetCode.ARGUMENT_ERROR
        )
    file_objs = request.files.getlist("file")
    for file_obj in file_objs:
        if file_obj.filename == "":
            return get_result(
                message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR
            )
    # Enforce a limit on the combined size of all uploaded files.
    total_size = 0
    for file_obj in file_objs:
        file_obj.seek(0, os.SEEK_END)
        total_size += file_obj.tell()
        file_obj.seek(0)
    MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
    if total_size > MAX_TOTAL_FILE_SIZE:
        return get_result(
            message=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
            code=settings.RetCode.ARGUMENT_ERROR,
        )
    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
    err, files = FileService.upload_document(kb, file_objs, tenant_id)
    if err:
        return get_result(message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
    # Rename internal field names to the public API names.
    renamed_doc_list = []
    for file in files:
        doc = file[0]
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
        renamed_doc["run"] = "UNSTART"
        renamed_doc_list.append(renamed_doc)
    return get_result(data=renamed_doc_list)
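
# Only `name`, `chunk_method` and `parser_config` can be changed through this
# endpoint; the derived counters (`chunk_count`, `token_count`, `progress`)
# are read-only and any attempt to modify them is rejected below.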
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"])
@token_required
def update_doc(tenant_id, dataset_id, document_id):
    """
    Update a document within a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to update.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Document update parameters.
        required: true
        schema:
          type: object
          properties:
            name:
              type: string
              description: New name of the document.
            parser_config:
              type: object
              description: Parser configuration.
            chunk_method:
              type: string
              description: Chunking method.
    responses:
      200:
        description: Document updated successfully.
        schema:
          type: object
    """
    req = request.json
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message="You don't own the dataset.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(message="The dataset doesn't own the document.")
    doc = doc[0]
    if "chunk_count" in req:
        if req["chunk_count"] != doc.chunk_num:
            return get_error_data_result(message="Can't change `chunk_count`.")
    if "token_count" in req:
        if req["token_count"] != doc.token_num:
            return get_error_data_result(message="Can't change `token_count`.")
    if "progress" in req:
        if req["progress"] != doc.progress:
            return get_error_data_result(message="Can't change `progress`.")
    if "name" in req and req["name"] != doc.name:
        if (
            pathlib.Path(req["name"].lower()).suffix
            != pathlib.Path(doc.name.lower()).suffix
        ):
            return get_result(
                message="The extension of file can't be changed",
                code=settings.RetCode.ARGUMENT_ERROR,
            )
        for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
            if d.name == req["name"]:
                return get_error_data_result(
                    message="Duplicated document name in the same dataset."
                )
        if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
            return get_error_data_result(message="Database error (Document rename)!")
        informs = File2DocumentService.get_by_document_id(document_id)
        if informs:
            e, file = FileService.get_by_id(informs[0].file_id)
            FileService.update_by_id(file.id, {"name": req["name"]})
    if "parser_config" in req:
        DocumentService.update_parser_config(doc.id, req["parser_config"])
    if "chunk_method" in req:
        valid_chunk_method = {
            "naive",
            "manual",
            "qa",
            "table",
            "paper",
            "book",
            "laws",
            "presentation",
            "picture",
            "one",
            "knowledge_graph",
            "email",
        }
        if req.get("chunk_method") not in valid_chunk_method:
            return get_error_data_result(
                f"`chunk_method` {req['chunk_method']} doesn't exist"
            )
        if doc.parser_id.lower() == req["chunk_method"].lower():
            return get_result()
        if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
            return get_error_data_result(message="Not supported yet!")
        e = DocumentService.update_by_id(
            doc.id,
            {
                "parser_id": req["chunk_method"],
                "progress": 0,
                "progress_msg": "",
                "run": TaskStatus.UNSTART.value,
            },
        )
        if not e:
            return get_error_data_result(message="Document not found!")
        req["parser_config"] = get_parser_config(
            req["chunk_method"], req.get("parser_config")
        )
        DocumentService.update_parser_config(doc.id, req["parser_config"])
        if doc.token_num > 0:
            # Roll back the old counters ("process_duation" is the field name
            # used on the Document model) and drop the previously built chunks.
            e = DocumentService.increment_chunk_num(
                doc.id,
                doc.kb_id,
                doc.token_num * -1,
                doc.chunk_num * -1,
                doc.process_duation * -1,
            )
            if not e:
                return get_error_data_result(message="Document not found!")
            settings.docStoreConn.delete(
                {"doc_id": doc.id}, search.index_name(tenant_id), dataset_id
            )
    return get_result()

@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])
@token_required
def download(tenant_id, dataset_id, document_id):
    """
    Download a document from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    produces:
      - application/octet-stream
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to download.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Document file stream.
        schema:
          type: file
      400:
        description: Error message.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(
            message=f"The dataset doesn't own the document {document_id}."
        )
    # Resolve the object-storage (MinIO) bucket and key for the document.
    doc_id, doc_location = File2DocumentService.get_storage_address(
        doc_id=document_id
    )
    file_stream = STORAGE_IMPL.get(doc_id, doc_location)
    if not file_stream:
        return construct_json_result(
            message="This file is empty.", code=settings.RetCode.DATA_ERROR
        )
    file = BytesIO(file_stream)
    # Use send_file with a proper filename and a default MIME type.
    return send_file(
        file,
        as_attachment=True,
        download_name=doc[0].name,
        mimetype="application/octet-stream",
    )

@manager.route("/datasets/<dataset_id>/documents", methods=["GET"])
@token_required
def list_docs(dataset_id, tenant_id):
    """
    List documents in a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: query
        name: id
        type: string
        required: false
        description: Filter by document ID.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: query
        name: orderby
        type: string
        required: false
        default: "create_time"
        description: Field to order by.
      - in: query
        name: desc
        type: boolean
        required: false
        default: true
        description: Order in descending.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of documents.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of documents.
            docs:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    id = request.args.get("id")
    name = request.args.get("name")
    # Only validate the filters that were actually supplied.
    if id and not DocumentService.query(id=id, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {id}.")
    if name and not DocumentService.query(name=name, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {name}.")
    page = int(request.args.get("page", 1))
    keywords = request.args.get("keywords", "")
    page_size = int(request.args.get("page_size", 30))
    orderby = request.args.get("orderby", "create_time")
    desc = request.args.get("desc") != "False"
    docs, tol = DocumentService.get_list(
        dataset_id, page, page_size, orderby, desc, keywords, id, name
    )
    # Rename internal field names to the public API names.
    renamed_doc_list = []
    for doc in docs:
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        # Numeric run-status codes are exposed as readable strings.
        run_mapping = {
            "0": "UNSTART",
            "1": "RUNNING",
            "2": "CANCEL",
            "3": "DONE",
            "4": "FAIL",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
            if key == "run":
                renamed_doc["run"] = run_mapping.get(str(value))
        renamed_doc_list.append(renamed_doc)
    return get_result(data={"total": tol, "docs": renamed_doc_list})

@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])
@token_required
def delete(tenant_id, dataset_id):
    """
    Delete documents from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Document deletion parameters.
        required: true
        schema:
          type: object
          properties:
            ids:
              type: array
              items:
                type: string
              description: List of document IDs to delete.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Documents deleted successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req:
        doc_ids = None
    else:
        doc_ids = req.get("ids")
    if not doc_ids:
        # No explicit IDs: delete every document in the dataset.
        doc_list = []
        docs = DocumentService.query(kb_id=dataset_id)
        for doc in docs:
            doc_list.append(doc.id)
    else:
        doc_list = doc_ids
    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)
    errors = ""
    for doc_id in doc_list:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_error_data_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                return get_error_data_result(message="Tenant not found!")
            b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
            if not DocumentService.remove_document(doc, tenant_id):
                return get_error_data_result(
                    message="Database error (Document removal)!"
                )
            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete(
                [
                    File.source_type == FileSource.KNOWLEDGEBASE,
                    File.id == f2d[0].file_id,
                ]
            )
            File2DocumentService.delete_by_document_id(doc_id)
            # Remove the raw file from object storage as well.
            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            errors += str(e)
    if errors:
        return get_result(message=errors, code=settings.RetCode.SERVER_ERROR)
    return get_result()
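
# The two endpoints below start and cancel asynchronous chunking: `parse`
# resets a document's counters and queues background tasks for it, while
# `stop_parsing` marks it as cancelled and drops the chunks produced so far.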
@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])
@token_required
def parse(tenant_id, dataset_id):
    """
    Start parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to parse.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing started successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if doc[0].progress != 0.0:
            return get_error_data_result(
                "Can't parse a document whose progress is not 0."
            )
        info = {"run": "1", "progress": 0}
        info["progress_msg"] = ""
        info["chunk_num"] = 0
        info["token_num"] = 0
        DocumentService.update_by_id(id, info)
        # Drop any chunks and pending tasks left over from a previous run.
        settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
        TaskService.filter_delete([Task.doc_id == id])
        e, doc = DocumentService.get_by_id(id)
        doc = doc.to_dict()
        doc["tenant_id"] = tenant_id
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
        queue_tasks(doc, bucket, name)
    return get_result()

@manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])
@token_required
def stop_parsing(tenant_id, dataset_id):
    """
    Stop parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Stop parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to stop parsing.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing stopped successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
            return get_error_data_result(
                "Can't stop parsing document with progress at 0 or 1"
            )
        info = {"run": "2", "progress": 0, "chunk_num": 0}
        DocumentService.update_by_id(id, info)
        settings.docStoreConn.delete({"doc_id": doc[0].id}, search.index_name(tenant_id), dataset_id)
    return get_result()

@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])
@token_required
def list_chunks(tenant_id, dataset_id, document_id):
    """
    List chunks of a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of chunks.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of chunks.
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  important_keywords:
                    type: array
                    items:
                      type: string
                    description: Important keywords.
                  image_id:
                    type: string
                    description: Image ID associated with the chunk.
            doc:
              type: object
              description: Document details.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.args
    doc_id = document_id
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req.get("keywords", "")
    query = {
        "doc_ids": [doc_id],
        "page": page,
        "size": size,
        "question": question,
        "sort": True,
    }
    key_mapping = {
        "chunk_num": "chunk_count",
        "kb_id": "dataset_id",
        "token_num": "token_count",
        "parser_id": "chunk_method",
    }
    run_mapping = {
        "0": "UNSTART",
        "1": "RUNNING",
        "2": "CANCEL",
        "3": "DONE",
        "4": "FAIL",
    }
    doc = doc.to_dict()
    renamed_doc = {}
    for key, value in doc.items():
        new_key = key_mapping.get(key, key)
        renamed_doc[new_key] = value
        if key == "run":
            renamed_doc["run"] = run_mapping.get(str(value))
    res = {"total": 0, "chunks": [], "doc": renamed_doc}
    origin_chunks = []
    if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
        sres = settings.retrievaler.search(
            query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True
        )
        res["total"] = sres.total
        sign = 0
        for id in sres.ids:
            d = {
                "id": id,
                "content_with_weight": (
                    rmSpace(sres.highlight[id])
                    if question and id in sres.highlight
                    else sres.field[id].get("content_with_weight", "")
                ),
                "doc_id": sres.field[id]["doc_id"],
                "docnm_kwd": sres.field[id]["docnm_kwd"],
                "important_kwd": sres.field[id].get("important_kwd", []),
                "img_id": sres.field[id].get("img_id", ""),
                "available_int": sres.field[id].get("available_int", 1),
                "positions": sres.field[id].get("position_int", "").split("\t"),
            }
            # Position data is packed as tab-separated groups of five numbers.
            if len(d["positions"]) % 5 == 0:
                poss = []
                for i in range(0, len(d["positions"]), 5):
                    poss.append(
                        [
                            float(d["positions"][i]),
                            float(d["positions"][i + 1]),
                            float(d["positions"][i + 2]),
                            float(d["positions"][i + 3]),
                            float(d["positions"][i + 4]),
                        ]
                    )
                d["positions"] = poss
            origin_chunks.append(d)
            if req.get("id"):
                if req.get("id") == id:
                    origin_chunks.clear()
                    origin_chunks.append(d)
                    sign = 1
                    break
        if req.get("id"):
            if sign == 0:
                return get_error_data_result(f"Can't find this chunk {req.get('id')}")
    for chunk in origin_chunks:
        key_mapping = {
            "id": "id",
            "content_with_weight": "content",
            "doc_id": "document_id",
            "important_kwd": "important_keywords",
            "img_id": "image_id",
            "available_int": "available",
        }
        renamed_chunk = {}
        for key, value in chunk.items():
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
        if renamed_chunk["available"] == 0:
            renamed_chunk["available"] = False
        if renamed_chunk["available"] == 1:
            renamed_chunk["available"] = True
        res["chunks"].append(renamed_chunk)
    return get_result(data=res)

@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
)
@token_required
def add_chunk(tenant_id, dataset_id, document_id):
    """
    Add a chunk to a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk data.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              required: true
              description: Content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Important keywords.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk added successfully.
        schema:
          type: object
          properties:
            chunk:
              type: object
              properties:
                id:
                  type: string
                  description: Chunk ID.
                content:
                  type: string
                  description: Chunk content.
                document_id:
                  type: string
                  description: ID of the document.
                important_keywords:
                  type: array
                  items:
                    type: string
                  description: Important keywords.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if not req.get("content"):
        return get_error_data_result(message="`content` is required")
    if "important_keywords" in req:
        if not isinstance(req["important_keywords"], list):
            return get_error_data_result(
                "`important_keywords` is required to be a list"
            )
    # The chunk ID is the MD5 digest of the content plus the document ID.
    md5 = hashlib.md5()
    md5.update((req["content"] + document_id).encode("utf-8"))
    chunk_id = md5.hexdigest()
    d = {
        "id": chunk_id,
        "content_ltks": rag_tokenizer.tokenize(req["content"]),
        "content_with_weight": req["content"],
    }
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["important_kwd"] = req.get("important_keywords", [])
    d["important_tks"] = rag_tokenizer.tokenize(
        " ".join(req.get("important_keywords", []))
    )
    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
    d["kb_id"] = dataset_id
    d["docnm_kwd"] = doc.name
    d["doc_id"] = document_id
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    # Embed the document title and the chunk content, weighted 0.1 / 0.9.
    v, c = embd_mdl.encode([doc.name, req["content"]])
    v = 0.1 * v[0] + 0.9 * v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
    DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
    # Rename internal field names to the public API names.
    key_mapping = {
        "id": "id",
        "content_with_weight": "content",
        "doc_id": "document_id",
        "important_kwd": "important_keywords",
        "kb_id": "dataset_id",
        "create_timestamp_flt": "create_timestamp",
        "create_time": "create_time",
        "document_keyword": "document",
    }
    renamed_chunk = {}
    for key, value in d.items():
        if key in key_mapping:
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
    return get_result(data={"chunk": renamed_chunk})
    # return get_result(data={"chunk_id": chunk_id})

@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
)
@token_required
def rm_chunk(tenant_id, dataset_id, document_id):
    """
    Remove chunks from a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk removal parameters.
        required: true
        schema:
          type: object
          properties:
            chunk_ids:
              type: array
              items:
                type: string
              description: List of chunk IDs to remove.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunks removed successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    # Without "chunk_ids", every chunk of the document is deleted.
    condition = {"doc_id": document_id}
    if "chunk_ids" in req:
        condition["id"] = req["chunk_ids"]
    chunk_number = settings.docStoreConn.delete(
        condition, search.index_name(tenant_id), dataset_id
    )
    if chunk_number != 0:
        DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
    if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
        return get_error_data_result(
            message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req['chunk_ids'])}"
        )
    return get_result(message=f"deleted {chunk_number} chunks")
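
# Updating a chunk re-tokenizes and re-embeds its content; for documents
# chunked with the QA parser, the content must be a question/answer pair
# separated by a tab or newline.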
@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
)
@token_required
def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
    """
    Update a chunk within a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: path
        name: chunk_id
        type: string
        required: true
        description: ID of the chunk to update.
      - in: body
        name: body
        description: Chunk update parameters.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              description: Updated content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Updated important keywords.
            available:
              type: boolean
              description: Availability status of the chunk.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk updated successfully.
        schema:
          type: object
    """
    chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
    if chunk is None:
        return get_error_data_result(f"Can't find this chunk {chunk_id}")
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if "content" in req:
        content = req["content"]
    else:
        content = chunk.get("content_with_weight", "")
    d = {"id": chunk_id, "content_with_weight": content}
    d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if "important_keywords" in req:
        if not isinstance(req["important_keywords"], list):
            return get_error_data_result("`important_keywords` should be a list")
        d["important_kwd"] = req.get("important_keywords")
        d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
    if "available" in req:
        d["available_int"] = int(req["available"])
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    if doc.parser_id == ParserType.QA:
        arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_error_data_result(
                message="Q&A must be separated by TAB/ENTER key."
            )
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(
            d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
        )
    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
    return get_result()
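
# Cross-dataset retrieval: all requested datasets must share the same
# embedding model; an optional rerank model and LLM keyword extraction can
# refine the results.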
@manager.route("/retrieval", methods=["POST"])
@token_required
def retrieval_test(tenant_id):
    """
    Retrieve chunks based on a query.
    ---
    tags:
      - Retrieval
    security:
      - ApiKeyAuth: []
    parameters:
      - in: body
        name: body
        description: Retrieval parameters.
        required: true
        schema:
          type: object
          properties:
            dataset_ids:
              type: array
              items:
                type: string
              required: true
              description: List of dataset IDs to search in.
            question:
              type: string
              required: true
              description: Query string.
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to filter.
            similarity_threshold:
              type: number
              format: float
              description: Similarity threshold.
            vector_similarity_weight:
              type: number
              format: float
              description: Vector similarity weight.
            top_k:
              type: integer
              description: Maximum number of chunks to return.
            highlight:
              type: boolean
              description: Whether to highlight matched content.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Retrieval results.
        schema:
          type: object
          properties:
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  similarity:
                    type: number
                    format: float
                    description: Similarity score.
    """
    req = request.json
    if not req.get("dataset_ids"):
        return get_error_data_result("`dataset_ids` is required.")
    kb_ids = req["dataset_ids"]
    if not isinstance(kb_ids, list):
        return get_error_data_result("`dataset_ids` should be a list")
    kbs = KnowledgebaseService.get_by_ids(kb_ids)
    for id in kb_ids:
        if not KnowledgebaseService.accessible(kb_id=id, user_id=tenant_id):
            return get_error_data_result(f"You don't own the dataset {id}.")
    embd_nms = list(set([kb.embd_id for kb in kbs]))
    if len(embd_nms) != 1:
        return get_result(
            message="Datasets use different embedding models.",
            code=settings.RetCode.AUTHENTICATION_ERROR,
        )
    if "question" not in req:
        return get_error_data_result("`question` is required.")
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req["question"]
    doc_ids = req.get("document_ids", [])
    if not isinstance(doc_ids, list):
        return get_error_data_result("`documents` should be a list")
    doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
    for doc_id in doc_ids:
        if doc_id not in doc_ids_list:
            return get_error_data_result(
                f"The datasets don't own the document {doc_id}"
            )
    similarity_threshold = float(req.get("similarity_threshold", 0.2))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = int(req.get("top_k", 1024))
    if req.get("highlight") == "False" or req.get("highlight") == "false":
        highlight = False
    else:
        highlight = True
    try:
        e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
        if not e:
            return get_error_data_result(message="Dataset not found!")
        embd_mdl = TenantLLMService.model_instance(
            kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
        )
        rerank_mdl = None
        if req.get("rerank_id"):
            rerank_mdl = TenantLLMService.model_instance(
                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
            )
        if req.get("keyword", False):
            # Optionally expand the query with LLM-extracted keywords.
            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
            question += keyword_extraction(chat_mdl, question)
        retr = settings.retrievaler if kb.parser_id != ParserType.KG else settings.kg_retrievaler
        ranks = retr.retrieval(
            question,
            embd_mdl,
            kb.tenant_id,
            kb_ids,
            page,
            size,
            similarity_threshold,
            vector_similarity_weight,
            top,
            doc_ids,
            rerank_mdl=rerank_mdl,
            highlight=highlight,
        )
        for c in ranks["chunks"]:
            if "vector" in c:
                del c["vector"]
        # Rename internal field names to the public API names.
        renamed_chunks = []
        for chunk in ranks["chunks"]:
            key_mapping = {
                "chunk_id": "id",
                "content_with_weight": "content",
                "doc_id": "document_id",
                "important_kwd": "important_keywords",
                "docnm_kwd": "document_keyword",
            }
            rename_chunk = {}
            for key, value in chunk.items():
                new_key = key_mapping.get(key, key)
                rename_chunk[new_key] = value
            renamed_chunks.append(rename_chunk)
        ranks["chunks"] = renamed_chunks
        return get_result(data=ranks)
    except Exception as e:
        if str(e).find("not_found") > 0:
            return get_result(
                message="No chunk found! Check the chunk status please!",
                code=settings.RetCode.DATA_ERROR,
            )
        return server_error_response(e)