#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pathlib
import datetime
from api.db.services.dialog_service import keyword_extraction
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import rag_tokenizer
from api.db import LLMType, ParserType
from api.db.services.llm_service import TenantLLMService
from api import settings
import hashlib
import re
from api.utils.api_utils import token_required
from api.db.db_models import Task
from api.db.services.task_service import TaskService, queue_tasks
from api.utils.api_utils import server_error_response
from api.utils.api_utils import get_result, get_error_data_result
from io import BytesIO
from flask import request, send_file
from api.db import FileSource, TaskStatus, FileType
from api.db.db_models import File
from api.db.services.document_service import DocumentService
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import construct_json_result, get_parser_config
from rag.nlp import search
from rag.utils import rmSpace
from rag.utils.storage_factory import STORAGE_IMPL
import os

MAXIMUM_OF_UPLOADING_FILES = 256
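

# Note: `manager` used by the route decorators below is not imported in this
# module; it appears to be injected by the API app loader when the module is
# registered as a Flask blueprint.
#
# POST /datasets/<dataset_id>/documents
# Uploads one or more files (multipart form field "file") into a dataset,
# rejecting empty selections and batches whose total size exceeds 10 MB, then
# returns the created document records with SDK-facing field names.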
@manager.route("/datasets/<dataset_id>/documents", methods=["POST"])
@token_required
def upload(dataset_id, tenant_id):
    """
    Upload documents to a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: formData
        name: file
        type: file
        required: true
        description: Document files to upload.
    responses:
      200:
        description: Successfully uploaded documents.
        schema:
          type: object
          properties:
            data:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if "file" not in request.files:
        return get_error_data_result(
            message="No file part!", code=settings.RetCode.ARGUMENT_ERROR
        )
    file_objs = request.files.getlist("file")
    for file_obj in file_objs:
        if file_obj.filename == "":
            return get_result(
                message="No file selected!", code=settings.RetCode.ARGUMENT_ERROR
            )
    # total size
    total_size = 0
    for file_obj in file_objs:
        file_obj.seek(0, os.SEEK_END)
        total_size += file_obj.tell()
        file_obj.seek(0)
    MAX_TOTAL_FILE_SIZE = 10 * 1024 * 1024
    if total_size > MAX_TOTAL_FILE_SIZE:
        return get_result(
            message=f"Total file size exceeds 10MB limit! ({total_size / (1024 * 1024):.2f} MB)",
            code=settings.RetCode.ARGUMENT_ERROR,
        )
    e, kb = KnowledgebaseService.get_by_id(dataset_id)
    if not e:
        raise LookupError(f"Can't find the dataset with ID {dataset_id}!")
    err, files = FileService.upload_document(kb, file_objs, tenant_id)
    if err:
        return get_result(message="\n".join(err), code=settings.RetCode.SERVER_ERROR)
    # rename keys to the SDK-facing field names
    renamed_doc_list = []
    for file in files:
        doc = file[0]
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
        renamed_doc["run"] = "UNSTART"
        renamed_doc_list.append(renamed_doc)
    return get_result(data=renamed_doc_list)
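

# PUT /datasets/<dataset_id>/documents/<document_id>
# Updates a document's name, parser_config or chunk_method. A rename must keep
# the original file extension; changing chunk_method resets progress and clears
# any existing chunks from the doc store.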
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["PUT"])
@token_required
def update_doc(tenant_id, dataset_id, document_id):
    """
    Update a document within a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to update.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
      - in: body
        name: body
        description: Document update parameters.
        required: true
        schema:
          type: object
          properties:
            name:
              type: string
              description: New name of the document.
            parser_config:
              type: object
              description: Parser configuration.
            chunk_method:
              type: string
              description: Chunking method.
    responses:
      200:
        description: Document updated successfully.
        schema:
          type: object
    """
    req = request.json
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message="You don't own the dataset.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(message="The dataset doesn't own the document.")
    doc = doc[0]
    if "chunk_count" in req:
        if req["chunk_count"] != doc.chunk_num:
            return get_error_data_result(message="Can't change `chunk_count`.")
    if "token_count" in req:
        if req["token_count"] != doc.token_num:
            return get_error_data_result(message="Can't change `token_count`.")
    if "progress" in req:
        if req["progress"] != doc.progress:
            return get_error_data_result(message="Can't change `progress`.")
    if "name" in req and req["name"] != doc.name:
        if (
            pathlib.Path(req["name"].lower()).suffix
            != pathlib.Path(doc.name.lower()).suffix
        ):
            return get_result(
                message="The extension of file can't be changed",
                code=settings.RetCode.ARGUMENT_ERROR,
            )
        for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
            if d.name == req["name"]:
                return get_error_data_result(
                    message="Duplicated document name in the same dataset."
                )
        if not DocumentService.update_by_id(document_id, {"name": req["name"]}):
            return get_error_data_result(message="Database error (Document rename)!")
        informs = File2DocumentService.get_by_document_id(document_id)
        if informs:
            e, file = FileService.get_by_id(informs[0].file_id)
            FileService.update_by_id(file.id, {"name": req["name"]})
    if "parser_config" in req:
        DocumentService.update_parser_config(doc.id, req["parser_config"])
    if "chunk_method" in req:
        valid_chunk_method = {
            "naive",
            "manual",
            "qa",
            "table",
            "paper",
            "book",
            "laws",
            "presentation",
            "picture",
            "one",
            "knowledge_graph",
            "email",
        }
        if req.get("chunk_method") not in valid_chunk_method:
            return get_error_data_result(
                f"`chunk_method` {req['chunk_method']} doesn't exist"
            )
        if doc.parser_id.lower() == req["chunk_method"].lower():
            return get_result()
        if doc.type == FileType.VISUAL or re.search(r"\.(ppt|pptx|pages)$", doc.name):
            return get_error_data_result(message="Not supported yet!")
        e = DocumentService.update_by_id(
            doc.id,
            {
                "parser_id": req["chunk_method"],
                "progress": 0,
                "progress_msg": "",
                "run": TaskStatus.UNSTART.value,
            },
        )
        if not e:
            return get_error_data_result(message="Document not found!")
        req["parser_config"] = get_parser_config(
            req["chunk_method"], req.get("parser_config")
        )
        DocumentService.update_parser_config(doc.id, req["parser_config"])
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(
                doc.id,
                doc.kb_id,
                doc.token_num * -1,
                doc.chunk_num * -1,
                doc.process_duation * -1,  # "process_duation" is the field name used by the DB model
            )
            if not e:
                return get_error_data_result(message="Document not found!")
            settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), dataset_id)
    return get_result()
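

# GET /datasets/<dataset_id>/documents/<document_id>
# Streams the stored file back to the caller as an attachment
# (application/octet-stream).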
@manager.route("/datasets/<dataset_id>/documents/<document_id>", methods=["GET"])
@token_required
def download(tenant_id, dataset_id, document_id):
    """
    Download a document from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    produces:
      - application/octet-stream
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document to download.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Document file stream.
        schema:
          type: file
      400:
        description: Error message.
        schema:
          type: object
    """
    if not KnowledgebaseService.query(id=dataset_id, tenant_id=tenant_id):
        return get_error_data_result(message=f"You do not own the dataset {dataset_id}.")
    doc = DocumentService.query(kb_id=dataset_id, id=document_id)
    if not doc:
        return get_error_data_result(
            message=f"The dataset doesn't own the document {document_id}."
        )
    # The process of downloading
    doc_id, doc_location = File2DocumentService.get_storage_address(
        doc_id=document_id
    )  # minio address
    file_stream = STORAGE_IMPL.get(doc_id, doc_location)
    if not file_stream:
        return construct_json_result(
            message="This file is empty.", code=settings.RetCode.DATA_ERROR
        )
    file = BytesIO(file_stream)
    # Use send_file with a proper filename and MIME type
    return send_file(
        file,
        as_attachment=True,
        download_name=doc[0].name,
        mimetype="application/octet-stream",  # Set a default MIME type
    )
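

# GET /datasets/<dataset_id>/documents
# Paginated listing with optional filters (id, name, keywords) and ordering
# (orderby, desc); numeric `run` statuses are mapped to readable names
# (UNSTART/RUNNING/CANCEL/DONE/FAIL).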
@manager.route("/datasets/<dataset_id>/documents", methods=["GET"])
@token_required
def list_docs(dataset_id, tenant_id):
    """
    List documents in a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: query
        name: id
        type: string
        required: false
        description: Filter by document ID.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: query
        name: orderby
        type: string
        required: false
        default: "create_time"
        description: Field to order by.
      - in: query
        name: desc
        type: boolean
        required: false
        default: true
        description: Whether to sort in descending order.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of documents.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of documents.
            docs:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Document ID.
                  name:
                    type: string
                    description: Document name.
                  chunk_count:
                    type: integer
                    description: Number of chunks.
                  token_count:
                    type: integer
                    description: Number of tokens.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  chunk_method:
                    type: string
                    description: Chunking method used.
                  run:
                    type: string
                    description: Processing status.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    id = request.args.get("id")
    name = request.args.get("name")
    if not DocumentService.query(id=id, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {id}.")
    if not DocumentService.query(name=name, kb_id=dataset_id):
        return get_error_data_result(message=f"You don't own the document {name}.")
    page = int(request.args.get("page", 1))
    keywords = request.args.get("keywords", "")
    page_size = int(request.args.get("page_size", 30))
    orderby = request.args.get("orderby", "create_time")
    if request.args.get("desc") == "False":
        desc = False
    else:
        desc = True
    docs, tol = DocumentService.get_list(
        dataset_id, page, page_size, orderby, desc, keywords, id, name
    )
    # rename keys to the SDK-facing field names
    renamed_doc_list = []
    for doc in docs:
        key_mapping = {
            "chunk_num": "chunk_count",
            "kb_id": "dataset_id",
            "token_num": "token_count",
            "parser_id": "chunk_method",
        }
        run_mapping = {
            "0": "UNSTART",
            "1": "RUNNING",
            "2": "CANCEL",
            "3": "DONE",
            "4": "FAIL",
        }
        renamed_doc = {}
        for key, value in doc.items():
            new_key = key_mapping.get(key, key)
            renamed_doc[new_key] = value
            if key == "run":
                # map the stored status code to its readable name
                renamed_doc["run"] = run_mapping.get(str(value))
        renamed_doc_list.append(renamed_doc)
    return get_result(data={"total": tol, "docs": renamed_doc_list})
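

# DELETE /datasets/<dataset_id>/documents
# Deletes the documents listed in the JSON body, e.g. {"ids": ["<doc_id>", ...]}
# (illustrative body); when no ids are given, every document in the dataset is
# removed, including its file records and stored blobs.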
@manager.route("/datasets/<dataset_id>/documents", methods=["DELETE"])
@token_required
def delete(tenant_id, dataset_id):
    """
    Delete documents from a dataset.
    ---
    tags:
      - Documents
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Document deletion parameters.
        required: true
        schema:
          type: object
          properties:
            ids:
              type: array
              items:
                type: string
              description: List of document IDs to delete.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Documents deleted successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req:
        doc_ids = None
    else:
        doc_ids = req.get("ids")
    if not doc_ids:
        doc_list = []
        docs = DocumentService.query(kb_id=dataset_id)
        for doc in docs:
            doc_list.append(doc.id)
    else:
        doc_list = doc_ids
    root_folder = FileService.get_root_folder(tenant_id)
    pf_id = root_folder["id"]
    FileService.init_knowledgebase_docs(pf_id, tenant_id)
    errors = ""
    for doc_id in doc_list:
        try:
            e, doc = DocumentService.get_by_id(doc_id)
            if not e:
                return get_error_data_result(message="Document not found!")
            tenant_id = DocumentService.get_tenant_id(doc_id)
            if not tenant_id:
                return get_error_data_result(message="Tenant not found!")
            b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
            if not DocumentService.remove_document(doc, tenant_id):
                return get_error_data_result(
                    message="Database error (Document removal)!"
                )
            f2d = File2DocumentService.get_by_document_id(doc_id)
            FileService.filter_delete(
                [
                    File.source_type == FileSource.KNOWLEDGEBASE,
                    File.id == f2d[0].file_id,
                ]
            )
            File2DocumentService.delete_by_document_id(doc_id)
            STORAGE_IMPL.rm(b, n)
        except Exception as e:
            errors += str(e)
    if errors:
        return get_result(message=errors, code=settings.RetCode.SERVER_ERROR)
    return get_result()
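

# POST /datasets/<dataset_id>/chunks
# Queues parsing for the documents in {"document_ids": [...]}: resets their
# progress and chunk/token counters, drops any previous chunks from the doc
# store and enqueues new parsing tasks.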
@manager.route("/datasets/<dataset_id>/chunks", methods=["POST"])
@token_required
def parse(tenant_id, dataset_id):
    """
    Start parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to parse.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing started successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if doc[0].progress != 0.0:
            return get_error_data_result(
                "Can't parse the document: it is already being parsed or has been parsed (progress is not 0)."
            )
        info = {"run": "1", "progress": 0}
        info["progress_msg"] = ""
        info["chunk_num"] = 0
        info["token_num"] = 0
        DocumentService.update_by_id(id, info)
        settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id)
        TaskService.filter_delete([Task.doc_id == id])
        e, doc = DocumentService.get_by_id(id)
        doc = doc.to_dict()
        doc["tenant_id"] = tenant_id
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
        queue_tasks(doc, bucket, name)
    return get_result()
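

# DELETE /datasets/<dataset_id>/chunks
# Cancels parsing for the documents in {"document_ids": [...]} (run status "2")
# and removes the chunks produced so far.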
@manager.route("/datasets/<dataset_id>/chunks", methods=["DELETE"])
@token_required
def stop_parsing(tenant_id, dataset_id):
    """
    Stop parsing documents into chunks.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: body
        name: body
        description: Stop parsing parameters.
        required: true
        schema:
          type: object
          properties:
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to stop parsing.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Parsing stopped successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    if not req.get("document_ids"):
        return get_error_data_result("`document_ids` is required")
    for id in req["document_ids"]:
        doc = DocumentService.query(id=id, kb_id=dataset_id)
        if not doc:
            return get_error_data_result(message=f"You don't own the document {id}.")
        if int(doc[0].progress) == 1 or int(doc[0].progress) == 0:
            return get_error_data_result(
                "Can't stop parsing document with progress at 0 or 1"
            )
        info = {"run": "2", "progress": 0, "chunk_num": 0}
        DocumentService.update_by_id(id, info)
        settings.docStoreConn.delete({"doc_id": doc[0].id}, search.index_name(tenant_id), dataset_id)
    return get_result()
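

# GET /datasets/<dataset_id>/documents/<document_id>/chunks
# Lists the chunks of a document (or, with the `id` query parameter, fetches a
# single chunk), with optional keyword search and decoded page positions.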
@manager.route("/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["GET"])
@token_required
def list_chunks(tenant_id, dataset_id, document_id):
    """
    List chunks of a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: query
        name: page
        type: integer
        required: false
        default: 1
        description: Page number.
      - in: query
        name: page_size
        type: integer
        required: false
        default: 30
        description: Number of items per page.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: List of chunks.
        schema:
          type: object
          properties:
            total:
              type: integer
              description: Total number of chunks.
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  important_keywords:
                    type: array
                    items:
                      type: string
                    description: Important keywords.
                  image_id:
                    type: string
                    description: Image ID associated with the chunk.
            doc:
              type: object
              description: Document details.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.args
    doc_id = document_id
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req.get("keywords", "")
    query = {
        "doc_ids": [doc_id],
        "page": page,
        "size": size,
        "question": question,
        "sort": True,
    }
    key_mapping = {
        "chunk_num": "chunk_count",
        "kb_id": "dataset_id",
        "token_num": "token_count",
        "parser_id": "chunk_method",
    }
    run_mapping = {
        "0": "UNSTART",
        "1": "RUNNING",
        "2": "CANCEL",
        "3": "DONE",
        "4": "FAIL",
    }
    doc = doc.to_dict()
    renamed_doc = {}
    for key, value in doc.items():
        new_key = key_mapping.get(key, key)
        renamed_doc[new_key] = value
        if key == "run":
            renamed_doc["run"] = run_mapping.get(str(value))
    res = {"total": 0, "chunks": [], "doc": renamed_doc}
    origin_chunks = []
    if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
        sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
                                           highlight=True)
        res["total"] = sres.total
        sign = 0
        for id in sres.ids:
            d = {
                "id": id,
                "content_with_weight": (
                    rmSpace(sres.highlight[id])
                    if question and id in sres.highlight
                    else sres.field[id].get("content_with_weight", "")
                ),
                "doc_id": sres.field[id]["doc_id"],
                "docnm_kwd": sres.field[id]["docnm_kwd"],
                "important_kwd": sres.field[id].get("important_kwd", []),
                "img_id": sres.field[id].get("img_id", ""),
                "available_int": sres.field[id].get("available_int", 1),
                "positions": sres.field[id].get("position_int", "").split("\t"),
            }
            # decode the tab-separated position string into groups of five floats
            if len(d["positions"]) % 5 == 0:
                poss = []
                for i in range(0, len(d["positions"]), 5):
                    poss.append(
                        [
                            float(d["positions"][i]),
                            float(d["positions"][i + 1]),
                            float(d["positions"][i + 2]),
                            float(d["positions"][i + 3]),
                            float(d["positions"][i + 4]),
                        ]
                    )
                d["positions"] = poss
            origin_chunks.append(d)
            if req.get("id"):
                if req.get("id") == id:
                    origin_chunks.clear()
                    origin_chunks.append(d)
                    sign = 1
                    break
        if req.get("id"):
            if sign == 0:
                return get_error_data_result(f"Can't find this chunk {req.get('id')}")
    for chunk in origin_chunks:
        key_mapping = {
            "id": "id",
            "content_with_weight": "content",
            "doc_id": "document_id",
            "important_kwd": "important_keywords",
            "img_id": "image_id",
            "available_int": "available",
        }
        renamed_chunk = {}
        for key, value in chunk.items():
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
        if renamed_chunk["available"] == 0:
            renamed_chunk["available"] = False
        if renamed_chunk["available"] == 1:
            renamed_chunk["available"] = True
        res["chunks"].append(renamed_chunk)
    return get_result(data=res)
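

# POST /datasets/<dataset_id>/documents/<document_id>/chunks
# Adds a manually written chunk. The chunk ID is the MD5 of content + document
# ID, and its vector is a 0.1/0.9 weighted mix of the document-name and content
# embeddings.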
@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["POST"]
)
@token_required
def add_chunk(tenant_id, dataset_id, document_id):
    """
    Add a chunk to a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk data.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              required: true
              description: Content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Important keywords.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk added successfully.
        schema:
          type: object
          properties:
            chunk:
              type: object
              properties:
                id:
                  type: string
                  description: Chunk ID.
                content:
                  type: string
                  description: Chunk content.
                document_id:
                  type: string
                  description: ID of the document.
                important_keywords:
                  type: array
                  items:
                    type: string
                  description: Important keywords.
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if not req.get("content"):
        return get_error_data_result(message="`content` is required")
    if "important_keywords" in req:
        if type(req["important_keywords"]) != list:
            return get_error_data_result(
                "`important_keywords` is required to be a list"
            )
    md5 = hashlib.md5()
    md5.update((req["content"] + document_id).encode("utf-8"))
    chunk_id = md5.hexdigest()
    d = {
        "id": chunk_id,
        "content_ltks": rag_tokenizer.tokenize(req["content"]),
        "content_with_weight": req["content"],
    }
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    d["important_kwd"] = req.get("important_keywords", [])
    d["important_tks"] = rag_tokenizer.tokenize(
        " ".join(req.get("important_keywords", []))
    )
    d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
    d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
    d["kb_id"] = dataset_id
    d["docnm_kwd"] = doc.name
    d["doc_id"] = document_id
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    v, c = embd_mdl.encode([doc.name, req["content"]])
    v = 0.1 * v[0] + 0.9 * v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    settings.docStoreConn.insert([d], search.index_name(tenant_id), dataset_id)
    DocumentService.increment_chunk_num(doc.id, doc.kb_id, c, 1, 0)
    # rename keys
    key_mapping = {
        "id": "id",
        "content_with_weight": "content",
        "doc_id": "document_id",
        "important_kwd": "important_keywords",
        "kb_id": "dataset_id",
        "create_timestamp_flt": "create_timestamp",
        "create_time": "create_time",
        "document_keyword": "document",
    }
    renamed_chunk = {}
    for key, value in d.items():
        if key in key_mapping:
            new_key = key_mapping.get(key, key)
            renamed_chunk[new_key] = value
    return get_result(data={"chunk": renamed_chunk})
    # return get_result(data={"chunk_id": chunk_id})
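

# DELETE /datasets/<dataset_id>/documents/<document_id>/chunks
# Deletes the chunks listed in {"chunk_ids": [...]}; without that filter it
# deletes every chunk of the document.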
@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks", methods=["DELETE"]
)
@token_required
def rm_chunk(tenant_id, dataset_id, document_id):
    """
    Remove chunks from a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: body
        name: body
        description: Chunk removal parameters.
        required: true
        schema:
          type: object
          properties:
            chunk_ids:
              type: array
              items:
                type: string
              description: List of chunk IDs to remove.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunks removed successfully.
        schema:
          type: object
    """
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    req = request.json
    condition = {"doc_id": document_id}
    if "chunk_ids" in req:
        condition["id"] = req["chunk_ids"]
    chunk_number = settings.docStoreConn.delete(condition, search.index_name(tenant_id), dataset_id)
    if chunk_number != 0:
        DocumentService.decrement_chunk_num(document_id, dataset_id, 1, chunk_number, 0)
    if "chunk_ids" in req and chunk_number != len(req["chunk_ids"]):
        return get_error_data_result(message=f"rm_chunk deleted chunks {chunk_number}, expect {len(req['chunk_ids'])}")
    return get_result(message=f"deleted {chunk_number} chunks")
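

# PUT /datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>
# Updates a chunk's content, important_keywords or availability and re-embeds
# it; for QA-parsed documents the content must be a question and an answer
# separated by a tab or newline.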
@manager.route(
    "/datasets/<dataset_id>/documents/<document_id>/chunks/<chunk_id>", methods=["PUT"]
)
@token_required
def update_chunk(tenant_id, dataset_id, document_id, chunk_id):
    """
    Update a chunk within a document.
    ---
    tags:
      - Chunks
    security:
      - ApiKeyAuth: []
    parameters:
      - in: path
        name: dataset_id
        type: string
        required: true
        description: ID of the dataset.
      - in: path
        name: document_id
        type: string
        required: true
        description: ID of the document.
      - in: path
        name: chunk_id
        type: string
        required: true
        description: ID of the chunk to update.
      - in: body
        name: body
        description: Chunk update parameters.
        required: true
        schema:
          type: object
          properties:
            content:
              type: string
              description: Updated content of the chunk.
            important_keywords:
              type: array
              items:
                type: string
              description: Updated important keywords.
            available:
              type: boolean
              description: Availability status of the chunk.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Chunk updated successfully.
        schema:
          type: object
    """
    chunk = settings.docStoreConn.get(chunk_id, search.index_name(tenant_id), [dataset_id])
    if chunk is None:
        return get_error_data_result(f"Can't find this chunk {chunk_id}")
    if not KnowledgebaseService.accessible(kb_id=dataset_id, user_id=tenant_id):
        return get_error_data_result(message=f"You don't own the dataset {dataset_id}.")
    doc = DocumentService.query(id=document_id, kb_id=dataset_id)
    if not doc:
        return get_error_data_result(
            message=f"You don't own the document {document_id}."
        )
    doc = doc[0]
    req = request.json
    if "content" in req:
        content = req["content"]
    else:
        content = chunk.get("content_with_weight", "")
    d = {"id": chunk_id, "content_with_weight": content}
    d["content_ltks"] = rag_tokenizer.tokenize(d["content_with_weight"])
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
    if "important_keywords" in req:
        if not isinstance(req["important_keywords"], list):
            return get_error_data_result("`important_keywords` should be a list")
        d["important_kwd"] = req.get("important_keywords")
        d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_keywords"]))
    if "available" in req:
        d["available_int"] = int(req["available"])
    embd_id = DocumentService.get_embd_id(document_id)
    embd_mdl = TenantLLMService.model_instance(
        tenant_id, LLMType.EMBEDDING.value, embd_id
    )
    if doc.parser_id == ParserType.QA:
        arr = [t for t in re.split(r"[\n\t]", d["content_with_weight"]) if len(t) > 1]
        if len(arr) != 2:
            return get_error_data_result(
                message="Q&A must be separated by TAB/ENTER key."
            )
        q, a = rmPrefix(arr[0]), rmPrefix(arr[1])
        d = beAdoc(
            d, arr[0], arr[1], not any([rag_tokenizer.is_chinese(t) for t in q + a])
        )
    v, c = embd_mdl.encode([doc.name, d["content_with_weight"]])
    v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
    d["q_%d_vec" % len(v)] = v.tolist()
    settings.docStoreConn.update({"id": chunk_id}, d, search.index_name(tenant_id), dataset_id)
    return get_result()
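

# POST /retrieval
# Cross-dataset retrieval test, e.g. {"dataset_ids": [...], "question": "..."}
# (illustrative body); all datasets must share one embedding model, and results
# are optionally re-ranked and highlighted before being returned with renamed keys.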
@manager.route("/retrieval", methods=["POST"])
@token_required
def retrieval_test(tenant_id):
    """
    Retrieve chunks based on a query.
    ---
    tags:
      - Retrieval
    security:
      - ApiKeyAuth: []
    parameters:
      - in: body
        name: body
        description: Retrieval parameters.
        required: true
        schema:
          type: object
          properties:
            dataset_ids:
              type: array
              items:
                type: string
              required: true
              description: List of dataset IDs to search in.
            question:
              type: string
              required: true
              description: Query string.
            document_ids:
              type: array
              items:
                type: string
              description: List of document IDs to filter.
            similarity_threshold:
              type: number
              format: float
              description: Similarity threshold.
            vector_similarity_weight:
              type: number
              format: float
              description: Vector similarity weight.
            top_k:
              type: integer
              description: Maximum number of chunks to return.
            highlight:
              type: boolean
              description: Whether to highlight matched content.
      - in: header
        name: Authorization
        type: string
        required: true
        description: Bearer token for authentication.
    responses:
      200:
        description: Retrieval results.
        schema:
          type: object
          properties:
            chunks:
              type: array
              items:
                type: object
                properties:
                  id:
                    type: string
                    description: Chunk ID.
                  content:
                    type: string
                    description: Chunk content.
                  document_id:
                    type: string
                    description: ID of the document.
                  dataset_id:
                    type: string
                    description: ID of the dataset.
                  similarity:
                    type: number
                    format: float
                    description: Similarity score.
    """
    req = request.json
    if not req.get("dataset_ids"):
        return get_error_data_result("`dataset_ids` is required.")
    kb_ids = req["dataset_ids"]
    if not isinstance(kb_ids, list):
        return get_error_data_result("`dataset_ids` should be a list")
    kbs = KnowledgebaseService.get_by_ids(kb_ids)
    for id in kb_ids:
        if not KnowledgebaseService.accessible(kb_id=id, user_id=tenant_id):
            return get_error_data_result(f"You don't own the dataset {id}.")
    embd_nms = list(set([kb.embd_id for kb in kbs]))
    if len(embd_nms) != 1:
        return get_result(
            message="Datasets use different embedding models.",
            code=settings.RetCode.AUTHENTICATION_ERROR,
        )
    if "question" not in req:
        return get_error_data_result("`question` is required.")
    page = int(req.get("page", 1))
    size = int(req.get("page_size", 30))
    question = req["question"]
    doc_ids = req.get("document_ids", [])
    if not isinstance(doc_ids, list):
        return get_error_data_result("`document_ids` should be a list")
    doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
    for doc_id in doc_ids:
        if doc_id not in doc_ids_list:
            return get_error_data_result(
                f"The datasets don't own the document {doc_id}"
            )
    similarity_threshold = float(req.get("similarity_threshold", 0.2))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = int(req.get("top_k", 1024))
    if req.get("highlight") == "False" or req.get("highlight") == "false":
        highlight = False
    else:
        highlight = True
    try:
        e, kb = KnowledgebaseService.get_by_id(kb_ids[0])
        if not e:
            return get_error_data_result(message="Dataset not found!")
        embd_mdl = TenantLLMService.model_instance(
            kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id
        )
        rerank_mdl = None
        if req.get("rerank_id"):
            rerank_mdl = TenantLLMService.model_instance(
                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"]
            )
        if req.get("keyword", False):
            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
            question += keyword_extraction(chat_mdl, question)
        retr = settings.retrievaler if kb.parser_id != ParserType.KG else settings.kg_retrievaler
        ranks = retr.retrieval(
            question,
            embd_mdl,
            kb.tenant_id,
            kb_ids,
            page,
            size,
            similarity_threshold,
            vector_similarity_weight,
            top,
            doc_ids,
            rerank_mdl=rerank_mdl,
            highlight=highlight,
        )
        for c in ranks["chunks"]:
            c.pop("vector", None)
        # rename keys
        renamed_chunks = []
        for chunk in ranks["chunks"]:
            key_mapping = {
                "chunk_id": "id",
                "content_with_weight": "content",
                "doc_id": "document_id",
                "important_kwd": "important_keywords",
                "docnm_kwd": "document_keyword",
            }
            rename_chunk = {}
            for key, value in chunk.items():
                new_key = key_mapping.get(key, key)
                rename_chunk[new_key] = value
            renamed_chunks.append(rename_chunk)
        ranks["chunks"] = renamed_chunks
        return get_result(data=ranks)
    except Exception as e:
        if str(e).find("not_found") > 0:
            return get_result(
                message="No chunk found! Please check the chunk status.",
                code=settings.RetCode.DATA_ERROR,
            )
        return server_error_response(e)