Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

document_app.py 19KB


  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License
  15. #
  16. import datetime
  17. import hashlib
  18. import json
  19. import os
  20. import pathlib
  21. import re
  22. import traceback
  23. from concurrent.futures import ThreadPoolExecutor
  24. from copy import deepcopy
  25. from io import BytesIO
  26. import flask
  27. from elasticsearch_dsl import Q
  28. from flask import request
  29. from flask_login import login_required, current_user
  30. from api.db.db_models import Task, File
  31. from api.db.services.dialog_service import DialogService, ConversationService
  32. from api.db.services.file2document_service import File2DocumentService
  33. from api.db.services.file_service import FileService
  34. from api.db.services.llm_service import LLMBundle
  35. from api.db.services.task_service import TaskService, queue_tasks
  36. from api.db.services.user_service import TenantService, UserTenantService
  37. from graphrag.mind_map_extractor import MindMapExtractor
  38. from rag.app import naive
  39. from rag.nlp import search
  40. from rag.utils.es_conn import ELASTICSEARCH
  41. from api.db.services import duplicate_name
  42. from api.db.services.knowledgebase_service import KnowledgebaseService
  43. from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
  44. from api.utils import get_uuid
  45. from api.db import FileType, TaskStatus, ParserType, FileSource, LLMType
  46. from api.db.services.document_service import DocumentService, doc_upload_and_parse
  47. from api.settings import RetCode, stat_logger
  48. from api.utils.api_utils import get_json_result
  49. from rag.utils.storage_factory import STORAGE_IMPL
  50. from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
  51. from api.utils.web_utils import html2pdf, is_valid_url
  52. from api.contants import IMG_BASE64_PREFIX
  53. @manager.route('/upload', methods=['POST'])
  54. @login_required
  55. @validate_request("kb_id")
  56. def upload():
  57. kb_id = request.form.get("kb_id")
  58. if not kb_id:
  59. return get_json_result(
  60. data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
  61. if 'file' not in request.files:
  62. return get_json_result(
  63. data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
  64. file_objs = request.files.getlist('file')
  65. for file_obj in file_objs:
  66. if file_obj.filename == '':
  67. return get_json_result(
  68. data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
  69. e, kb = KnowledgebaseService.get_by_id(kb_id)
  70. if not e:
  71. raise LookupError("Can't find this knowledgebase!")
  72. err, _ = FileService.upload_document(kb, file_objs, current_user.id)
  73. if err:
  74. return get_json_result(
  75. data=False, retmsg="\n".join(err), retcode=RetCode.SERVER_ERROR)
  76. return get_json_result(data=True)
  77. @manager.route('/web_crawl', methods=['POST'])
  78. @login_required
  79. @validate_request("kb_id", "name", "url")
  80. def web_crawl():
  81. kb_id = request.form.get("kb_id")
  82. if not kb_id:
  83. return get_json_result(
  84. data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
  85. name = request.form.get("name")
  86. url = request.form.get("url")
  87. if not is_valid_url(url):
  88. return get_json_result(
  89. data=False, retmsg='The URL format is invalid', retcode=RetCode.ARGUMENT_ERROR)
  90. e, kb = KnowledgebaseService.get_by_id(kb_id)
  91. if not e:
  92. raise LookupError("Can't find this knowledgebase!")
  93. blob = html2pdf(url)
  94. if not blob: return server_error_response(ValueError("Download failure."))
  95. root_folder = FileService.get_root_folder(current_user.id)
  96. pf_id = root_folder["id"]
  97. FileService.init_knowledgebase_docs(pf_id, current_user.id)
  98. kb_root_folder = FileService.get_kb_folder(current_user.id)
  99. kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
  100. try:
  101. filename = duplicate_name(
  102. DocumentService.query,
  103. name=name + ".pdf",
  104. kb_id=kb.id)
  105. filetype = filename_type(filename)
  106. if filetype == FileType.OTHER.value:
  107. raise RuntimeError("This type of file has not been supported yet!")
  108. location = filename
  109. while STORAGE_IMPL.obj_exist(kb_id, location):
  110. location += "_"
  111. STORAGE_IMPL.put(kb_id, location, blob)
  112. doc = {
  113. "id": get_uuid(),
  114. "kb_id": kb.id,
  115. "parser_id": kb.parser_id,
  116. "parser_config": kb.parser_config,
  117. "created_by": current_user.id,
  118. "type": filetype,
  119. "name": filename,
  120. "location": location,
  121. "size": len(blob),
  122. "thumbnail": thumbnail(filename, blob)
  123. }
  124. if doc["type"] == FileType.VISUAL:
  125. doc["parser_id"] = ParserType.PICTURE.value
  126. if doc["type"] == FileType.AURAL:
  127. doc["parser_id"] = ParserType.AUDIO.value
  128. if re.search(r"\.(ppt|pptx|pages)$", filename):
  129. doc["parser_id"] = ParserType.PRESENTATION.value
  130. if re.search(r"\.(eml)$", filename):
  131. doc["parser_id"] = ParserType.EMAIL.value
  132. DocumentService.insert(doc)
  133. FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
  134. except Exception as e:
  135. return server_error_response(e)
  136. return get_json_result(data=True)
  137. @manager.route('/create', methods=['POST'])
  138. @login_required
  139. @validate_request("name", "kb_id")
  140. def create():
  141. req = request.json
  142. kb_id = req["kb_id"]
  143. if not kb_id:
  144. return get_json_result(
  145. data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
  146. try:
  147. e, kb = KnowledgebaseService.get_by_id(kb_id)
  148. if not e:
  149. return get_data_error_result(
  150. retmsg="Can't find this knowledgebase!")
  151. if DocumentService.query(name=req["name"], kb_id=kb_id):
  152. return get_data_error_result(
  153. retmsg="Duplicated document name in the same knowledgebase.")
  154. doc = DocumentService.insert({
  155. "id": get_uuid(),
  156. "kb_id": kb.id,
  157. "parser_id": kb.parser_id,
  158. "parser_config": kb.parser_config,
  159. "created_by": current_user.id,
  160. "type": FileType.VIRTUAL,
  161. "name": req["name"],
  162. "location": "",
  163. "size": 0
  164. })
  165. return get_json_result(data=doc.to_json())
  166. except Exception as e:
  167. return server_error_response(e)
  168. @manager.route('/list', methods=['GET'])
  169. @login_required
  170. def list_docs():
  171. kb_id = request.args.get("kb_id")
  172. if not kb_id:
  173. return get_json_result(
  174. data=False, retmsg='Lack of "KB ID"', retcode=RetCode.ARGUMENT_ERROR)
  175. tenants = UserTenantService.query(user_id=current_user.id)
  176. for tenant in tenants:
  177. if KnowledgebaseService.query(
  178. tenant_id=tenant.tenant_id, id=kb_id):
  179. break
  180. else:
  181. return get_json_result(
  182. data=False, retmsg=f'Only owner of knowledgebase authorized for this operation.',
  183. retcode=RetCode.OPERATING_ERROR)
  184. keywords = request.args.get("keywords", "")
  185. page_number = int(request.args.get("page", 1))
  186. items_per_page = int(request.args.get("page_size", 15))
  187. orderby = request.args.get("orderby", "create_time")
  188. desc = request.args.get("desc", True)
  189. try:
  190. docs, tol = DocumentService.get_by_kb_id(
  191. kb_id, page_number, items_per_page, orderby, desc, keywords)
  192. for doc_item in docs:
  193. if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
  194. doc_item['thumbnail'] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
  195. return get_json_result(data={"total": tol, "docs": docs})
  196. except Exception as e:
  197. return server_error_response(e)
  198. @manager.route('/infos', methods=['POST'])
  199. def docinfos():
  200. req = request.json
  201. doc_ids = req["doc_ids"]
  202. docs = DocumentService.get_by_ids(doc_ids)
  203. return get_json_result(data=list(docs.dicts()))
  204. @manager.route('/thumbnails', methods=['GET'])
  205. #@login_required
  206. def thumbnails():
  207. doc_ids = request.args.get("doc_ids").split(",")
  208. if not doc_ids:
  209. return get_json_result(
  210. data=False, retmsg='Lack of "Document ID"', retcode=RetCode.ARGUMENT_ERROR)
  211. try:
  212. docs = DocumentService.get_thumbnails(doc_ids)
  213. for doc_item in docs:
  214. if doc_item['thumbnail'] and not doc_item['thumbnail'].startswith(IMG_BASE64_PREFIX):
  215. doc_item['thumbnail'] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
  216. return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
  217. except Exception as e:
  218. return server_error_response(e)
  219. @manager.route('/change_status', methods=['POST'])
  220. @login_required
  221. @validate_request("doc_id", "status")
  222. def change_status():
  223. req = request.json
  224. if str(req["status"]) not in ["0", "1"]:
  225. get_json_result(
  226. data=False,
  227. retmsg='"Status" must be either 0 or 1!',
  228. retcode=RetCode.ARGUMENT_ERROR)
  229. try:
  230. e, doc = DocumentService.get_by_id(req["doc_id"])
  231. if not e:
  232. return get_data_error_result(retmsg="Document not found!")
  233. e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
  234. if not e:
  235. return get_data_error_result(
  236. retmsg="Can't find this knowledgebase!")
  237. if not DocumentService.update_by_id(
  238. req["doc_id"], {"status": str(req["status"])}):
  239. return get_data_error_result(
  240. retmsg="Database error (Document update)!")
  241. if str(req["status"]) == "0":
  242. ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
  243. scripts="ctx._source.available_int=0;",
  244. idxnm=search.index_name(
  245. kb.tenant_id)
  246. )
  247. else:
  248. ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=req["doc_id"]),
  249. scripts="ctx._source.available_int=1;",
  250. idxnm=search.index_name(
  251. kb.tenant_id)
  252. )
  253. return get_json_result(data=True)
  254. except Exception as e:
  255. return server_error_response(e)
  256. @manager.route('/rm', methods=['POST'])
  257. @login_required
  258. @validate_request("doc_id")
  259. def rm():
  260. req = request.json
  261. doc_ids = req["doc_id"]
  262. if isinstance(doc_ids, str): doc_ids = [doc_ids]
  263. root_folder = FileService.get_root_folder(current_user.id)
  264. pf_id = root_folder["id"]
  265. FileService.init_knowledgebase_docs(pf_id, current_user.id)
  266. errors = ""
  267. for doc_id in doc_ids:
  268. try:
  269. e, doc = DocumentService.get_by_id(doc_id)
  270. if not e:
  271. return get_data_error_result(retmsg="Document not found!")
  272. tenant_id = DocumentService.get_tenant_id(doc_id)
  273. if not tenant_id:
  274. return get_data_error_result(retmsg="Tenant not found!")
  275. b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
  276. if not DocumentService.remove_document(doc, tenant_id):
  277. return get_data_error_result(
  278. retmsg="Database error (Document removal)!")
  279. f2d = File2DocumentService.get_by_document_id(doc_id)
  280. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
  281. File2DocumentService.delete_by_document_id(doc_id)
  282. STORAGE_IMPL.rm(b, n)
  283. except Exception as e:
  284. errors += str(e)
  285. if errors:
  286. return get_json_result(data=False, retmsg=errors, retcode=RetCode.SERVER_ERROR)
  287. return get_json_result(data=True)
  288. @manager.route('/run', methods=['POST'])
  289. @login_required
  290. @validate_request("doc_ids", "run")
  291. def run():
  292. req = request.json
  293. try:
  294. for id in req["doc_ids"]:
  295. info = {"run": str(req["run"]), "progress": 0}
  296. if str(req["run"]) == TaskStatus.RUNNING.value:
  297. info["progress_msg"] = ""
  298. info["chunk_num"] = 0
  299. info["token_num"] = 0
  300. DocumentService.update_by_id(id, info)
  301. # if str(req["run"]) == TaskStatus.CANCEL.value:
  302. tenant_id = DocumentService.get_tenant_id(id)
  303. if not tenant_id:
  304. return get_data_error_result(retmsg="Tenant not found!")
  305. ELASTICSEARCH.deleteByQuery(
  306. Q("match", doc_id=id), idxnm=search.index_name(tenant_id))
  307. if str(req["run"]) == TaskStatus.RUNNING.value:
  308. TaskService.filter_delete([Task.doc_id == id])
  309. e, doc = DocumentService.get_by_id(id)
  310. doc = doc.to_dict()
  311. doc["tenant_id"] = tenant_id
  312. bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
  313. queue_tasks(doc, bucket, name)
  314. return get_json_result(data=True)
  315. except Exception as e:
  316. return server_error_response(e)
  317. @manager.route('/rename', methods=['POST'])
  318. @login_required
  319. @validate_request("doc_id", "name")
  320. def rename():
  321. req = request.json
  322. try:
  323. e, doc = DocumentService.get_by_id(req["doc_id"])
  324. if not e:
  325. return get_data_error_result(retmsg="Document not found!")
  326. if pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
  327. doc.name.lower()).suffix:
  328. return get_json_result(
  329. data=False,
  330. retmsg="The extension of file can't be changed",
  331. retcode=RetCode.ARGUMENT_ERROR)
  332. for d in DocumentService.query(name=req["name"], kb_id=doc.kb_id):
  333. if d.name == req["name"]:
  334. return get_data_error_result(
  335. retmsg="Duplicated document name in the same knowledgebase.")
  336. if not DocumentService.update_by_id(
  337. req["doc_id"], {"name": req["name"]}):
  338. return get_data_error_result(
  339. retmsg="Database error (Document rename)!")
  340. informs = File2DocumentService.get_by_document_id(req["doc_id"])
  341. if informs:
  342. e, file = FileService.get_by_id(informs[0].file_id)
  343. FileService.update_by_id(file.id, {"name": req["name"]})
  344. return get_json_result(data=True)
  345. except Exception as e:
  346. return server_error_response(e)
  347. @manager.route('/get/<doc_id>', methods=['GET'])
  348. # @login_required
  349. def get(doc_id):
  350. try:
  351. e, doc = DocumentService.get_by_id(doc_id)
  352. if not e:
  353. return get_data_error_result(retmsg="Document not found!")
  354. b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
  355. response = flask.make_response(STORAGE_IMPL.get(b, n))
  356. ext = re.search(r"\.([^.]+)$", doc.name)
  357. if ext:
  358. if doc.type == FileType.VISUAL.value:
  359. response.headers.set('Content-Type', 'image/%s' % ext.group(1))
  360. else:
  361. response.headers.set(
  362. 'Content-Type',
  363. 'application/%s' %
  364. ext.group(1))
  365. return response
  366. except Exception as e:
  367. return server_error_response(e)
  368. @manager.route('/change_parser', methods=['POST'])
  369. @login_required
  370. @validate_request("doc_id", "parser_id")
  371. def change_parser():
  372. req = request.json
  373. try:
  374. e, doc = DocumentService.get_by_id(req["doc_id"])
  375. if not e:
  376. return get_data_error_result(retmsg="Document not found!")
  377. if doc.parser_id.lower() == req["parser_id"].lower():
  378. if "parser_config" in req:
  379. if req["parser_config"] == doc.parser_config:
  380. return get_json_result(data=True)
  381. else:
  382. return get_json_result(data=True)
  383. if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
  384. or (re.search(
  385. r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
  386. return get_data_error_result(retmsg="Not supported yet!")
  387. e = DocumentService.update_by_id(doc.id,
  388. {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "",
  389. "run": TaskStatus.UNSTART.value})
  390. if not e:
  391. return get_data_error_result(retmsg="Document not found!")
  392. if "parser_config" in req:
  393. DocumentService.update_parser_config(doc.id, req["parser_config"])
  394. if doc.token_num > 0:
  395. e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1,
  396. doc.process_duation * -1)
  397. if not e:
  398. return get_data_error_result(retmsg="Document not found!")
  399. tenant_id = DocumentService.get_tenant_id(req["doc_id"])
  400. if not tenant_id:
  401. return get_data_error_result(retmsg="Tenant not found!")
  402. ELASTICSEARCH.deleteByQuery(
  403. Q("match", doc_id=doc.id), idxnm=search.index_name(tenant_id))
  404. return get_json_result(data=True)
  405. except Exception as e:
  406. return server_error_response(e)
  407. @manager.route('/image/<image_id>', methods=['GET'])
  408. # @login_required
  409. def get_image(image_id):
  410. try:
  411. bkt, nm = image_id.split("-")
  412. response = flask.make_response(STORAGE_IMPL.get(bkt, nm))
  413. response.headers.set('Content-Type', 'image/JPEG')
  414. return response
  415. except Exception as e:
  416. return server_error_response(e)
  417. @manager.route('/upload_and_parse', methods=['POST'])
  418. @login_required
  419. @validate_request("conversation_id")
  420. def upload_and_parse():
  421. if 'file' not in request.files:
  422. return get_json_result(
  423. data=False, retmsg='No file part!', retcode=RetCode.ARGUMENT_ERROR)
  424. file_objs = request.files.getlist('file')
  425. for file_obj in file_objs:
  426. if file_obj.filename == '':
  427. return get_json_result(
  428. data=False, retmsg='No file selected!', retcode=RetCode.ARGUMENT_ERROR)
  429. doc_ids = doc_upload_and_parse(request.form.get("conversation_id"), file_objs, current_user.id)
  430. return get_json_result(data=doc_ids)