You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import os
  16. import re
  17. import warnings
  18. from flask import request
  19. from flask_login import login_required, current_user
  20. from httpx import HTTPError
  21. from api.contants import NAME_LENGTH_LIMIT
  22. from api.db import FileType, ParserType, FileSource
  23. from api.db import StatusEnum
  24. from api.db.db_models import File
  25. from api.db.services import duplicate_name
  26. from api.db.services.document_service import DocumentService
  27. from api.db.services.file2document_service import File2DocumentService
  28. from api.db.services.file_service import FileService
  29. from api.db.services.knowledgebase_service import KnowledgebaseService
  30. from api.db.services.user_service import TenantService
  31. from api.settings import RetCode
  32. from api.utils import get_uuid
  33. from api.utils.api_utils import construct_json_result, construct_error_response
  34. from api.utils.api_utils import construct_result, validate_request
  35. from api.utils.file_utils import filename_type, thumbnail
  36. from rag.utils.minio_conn import MINIO
  37. MAXIMUM_OF_UPLOADING_FILES = 256
  38. # ------------------------------ create a dataset ---------------------------------------
  39. @manager.route('/', methods=['POST'])
  40. @login_required # use login
  41. @validate_request("name") # check name key
  42. def create_dataset():
  43. # Check if Authorization header is present
  44. authorization_token = request.headers.get('Authorization')
  45. if not authorization_token:
  46. return construct_json_result(code=RetCode.AUTHENTICATION_ERROR, message="Authorization header is missing.")
  47. # TODO: Login or API key
  48. # objs = APIToken.query(token=authorization_token)
  49. #
  50. # # Authorization error
  51. # if not objs:
  52. # return construct_json_result(code=RetCode.AUTHENTICATION_ERROR, message="Token is invalid.")
  53. #
  54. # tenant_id = objs[0].tenant_id
  55. tenant_id = current_user.id
  56. request_body = request.json
  57. # In case that there's no name
  58. if "name" not in request_body:
  59. return construct_json_result(code=RetCode.DATA_ERROR, message="Expected 'name' field in request body")
  60. dataset_name = request_body["name"]
  61. # empty dataset_name
  62. if not dataset_name:
  63. return construct_json_result(code=RetCode.DATA_ERROR, message="Empty dataset name")
  64. # In case that there's space in the head or the tail
  65. dataset_name = dataset_name.strip()
  66. # In case that the length of the name exceeds the limit
  67. dataset_name_length = len(dataset_name)
  68. if dataset_name_length > NAME_LENGTH_LIMIT:
  69. return construct_json_result(code=RetCode.DATA_ERROR,
  70. message=f"Dataset name: {dataset_name} with length {dataset_name_length} exceeds {NAME_LENGTH_LIMIT}!")
  71. # In case that there are other fields in the data-binary
  72. if len(request_body.keys()) > 1:
  73. name_list = []
  74. for key_name in request_body.keys():
  75. if key_name != 'name':
  76. name_list.append(key_name)
  77. return construct_json_result(code=RetCode.DATA_ERROR,
  78. message=f"fields: {name_list}, are not allowed in request body.")
  79. # If there is a duplicate name, it will modify it to make it unique
  80. request_body["name"] = duplicate_name(
  81. KnowledgebaseService.query,
  82. name=dataset_name,
  83. tenant_id=tenant_id,
  84. status=StatusEnum.VALID.value)
  85. try:
  86. request_body["id"] = get_uuid()
  87. request_body["tenant_id"] = tenant_id
  88. request_body["created_by"] = tenant_id
  89. exist, t = TenantService.get_by_id(tenant_id)
  90. if not exist:
  91. return construct_result(code=RetCode.AUTHENTICATION_ERROR, message="Tenant not found.")
  92. request_body["embd_id"] = t.embd_id
  93. if not KnowledgebaseService.save(**request_body):
  94. # failed to create new dataset
  95. return construct_result()
  96. return construct_json_result(code=RetCode.SUCCESS,
  97. data={"dataset_name": request_body["name"], "dataset_id": request_body["id"]})
  98. except Exception as e:
  99. return construct_error_response(e)
  100. # -----------------------------list datasets-------------------------------------------------------
  101. @manager.route('/', methods=['GET'])
  102. @login_required
  103. def list_datasets():
  104. offset = request.args.get("offset", 0)
  105. count = request.args.get("count", -1)
  106. orderby = request.args.get("orderby", "create_time")
  107. desc = request.args.get("desc", True)
  108. try:
  109. tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
  110. datasets = KnowledgebaseService.get_by_tenant_ids_by_offset(
  111. [m["tenant_id"] for m in tenants], current_user.id, int(offset), int(count), orderby, desc)
  112. return construct_json_result(data=datasets, code=RetCode.SUCCESS, message=f"List datasets successfully!")
  113. except Exception as e:
  114. return construct_error_response(e)
  115. except HTTPError as http_err:
  116. return construct_json_result(http_err)
  117. # ---------------------------------delete a dataset ----------------------------
  118. @manager.route('/<dataset_id>', methods=['DELETE'])
  119. @login_required
  120. def remove_dataset(dataset_id):
  121. try:
  122. datasets = KnowledgebaseService.query(created_by=current_user.id, id=dataset_id)
  123. # according to the id, searching for the dataset
  124. if not datasets:
  125. return construct_json_result(message=f'The dataset cannot be found for your current account.',
  126. code=RetCode.OPERATING_ERROR)
  127. # Iterating the documents inside the dataset
  128. for doc in DocumentService.query(kb_id=dataset_id):
  129. if not DocumentService.remove_document(doc, datasets[0].tenant_id):
  130. # the process of deleting failed
  131. return construct_json_result(code=RetCode.DATA_ERROR,
  132. message="There was an error during the document removal process. "
  133. "Please check the status of the RAGFlow server and try the removal again.")
  134. # delete the other files
  135. f2d = File2DocumentService.get_by_document_id(doc.id)
  136. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
  137. File2DocumentService.delete_by_document_id(doc.id)
  138. # delete the dataset
  139. if not KnowledgebaseService.delete_by_id(dataset_id):
  140. return construct_json_result(code=RetCode.DATA_ERROR, message="There was an error during the dataset removal process. "
  141. "Please check the status of the RAGFlow server and try the removal again.")
  142. # success
  143. return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully")
  144. except Exception as e:
  145. return construct_error_response(e)
  146. # ------------------------------ get details of a dataset ----------------------------------------
  147. @manager.route('/<dataset_id>', methods=['GET'])
  148. @login_required
  149. def get_dataset(dataset_id):
  150. try:
  151. dataset = KnowledgebaseService.get_detail(dataset_id)
  152. if not dataset:
  153. return construct_json_result(code=RetCode.DATA_ERROR, message="Can't find this dataset!")
  154. return construct_json_result(data=dataset, code=RetCode.SUCCESS)
  155. except Exception as e:
  156. return construct_json_result(e)
  157. # ------------------------------ update a dataset --------------------------------------------
  158. @manager.route('/<dataset_id>', methods=['PUT'])
  159. @login_required
  160. def update_dataset(dataset_id):
  161. req = request.json
  162. try:
  163. # the request cannot be empty
  164. if not req:
  165. return construct_json_result(code=RetCode.DATA_ERROR, message="Please input at least one parameter that "
  166. "you want to update!")
  167. # check whether the dataset can be found
  168. if not KnowledgebaseService.query(created_by=current_user.id, id=dataset_id):
  169. return construct_json_result(message=f'Only the owner of knowledgebase is authorized for this operation!',
  170. code=RetCode.OPERATING_ERROR)
  171. exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
  172. # check whether there is this dataset
  173. if not exist:
  174. return construct_json_result(code=RetCode.DATA_ERROR, message="This dataset cannot be found!")
  175. if 'name' in req:
  176. name = req["name"].strip()
  177. # check whether there is duplicate name
  178. if name.lower() != dataset.name.lower() \
  179. and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id,
  180. status=StatusEnum.VALID.value)) > 1:
  181. return construct_json_result(code=RetCode.DATA_ERROR, message=f"The name: {name.lower()} is already used by other "
  182. f"datasets. Please choose a different name.")
  183. dataset_updating_data = {}
  184. chunk_num = req.get("chunk_num")
  185. # modify the value of 11 parameters
  186. # 2 parameters: embedding id and chunk method
  187. # only if chunk_num is 0, the user can update the embedding id
  188. if req.get('embedding_model_id'):
  189. if chunk_num == 0:
  190. dataset_updating_data['embd_id'] = req['embedding_model_id']
  191. else:
  192. construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document in this "
  193. "dataset, so you cannot change the embedding "
  194. "model.")
  195. # only if chunk_num is 0, the user can update the chunk_method
  196. if req.get("chunk_method"):
  197. if chunk_num == 0:
  198. dataset_updating_data['parser_id'] = req["chunk_method"]
  199. else:
  200. construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document "
  201. "in this dataset, so you cannot "
  202. "change the chunk method.")
  203. # convert the photo parameter to avatar
  204. if req.get("photo"):
  205. dataset_updating_data['avatar'] = req["photo"]
  206. # layout_recognize
  207. if 'layout_recognize' in req:
  208. if 'parser_config' not in dataset_updating_data:
  209. dataset_updating_data['parser_config'] = {}
  210. dataset_updating_data['parser_config']['layout_recognize'] = req['layout_recognize']
  211. # TODO: updating use_raptor needs to construct a class
  212. # 6 parameters
  213. for key in ['name', 'language', 'description', 'permission', 'id', 'token_num']:
  214. if key in req:
  215. dataset_updating_data[key] = req.get(key)
  216. # update
  217. if not KnowledgebaseService.update_by_id(dataset.id, dataset_updating_data):
  218. return construct_json_result(code=RetCode.OPERATING_ERROR, message="Failed to update! "
  219. "Please check the status of RAGFlow "
  220. "server and try again!")
  221. exist, dataset = KnowledgebaseService.get_by_id(dataset.id)
  222. if not exist:
  223. return construct_json_result(code=RetCode.DATA_ERROR, message="Failed to get the dataset "
  224. "using the dataset ID.")
  225. return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
  226. except Exception as e:
  227. return construct_error_response(e)
  228. # --------------------------------content management ----------------------------------------------
  229. # ----------------------------upload files-----------------------------------------------------
  230. @manager.route('/<dataset_id>/documents/', methods=['POST'])
  231. @login_required
  232. def upload_documents(dataset_id):
  233. # no files
  234. if not request.files:
  235. return construct_json_result(
  236. message='There is no file!', code=RetCode.ARGUMENT_ERROR)
  237. # the number of uploading files exceeds the limit
  238. file_objs = request.files.getlist('file')
  239. num_file_objs = len(file_objs)
  240. if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
  241. return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
  242. f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
  243. for file_obj in file_objs:
  244. # the content of the file
  245. file_content = file_obj.read()
  246. file_name = file_obj.filename
  247. # no name
  248. if not file_name:
  249. return construct_json_result(
  250. message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
  251. # TODO: support the remote files
  252. if 'http' in file_name:
  253. return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
  254. # the content is empty, raising a warning
  255. if file_content == b'':
  256. warnings.warn(f"[WARNING]: The file {file_name} is empty.")
  257. # no dataset
  258. exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
  259. if not exist:
  260. return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
  261. # get the root_folder
  262. root_folder = FileService.get_root_folder(current_user.id)
  263. # get the id of the root_folder
  264. parent_file_id = root_folder["id"] # document id
  265. # this is for the new user, create '.knowledgebase' file
  266. FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
  267. # go inside this folder, get the kb_root_folder
  268. kb_root_folder = FileService.get_kb_folder(current_user.id)
  269. # link the file management to the kb_folder
  270. kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
  271. # grab all the errs
  272. err = []
  273. MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
  274. uploaded_docs_json = []
  275. for file in file_objs:
  276. try:
  277. # TODO: get this value from the database as some tenants have this limit while others don't
  278. if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
  279. return construct_json_result(code=RetCode.DATA_ERROR,
  280. message="Exceed the maximum file number of a free user!")
  281. # deal with the duplicate name
  282. filename = duplicate_name(
  283. DocumentService.query,
  284. name=file.filename,
  285. kb_id=dataset.id)
  286. # deal with the unsupported type
  287. filetype = filename_type(filename)
  288. if filetype == FileType.OTHER.value:
  289. return construct_json_result(code=RetCode.DATA_ERROR,
  290. message="This type of file has not been supported yet!")
  291. # upload to the minio
  292. location = filename
  293. while MINIO.obj_exist(dataset_id, location):
  294. location += "_"
  295. blob = file.read()
  296. MINIO.put(dataset_id, location, blob)
  297. doc = {
  298. "id": get_uuid(),
  299. "kb_id": dataset.id,
  300. "parser_id": dataset.parser_id,
  301. "parser_config": dataset.parser_config,
  302. "created_by": current_user.id,
  303. "type": filetype,
  304. "name": filename,
  305. "location": location,
  306. "size": len(blob),
  307. "thumbnail": thumbnail(filename, blob)
  308. }
  309. if doc["type"] == FileType.VISUAL:
  310. doc["parser_id"] = ParserType.PICTURE.value
  311. if re.search(r"\.(ppt|pptx|pages)$", filename):
  312. doc["parser_id"] = ParserType.PRESENTATION.value
  313. DocumentService.insert(doc)
  314. FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
  315. uploaded_docs_json.append(doc)
  316. except Exception as e:
  317. err.append(file.filename + ": " + str(e))
  318. if err:
  319. # return all the errors
  320. return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
  321. # success
  322. return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
  323. # ----------------------------delete a file-----------------------------------------------------
  324. @manager.route('/<dataset_id>/documents/<document_id>', methods=['DELETE'])
  325. @login_required
  326. def delete_document(document_id, dataset_id): # string
  327. # get the root folder
  328. root_folder = FileService.get_root_folder(current_user.id)
  329. # parent file's id
  330. parent_file_id = root_folder["id"]
  331. # consider the new user
  332. FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
  333. # store all the errors that may have
  334. errors = ""
  335. try:
  336. # whether there is this document
  337. exist, doc = DocumentService.get_by_id(document_id)
  338. if not exist:
  339. return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
  340. # whether this doc is authorized by this tenant
  341. tenant_id = DocumentService.get_tenant_id(document_id)
  342. if not tenant_id:
  343. return construct_json_result(
  344. message=f"You cannot delete this document {document_id} due to the authorization"
  345. f" reason!", code=RetCode.AUTHENTICATION_ERROR)
  346. # get the doc's id and location
  347. real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
  348. if real_dataset_id != dataset_id:
  349. return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
  350. f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
  351. # there is an issue when removing
  352. if not DocumentService.remove_document(doc, tenant_id):
  353. return construct_json_result(
  354. message="There was an error during the document removal process. Please check the status of the "
  355. "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
  356. # fetch the File2Document record associated with the provided document ID.
  357. file_to_doc = File2DocumentService.get_by_document_id(document_id)
  358. # delete the associated File record.
  359. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
  360. # delete the File2Document record itself using the document ID. This removes the
  361. # association between the document and the file after the File record has been deleted.
  362. File2DocumentService.delete_by_document_id(document_id)
  363. # delete it from minio
  364. MINIO.rm(dataset_id, location)
  365. except Exception as e:
  366. errors += str(e)
  367. if errors:
  368. return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
  369. return construct_json_result(data=True, code=RetCode.SUCCESS)
  370. # ----------------------------list files-----------------------------------------------------
  371. @manager.route('/<dataset_id>/documents/', methods=['GET'])
  372. @login_required
  373. def list_documents(dataset_id):
  374. if not dataset_id:
  375. return construct_json_result(
  376. data=False, message='Lack of "dataset_id"', code=RetCode.ARGUMENT_ERROR)
  377. # searching keywords
  378. keywords = request.args.get("keywords", "")
  379. offset = request.args.get("offset", 0)
  380. count = request.args.get("count", -1)
  381. order_by = request.args.get("order_by", "create_time")
  382. descend = request.args.get("descend", True)
  383. try:
  384. docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by,
  385. descend, keywords)
  386. return construct_json_result(data={"total": total, "docs": docs}, message=RetCode.SUCCESS)
  387. except Exception as e:
  388. return construct_error_response(e)
  389. # ----------------------------download a file-----------------------------------------------------
  390. # ----------------------------enable rename-----------------------------------------------------
  391. # ----------------------------start parsing-----------------------------------------------------
  392. # ----------------------------stop parsing-----------------------------------------------------
  393. # ----------------------------show the status of the file-----------------------------------------------------
  394. # ----------------------------list the chunks of the file-----------------------------------------------------
  395. # ----------------------------delete the chunk-----------------------------------------------------
  396. # ----------------------------edit the status of the chunk-----------------------------------------------------
  397. # ----------------------------insert a new chunk-----------------------------------------------------
  398. # ----------------------------upload a file-----------------------------------------------------
  399. # ----------------------------get a specific chunk-----------------------------------------------------
  400. # ----------------------------retrieval test-----------------------------------------------------