您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

dataset.py 9.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. from flask import request
  17. from api.db import StatusEnum, FileSource
  18. from api.db.db_models import File
  19. from api.db.services.document_service import DocumentService
  20. from api.db.services.file2document_service import File2DocumentService
  21. from api.db.services.file_service import FileService
  22. from api.db.services.knowledgebase_service import KnowledgebaseService
  23. from api.db.services.user_service import TenantService
  24. from api.settings import RetCode
  25. from api.utils import get_uuid
  26. from api.utils.api_utils import get_json_result, token_required, get_data_error_result
  27. @manager.route('/save', methods=['POST'])
  28. @token_required
  29. def save(tenant_id):
  30. req = request.json
  31. e, t = TenantService.get_by_id(tenant_id)
  32. if "id" not in req:
  33. if "tenant_id" in req or "embedding_model" in req:
  34. return get_data_error_result(
  35. retmsg="Tenant_id or embedding_model must not be provided")
  36. if "name" not in req:
  37. return get_data_error_result(
  38. retmsg="Name is not empty!")
  39. req['id'] = get_uuid()
  40. req["name"] = req["name"].strip()
  41. if req["name"] == "":
  42. return get_data_error_result(
  43. retmsg="Name is not empty string!")
  44. if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
  45. return get_data_error_result(
  46. retmsg="Duplicated knowledgebase name in creating dataset.")
  47. req["tenant_id"] = req['created_by'] = tenant_id
  48. req['embedding_model'] = t.embd_id
  49. key_mapping = {
  50. "chunk_num": "chunk_count",
  51. "doc_num": "document_count",
  52. "parser_id": "parse_method",
  53. "embd_id": "embedding_model"
  54. }
  55. mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
  56. req.update(mapped_keys)
  57. if not KnowledgebaseService.save(**req):
  58. return get_data_error_result(retmsg="Create dataset error.(Database error)")
  59. renamed_data={}
  60. e, k = KnowledgebaseService.get_by_id(req["id"])
  61. for key, value in k.to_dict().items():
  62. new_key = key_mapping.get(key, key)
  63. renamed_data[new_key] = value
  64. return get_json_result(data=renamed_data)
  65. else:
  66. invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
  67. if any(key in req for key in invalid_keys):
  68. return get_data_error_result(retmsg="The input parameters are invalid.")
  69. if "tenant_id" in req:
  70. if req["tenant_id"] != tenant_id:
  71. return get_data_error_result(
  72. retmsg="Can't change tenant_id.")
  73. if "embedding_model" in req:
  74. if req["embedding_model"] != t.embd_id:
  75. return get_data_error_result(
  76. retmsg="Can't change embedding_model.")
  77. req.pop("embedding_model")
  78. if not KnowledgebaseService.query(
  79. created_by=tenant_id, id=req["id"]):
  80. return get_json_result(
  81. data=False, retmsg='You do not own the dataset.',
  82. retcode=RetCode.OPERATING_ERROR)
  83. e, kb = KnowledgebaseService.get_by_id(req["id"])
  84. if "chunk_count" in req:
  85. if req["chunk_count"] != kb.chunk_num:
  86. return get_data_error_result(
  87. retmsg="Can't change chunk_count.")
  88. req.pop("chunk_count")
  89. if "document_count" in req:
  90. if req['document_count'] != kb.doc_num:
  91. return get_data_error_result(
  92. retmsg="Can't change document_count.")
  93. req.pop("document_count")
  94. if "parse_method" in req:
  95. if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
  96. return get_data_error_result(
  97. retmsg="If chunk count is not 0, parse method is not changable.")
  98. req['parser_id'] = req.pop('parse_method')
  99. if "name" in req:
  100. if req["name"].lower() != kb.name.lower() \
  101. and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
  102. status=StatusEnum.VALID.value)) > 0:
  103. return get_data_error_result(
  104. retmsg="Duplicated knowledgebase name in updating dataset.")
  105. del req["id"]
  106. if not KnowledgebaseService.update_by_id(kb.id, req):
  107. return get_data_error_result(retmsg="Update dataset error.(Database error)")
  108. return get_json_result(data=True)
  109. @manager.route('/delete', methods=['DELETE'])
  110. @token_required
  111. def delete(tenant_id):
  112. req = request.args
  113. if "id" not in req:
  114. return get_data_error_result(
  115. retmsg="id is required")
  116. kbs = KnowledgebaseService.query(
  117. created_by=tenant_id, id=req["id"])
  118. if not kbs:
  119. return get_json_result(
  120. data=False, retmsg='You do not own the dataset',
  121. retcode=RetCode.OPERATING_ERROR)
  122. for doc in DocumentService.query(kb_id=req["id"]):
  123. if not DocumentService.remove_document(doc, kbs[0].tenant_id):
  124. return get_data_error_result(
  125. retmsg="Remove document error.(Database error)")
  126. f2d = File2DocumentService.get_by_document_id(doc.id)
  127. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
  128. File2DocumentService.delete_by_document_id(doc.id)
  129. if not KnowledgebaseService.delete_by_id(req["id"]):
  130. return get_data_error_result(
  131. retmsg="Delete dataset error.(Database serror)")
  132. return get_json_result(data=True)
  133. @manager.route('/list', methods=['GET'])
  134. @token_required
  135. def list_datasets(tenant_id):
  136. page_number = int(request.args.get("page", 1))
  137. items_per_page = int(request.args.get("page_size", 1024))
  138. orderby = request.args.get("orderby", "create_time")
  139. desc = bool(request.args.get("desc", True))
  140. tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
  141. kbs = KnowledgebaseService.get_by_tenant_ids(
  142. [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
  143. renamed_list = []
  144. for kb in kbs:
  145. key_mapping = {
  146. "chunk_num": "chunk_count",
  147. "doc_num": "document_count",
  148. "parser_id": "parse_method",
  149. "embd_id": "embedding_model"
  150. }
  151. renamed_data = {}
  152. for key, value in kb.items():
  153. new_key = key_mapping.get(key, key)
  154. renamed_data[new_key] = value
  155. renamed_list.append(renamed_data)
  156. return get_json_result(data=renamed_list)
  157. @manager.route('/detail', methods=['GET'])
  158. @token_required
  159. def detail(tenant_id):
  160. req = request.args
  161. key_mapping = {
  162. "chunk_num": "chunk_count",
  163. "doc_num": "document_count",
  164. "parser_id": "parse_method",
  165. "embd_id": "embedding_model"
  166. }
  167. renamed_data = {}
  168. if "id" in req:
  169. id = req["id"]
  170. kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
  171. if not kb:
  172. return get_json_result(
  173. data=False, retmsg='You do not own the dataset.',
  174. retcode=RetCode.OPERATING_ERROR)
  175. if "name" in req:
  176. name = req["name"]
  177. if kb[0].name != name:
  178. return get_json_result(
  179. data=False, retmsg='You do not own the dataset.',
  180. retcode=RetCode.OPERATING_ERROR)
  181. e, k = KnowledgebaseService.get_by_id(id)
  182. for key, value in k.to_dict().items():
  183. new_key = key_mapping.get(key, key)
  184. renamed_data[new_key] = value
  185. return get_json_result(data=renamed_data)
  186. else:
  187. if "name" in req:
  188. name = req["name"]
  189. e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
  190. if not e:
  191. return get_json_result(
  192. data=False, retmsg='You do not own the dataset.',
  193. retcode=RetCode.OPERATING_ERROR)
  194. for key, value in k.to_dict().items():
  195. new_key = key_mapping.get(key, key)
  196. renamed_data[new_key] = value
  197. return get_json_result(data=renamed_data)
  198. else:
  199. return get_data_error_result(
  200. retmsg="At least one of `id` or `name` must be provided.")