Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

dataset.py 9.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. from flask import request
  17. from api.db import StatusEnum, FileSource
  18. from api.db.db_models import File
  19. from api.db.services.document_service import DocumentService
  20. from api.db.services.file2document_service import File2DocumentService
  21. from api.db.services.file_service import FileService
  22. from api.db.services.knowledgebase_service import KnowledgebaseService
  23. from api.db.services.user_service import TenantService
  24. from api.settings import RetCode
  25. from api.utils import get_uuid
  26. from api.utils.api_utils import get_json_result, token_required, get_data_error_result
  27. @manager.route('/save', methods=['POST'])
  28. @token_required
  29. def save(tenant_id):
  30. req = request.json
  31. e, t = TenantService.get_by_id(tenant_id)
  32. if "id" not in req:
  33. if "tenant_id" in req or "embedding_model" in req:
  34. return get_data_error_result(
  35. retmsg="Tenant_id or embedding_model must not be provided")
  36. if "name" not in req:
  37. return get_data_error_result(
  38. retmsg="Name is not empty!")
  39. req['id'] = get_uuid()
  40. req["name"] = req["name"].strip()
  41. if req["name"] == "":
  42. return get_data_error_result(
  43. retmsg="Name is not empty string!")
  44. if KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value):
  45. return get_data_error_result(
  46. retmsg="Duplicated knowledgebase name in creating dataset.")
  47. req["tenant_id"] = req['created_by'] = tenant_id
  48. req['embedding_model'] = t.embd_id
  49. key_mapping = {
  50. "chunk_num": "chunk_count",
  51. "doc_num": "document_count",
  52. "parser_id": "parse_method",
  53. "embd_id": "embedding_model"
  54. }
  55. mapped_keys = {new_key: req[old_key] for new_key, old_key in key_mapping.items() if old_key in req}
  56. req.update(mapped_keys)
  57. if not KnowledgebaseService.save(**req):
  58. return get_data_error_result(retmsg="Create dataset error.(Database error)")
  59. renamed_data = {}
  60. e, k = KnowledgebaseService.get_by_id(req["id"])
  61. for key, value in k.to_dict().items():
  62. new_key = key_mapping.get(key, key)
  63. renamed_data[new_key] = value
  64. return get_json_result(data=renamed_data)
  65. else:
  66. invalid_keys = {"embd_id", "chunk_num", "doc_num", "parser_id"}
  67. if any(key in req for key in invalid_keys):
  68. return get_data_error_result(retmsg="The input parameters are invalid.")
  69. if "tenant_id" in req:
  70. if req["tenant_id"] != tenant_id:
  71. return get_data_error_result(
  72. retmsg="Can't change tenant_id.")
  73. if "embedding_model" in req:
  74. if req["embedding_model"] != t.embd_id:
  75. return get_data_error_result(
  76. retmsg="Can't change embedding_model.")
  77. req.pop("embedding_model")
  78. if not KnowledgebaseService.query(
  79. created_by=tenant_id, id=req["id"]):
  80. return get_json_result(
  81. data=False, retmsg='You do not own the dataset.',
  82. retcode=RetCode.OPERATING_ERROR)
  83. if not req["id"]:
  84. return get_data_error_result(
  85. retmsg="id can not be empty.")
  86. e, kb = KnowledgebaseService.get_by_id(req["id"])
  87. if "chunk_count" in req:
  88. if req["chunk_count"] != kb.chunk_num:
  89. return get_data_error_result(
  90. retmsg="Can't change chunk_count.")
  91. req.pop("chunk_count")
  92. if "document_count" in req:
  93. if req['document_count'] != kb.doc_num:
  94. return get_data_error_result(
  95. retmsg="Can't change document_count.")
  96. req.pop("document_count")
  97. if "parse_method" in req:
  98. if kb.chunk_num != 0 and req['parse_method'] != kb.parser_id:
  99. return get_data_error_result(
  100. retmsg="If chunk count is not 0, parse method is not changable.")
  101. req['parser_id'] = req.pop('parse_method')
  102. if "name" in req:
  103. req["name"] = req["name"].strip()
  104. if req["name"].lower() != kb.name.lower() \
  105. and len(KnowledgebaseService.query(name=req["name"], tenant_id=tenant_id,
  106. status=StatusEnum.VALID.value)) > 0:
  107. return get_data_error_result(
  108. retmsg="Duplicated knowledgebase name in updating dataset.")
  109. del req["id"]
  110. if not KnowledgebaseService.update_by_id(kb.id, req):
  111. return get_data_error_result(retmsg="Update dataset error.(Database error)")
  112. return get_json_result(data=True)
  113. @manager.route('/delete', methods=['DELETE'])
  114. @token_required
  115. def delete(tenant_id):
  116. req = request.args
  117. if "id" not in req:
  118. return get_data_error_result(
  119. retmsg="id is required")
  120. kbs = KnowledgebaseService.query(
  121. created_by=tenant_id, id=req["id"])
  122. if not kbs:
  123. return get_json_result(
  124. data=False, retmsg='You do not own the dataset',
  125. retcode=RetCode.OPERATING_ERROR)
  126. for doc in DocumentService.query(kb_id=req["id"]):
  127. if not DocumentService.remove_document(doc, kbs[0].tenant_id):
  128. return get_data_error_result(
  129. retmsg="Remove document error.(Database error)")
  130. f2d = File2DocumentService.get_by_document_id(doc.id)
  131. FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id])
  132. File2DocumentService.delete_by_document_id(doc.id)
  133. if not KnowledgebaseService.delete_by_id(req["id"]):
  134. return get_data_error_result(
  135. retmsg="Delete dataset error.(Database serror)")
  136. return get_json_result(data=True)
  137. @manager.route('/list', methods=['GET'])
  138. @token_required
  139. def list_datasets(tenant_id):
  140. page_number = int(request.args.get("page", 1))
  141. items_per_page = int(request.args.get("page_size", 1024))
  142. orderby = request.args.get("orderby", "create_time")
  143. desc = bool(request.args.get("desc", True))
  144. tenants = TenantService.get_joined_tenants_by_user_id(tenant_id)
  145. kbs = KnowledgebaseService.get_by_tenant_ids(
  146. [m["tenant_id"] for m in tenants], tenant_id, page_number, items_per_page, orderby, desc)
  147. renamed_list = []
  148. for kb in kbs:
  149. key_mapping = {
  150. "chunk_num": "chunk_count",
  151. "doc_num": "document_count",
  152. "parser_id": "parse_method",
  153. "embd_id": "embedding_model"
  154. }
  155. renamed_data = {}
  156. for key, value in kb.items():
  157. new_key = key_mapping.get(key, key)
  158. renamed_data[new_key] = value
  159. renamed_list.append(renamed_data)
  160. return get_json_result(data=renamed_list)
  161. @manager.route('/detail', methods=['GET'])
  162. @token_required
  163. def detail(tenant_id):
  164. req = request.args
  165. key_mapping = {
  166. "chunk_num": "chunk_count",
  167. "doc_num": "document_count",
  168. "parser_id": "parse_method",
  169. "embd_id": "embedding_model"
  170. }
  171. renamed_data = {}
  172. if "id" in req:
  173. id = req["id"]
  174. kb = KnowledgebaseService.query(created_by=tenant_id, id=req["id"])
  175. if not kb:
  176. return get_json_result(
  177. data=False, retmsg='You do not own the dataset.',
  178. retcode=RetCode.OPERATING_ERROR)
  179. if "name" in req:
  180. name = req["name"]
  181. if kb[0].name != name:
  182. return get_json_result(
  183. data=False, retmsg='You do not own the dataset.',
  184. retcode=RetCode.OPERATING_ERROR)
  185. e, k = KnowledgebaseService.get_by_id(id)
  186. for key, value in k.to_dict().items():
  187. new_key = key_mapping.get(key, key)
  188. renamed_data[new_key] = value
  189. return get_json_result(data=renamed_data)
  190. else:
  191. if "name" in req:
  192. name = req["name"]
  193. e, k = KnowledgebaseService.get_by_name(kb_name=name, tenant_id=tenant_id)
  194. if not e:
  195. return get_json_result(
  196. data=False, retmsg='You do not own the dataset.',
  197. retcode=RetCode.OPERATING_ERROR)
  198. for key, value in k.to_dict().items():
  199. new_key = key_mapping.get(key, key)
  200. renamed_data[new_key] = value
  201. return get_json_result(data=renamed_data)
  202. else:
  203. return get_data_error_result(
  204. retmsg="At least one of `id` or `name` must be provided.")