
test_chunk.py

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from common import create_dataset, list_dataset, rm_dataset, upload_file
from common import list_document, get_docs_info, parse_docs
from time import sleep
from timeit import default_timer as timer


def test_parse_txt_document(get_auth):
    # create dataset
    res = create_dataset(get_auth, "test_parse_txt_document")
    assert res.get("code") == 0, f"{res.get('message')}"

    # list datasets: fetch pages until one comes back short of the
    # 150-entry page size, collecting every dataset id along the way
    page_number = 1
    dataset_list = []
    dataset_id = None
    while True:
        res = list_dataset(get_auth, page_number)
        data = res.get("data").get("kbs")
        for item in data:
            dataset_id = item.get("id")
            dataset_list.append(dataset_id)
        if len(dataset_list) < page_number * 150:
            break
        page_number += 1

    # upload the test file into the last dataset listed
    filename = 'ragflow_test.txt'
    res = upload_file(get_auth, dataset_id, f"../test_sdk_api/test_data/{filename}")
    assert res.get("code") == 0, f"{res.get('message')}"

    # collect the ids of all documents in that dataset
    res = list_document(get_auth, dataset_id)
    doc_id_list = []
    for doc in res['data']['docs']:
        doc_id_list.append(doc['id'])
    res = get_docs_info(get_auth, doc_id_list)
    print(doc_id_list)
    doc_count = len(doc_id_list)

    # trigger parsing, then poll once per second until every document
    # reports progress == 1 (finished)
    res = parse_docs(get_auth, doc_id_list)
    start_ts = timer()
    while True:
        res = get_docs_info(get_auth, doc_id_list)
        finished_count = 0
        for doc_info in res['data']:
            if doc_info['progress'] == 1:
                finished_count += 1
        if finished_count == doc_count:
            break
        sleep(1)
    print('time cost {:.1f}s'.format(timer() - start_ts))

    # delete dataset
    for dataset_id in dataset_list:
        res = rm_dataset(get_auth, dataset_id)
        assert res.get("code") == 0, f"{res.get('message')}"
    print(f"{len(dataset_list)} datasets are deleted")
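
The test receives get_auth as a pytest fixture, so it is meant to run under pytest against a live RAGFlow instance. Below is a minimal sketch of a conftest.py that could supply the fixture; the RAGFLOW_TEST_AUTH environment variable is an assumption for illustration, not part of this repo (the real suite may instead obtain a token by registering and logging in a test user):

# conftest.py -- hypothetical sketch, not the fixture shipped with this suite
import os

import pytest


@pytest.fixture(scope="session")
def get_auth():
    # RAGFLOW_TEST_AUTH is an assumed variable name; substitute however
    # your deployment issues authorization tokens.
    return os.environ["RAGFLOW_TEST_AUTH"]

With such a fixture in place, the test runs as usual: pytest test_chunk.py::test_parse_txt_document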