
test_chunk.py 2.5KB

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from common import HOST_ADDRESS, create_dataset, list_dataset, rm_dataset, update_dataset, upload_file, DATASET_NAME_LIMIT
from common import list_document, get_docs_info, parse_docs
from time import sleep
from timeit import default_timer as timer
import re
import pytest
import random
import string


def test_parse_txt_document(get_auth):
    # create dataset
    res = create_dataset(get_auth, "test_parse_txt_document")
    assert res.get("code") == 0, f"{res.get('message')}"

    # list datasets page by page (the break condition implies up to 150
    # entries per page) and remember the id of the last dataset seen
    page_number = 1
    dataset_list = []
    dataset_id = None
    while True:
        res = list_dataset(get_auth, page_number)
        data = res.get("data").get("kbs")
        for item in data:
            dataset_id = item.get("id")
            dataset_list.append(dataset_id)
        if len(dataset_list) < page_number * 150:
            break
        page_number += 1

    # upload the test file into the dataset found above
    filename = 'ragflow_test.txt'
    res = upload_file(get_auth, dataset_id, f"../test_sdk_api/test_data/{filename}")
    assert res.get("code") == 0, f"{res.get('message')}"

    # collect the ids of all documents in the dataset
    res = list_document(get_auth, dataset_id)
    doc_id_list = []
    for doc in res['data']['docs']:
        doc_id_list.append(doc['id'])
    res = get_docs_info(get_auth, doc_id_list)
    print(doc_id_list)
    doc_count = len(doc_id_list)

    # trigger parsing, then poll until every document reports progress == 1
    res = parse_docs(get_auth, doc_id_list)
    start_ts = timer()
    while True:
        res = get_docs_info(get_auth, doc_id_list)
        finished_count = 0
        for doc_info in res['data']:
            if doc_info['progress'] == 1:
                finished_count += 1
        if finished_count == doc_count:
            break
        sleep(1)
    print('time cost {:.1f}s'.format(timer() - start_ts))

    # delete dataset
    for dataset_id in dataset_list:
        res = rm_dataset(get_auth, dataset_id)
        assert res.get("code") == 0, f"{res.get('message')}"
    print(f"{len(dataset_list)} datasets are deleted")
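Note: the polling loop above never terminates if a document fails to reach progress == 1, which can hang a CI run. Below is a minimal sketch of a bounded variant, reusing only the get_docs_info helper imported from common and the response shape the test already relies on; the wait_for_parsing name and the timeout/interval parameters are hypothetical additions, not part of this test suite.

# A sketch, not part of test_chunk.py. Assumes get_docs_info(auth, doc_ids)
# returns {"data": [{"progress": <float>}, ...]}, as the test above implies.
# wait_for_parsing, timeout, and interval are hypothetical names.
from time import sleep
from timeit import default_timer as timer

from common import get_docs_info


def wait_for_parsing(auth, doc_ids, timeout=600.0, interval=1.0):
    """Poll until every document reports progress == 1, or raise on timeout."""
    start_ts = timer()
    while timer() - start_ts < timeout:
        res = get_docs_info(auth, doc_ids)
        if all(doc_info['progress'] == 1 for doc_info in res['data']):
            return timer() - start_ts  # seconds elapsed until parsing finished
        sleep(interval)
    raise TimeoutError(f"{len(doc_ids)} documents did not finish parsing within {timeout}s")

Substituted for the open-ended while True loop, this makes a stuck parser fail the test with a TimeoutError instead of blocking indefinitely.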