#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from common import create_dataset, list_dataset, rm_dataset, upload_file
from common import list_document, get_docs_info, parse_docs
from time import sleep
from timeit import default_timer as timer


def test_parse_txt_document(get_auth):
    # create dataset
    res = create_dataset(get_auth, "test_parse_txt_document")
    assert res.get("code") == 0, f"{res.get('message')}"

    # list datasets page by page (the test assumes at most 150 per page);
    # dataset_id ends up holding the id of the last dataset returned
    page_number = 1
    dataset_list = []
    dataset_id = None
    while True:
        res = list_dataset(get_auth, page_number)
        data = res.get("data").get("kbs")
        for item in data:
            dataset_id = item.get("id")
            dataset_list.append(dataset_id)
        if len(dataset_list) < page_number * 150:
            break
        page_number += 1

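    # upload a txt file from the SDK test data directory into that dataset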
    filename = 'ragflow_test.txt'
    res = upload_file(get_auth, dataset_id, f"../test_sdk_api/test_data/{filename}")
    assert res.get("code") == 0, f"{res.get('message')}"

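    # collect the ids of every document in the dataset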
    res = list_document(get_auth, dataset_id)

    doc_id_list = []
    for doc in res['data']['docs']:
        doc_id_list.append(doc['id'])

    res = get_docs_info(get_auth, doc_id_list)
    print(doc_id_list)
    doc_count = len(doc_id_list)
    # trigger parsing for all uploaded documents
    res = parse_docs(get_auth, doc_id_list)

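    # poll the documents until every one reports progress == 1 (parsing finished)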
    start_ts = timer()
    while True:
        res = get_docs_info(get_auth, doc_id_list)
        finished_count = 0
        for doc_info in res['data']:
            if doc_info['progress'] == 1:
                finished_count += 1
        if finished_count == doc_count:
            break
        sleep(1)
    print('time cost {:.1f}s'.format(timer() - start_ts))

    # delete every dataset that was listed above
    for dataset_id in dataset_list:
        res = rm_dataset(get_auth, dataset_id)
        assert res.get("code") == 0, f"{res.get('message')}"
    print(f"{len(dataset_list)} datasets are deleted")