### What problem does this PR solve? Add sdk document test cases ### Type of change - [x] Add test casestags/v0.19.1
| self.progress = 0.0 | self.progress = 0.0 | ||||
| self.progress_msg = "" | self.progress_msg = "" | ||||
| self.process_begin_at = None | self.process_begin_at = None | ||||
| self.process_duration = 0.0 | |||||
| self.process_duation = 0.0 | |||||
| self.run = "0" | self.run = "0" | ||||
| self.status = "1" | self.status = "1" | ||||
| for k in list(res_dict.keys()): | for k in list(res_dict.keys()): |
| futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(count)] | futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(count)] | ||||
| responses = list(as_completed(futures)) | responses = list(as_completed(futures)) | ||||
| assert len(responses) == count, responses | assert len(responses) == count, responses | ||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| @pytest.mark.usefixtures("clear_datasets") | @pytest.mark.usefixtures("clear_datasets") |
| futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(count)] | futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(count)] | ||||
| responses = list(as_completed(futures)) | responses = list(as_completed(futures)) | ||||
| assert len(responses) == count, responses | assert len(responses) == count, responses | ||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| class TestDatasetsDelete: | class TestDatasetsDelete: |
| futures = [executor.submit(list_datasets, api_key) for i in range(count)] | futures = [executor.submit(list_datasets, api_key) for i in range(count)] | ||||
| responses = list(as_completed(futures)) | responses = list(as_completed(futures)) | ||||
| assert len(responses) == count, responses | assert len(responses) == count, responses | ||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| @pytest.mark.usefixtures("add_datasets") | @pytest.mark.usefixtures("add_datasets") |
| futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(count)] | futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(count)] | ||||
| responses = list(as_completed(futures)) | responses = list(as_completed(futures)) | ||||
| assert len(responses) == count, responses | assert len(responses) == count, responses | ||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| class TestDatasetUpdate: | class TestDatasetUpdate: |
| document_ids = bulk_upload_documents(api_key, dataset_id, 1, ragflow_tmp_dir) | document_ids = bulk_upload_documents(api_key, dataset_id, 1, ragflow_tmp_dir) | ||||
| def cleanup(): | def cleanup(): | ||||
| delete_documents(api_key, dataset_id, {"ids": document_ids}) | |||||
| delete_documents(api_key, dataset_id, {"ids": None}) | |||||
| request.addfinalizer(cleanup) | request.addfinalizer(cleanup) | ||||
| return dataset_id, document_ids[0] | return dataset_id, document_ids[0] | ||||
| document_ids = bulk_upload_documents(api_key, dataset_id, 5, ragflow_tmp_dir) | document_ids = bulk_upload_documents(api_key, dataset_id, 5, ragflow_tmp_dir) | ||||
| def cleanup(): | def cleanup(): | ||||
| delete_documents(api_key, dataset_id, {"ids": document_ids}) | |||||
| delete_documents(api_key, dataset_id, {"ids": None}) | |||||
| request.addfinalizer(cleanup) | request.addfinalizer(cleanup) | ||||
| return dataset_id, document_ids | return dataset_id, document_ids | ||||
| document_ids = bulk_upload_documents(api_key, dataset_id, 3, ragflow_tmp_dir) | document_ids = bulk_upload_documents(api_key, dataset_id, 3, ragflow_tmp_dir) | ||||
| def cleanup(): | def cleanup(): | ||||
| delete_documents(api_key, dataset_id, {"ids": document_ids}) | |||||
| delete_documents(api_key, dataset_id, {"ids": None}) | |||||
| request.addfinalizer(cleanup) | request.addfinalizer(cleanup) | ||||
| return dataset_id, document_ids | return dataset_id, document_ids |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from concurrent.futures import ThreadPoolExecutor | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | import pytest | ||||
| from common import INVALID_API_TOKEN, bulk_upload_documents, delete_documents, list_documents | from common import INVALID_API_TOKEN, bulk_upload_documents, delete_documents, list_documents | ||||
| @pytest.mark.p3 | @pytest.mark.p3 | ||||
| def test_concurrent_deletion(api_key, add_dataset, tmp_path): | def test_concurrent_deletion(api_key, add_dataset, tmp_path): | ||||
| documents_num = 100 | |||||
| count = 100 | |||||
| dataset_id = add_dataset | dataset_id = add_dataset | ||||
| document_ids = bulk_upload_documents(api_key, dataset_id, documents_num, tmp_path) | |||||
| document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path) | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | with ThreadPoolExecutor(max_workers=5) as executor: | ||||
| futures = [ | futures = [ | ||||
| dataset_id, | dataset_id, | ||||
| {"ids": document_ids[i : i + 1]}, | {"ids": document_ids[i : i + 1]}, | ||||
| ) | ) | ||||
| for i in range(documents_num) | |||||
| for i in range(count) | |||||
| ] | ] | ||||
| responses = [f.result() for f in futures] | |||||
| assert all(r["code"] == 0 for r in responses) | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| @pytest.mark.p3 | @pytest.mark.p3 |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from concurrent.futures import ThreadPoolExecutor | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | import pytest | ||||
| from common import INVALID_API_TOKEN, list_documents | from common import INVALID_API_TOKEN, list_documents | ||||
| @pytest.mark.p3 | @pytest.mark.p3 | ||||
| def test_concurrent_list(self, api_key, add_documents): | def test_concurrent_list(self, api_key, add_documents): | ||||
| dataset_id, _ = add_documents | dataset_id, _ = add_documents | ||||
| count = 100 | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | with ThreadPoolExecutor(max_workers=5) as executor: | ||||
| futures = [executor.submit(list_documents, api_key, dataset_id) for i in range(100)] | |||||
| responses = [f.result() for f in futures] | |||||
| assert all(r["code"] == 0 for r in responses) | |||||
| futures = [executor.submit(list_documents, api_key, dataset_id) for i in range(count)] | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| @pytest.mark.p3 | @pytest.mark.p3 | ||||
| def test_invalid_params(self, api_key, add_documents): | def test_invalid_params(self, api_key, add_documents): |
| # See the License for the specific language governing permissions and | # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from concurrent.futures import ThreadPoolExecutor | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | import pytest | ||||
| from common import INVALID_API_TOKEN, bulk_upload_documents, list_documents, parse_documents | from common import INVALID_API_TOKEN, bulk_upload_documents, list_documents, parse_documents | ||||
| return False | return False | ||||
| return True | return True | ||||
| document_num = 100 | |||||
| count = 100 | |||||
| dataset_id = add_dataset_func | dataset_id = add_dataset_func | ||||
| document_ids = bulk_upload_documents(api_key, dataset_id, document_num, tmp_path) | |||||
| document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path) | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | with ThreadPoolExecutor(max_workers=5) as executor: | ||||
| futures = [ | futures = [ | ||||
| dataset_id, | dataset_id, | ||||
| {"document_ids": document_ids[i : i + 1]}, | {"document_ids": document_ids[i : i + 1]}, | ||||
| ) | ) | ||||
| for i in range(document_num) | |||||
| for i in range(count) | |||||
| ] | ] | ||||
| responses = [f.result() for f in futures] | |||||
| assert all(r["code"] == 0 for r in responses) | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| condition(api_key, dataset_id, document_num) | |||||
| condition(api_key, dataset_id, count) | |||||
| validate_document_details(api_key, dataset_id, document_ids) | validate_document_details(api_key, dataset_id, document_ids) |
| futures = [executor.submit(upload_documents, api_key, dataset_id, fps[i : i + 1]) for i in range(count)] | futures = [executor.submit(upload_documents, api_key, dataset_id, fps[i : i + 1]) for i in range(count)] | ||||
| responses = list(as_completed(futures)) | responses = list(as_completed(futures)) | ||||
| assert len(responses) == count, responses | assert len(responses) == count, responses | ||||
| assert all(futures.result()["code"] == 0 for futures in futures) | |||||
| res = list_datasets(api_key, {"id": dataset_id}) | res = list_datasets(api_key, {"id": dataset_id}) | ||||
| assert res["data"][0]["document_count"] == count | assert res["data"][0]["document_count"] == count |
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from pathlib import Path | |||||
| from ragflow_sdk import DataSet, Document, RAGFlow | |||||
| from utils.file_utils import create_txt_file | |||||
| # DATASET MANAGEMENT | # DATASET MANAGEMENT | ||||
| def batch_create_datasets(client, num): | |||||
| def batch_create_datasets(client: RAGFlow, num: int) -> list[DataSet]: | |||||
| datasets = [] | datasets = [] | ||||
| for i in range(num): | for i in range(num): | ||||
| dataset = client.create_dataset(name=f"dataset_{i}") | dataset = client.create_dataset(name=f"dataset_{i}") | ||||
| datasets.append(dataset) | datasets.append(dataset) | ||||
| return datasets | return datasets | ||||
| # FILE MANAGEMENT WITHIN DATASET | |||||
| def bulk_upload_documents(dataset: DataSet, num: int, tmp_path: Path) -> list[Document]: | |||||
| document_infos = [] | |||||
| for i in range(num): | |||||
| fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| document_infos.append({"display_name": fp.name, "blob": blob}) | |||||
| return dataset.upload_documents(document_infos) |
| # limitations under the License. | # limitations under the License. | ||||
| # | # | ||||
| from pathlib import Path | |||||
| import pytest | import pytest | ||||
| from common import ( | from common import ( | ||||
| batch_create_datasets, | batch_create_datasets, | ||||
| bulk_upload_documents, | |||||
| ) | ) | ||||
| from configs import HOST_ADDRESS, VERSION | from configs import HOST_ADDRESS, VERSION | ||||
| from ragflow_sdk import RAGFlow | |||||
| from pytest import FixtureRequest | |||||
| from ragflow_sdk import DataSet, RAGFlow | |||||
| from utils import wait_for | |||||
| from utils.file_utils import ( | |||||
| create_docx_file, | |||||
| create_eml_file, | |||||
| create_excel_file, | |||||
| create_html_file, | |||||
| create_image_file, | |||||
| create_json_file, | |||||
| create_md_file, | |||||
| create_pdf_file, | |||||
| create_ppt_file, | |||||
| create_txt_file, | |||||
| ) | |||||
| @wait_for(30, 1, "Document parsing timeout") | |||||
| def condition(_dataset: DataSet): | |||||
| documents = DataSet.list_documents(page_size=1000) | |||||
| for document in documents: | |||||
| if document.run != "DONE": | |||||
| return False | |||||
| return True | |||||
| @pytest.fixture | |||||
| def generate_test_files(request, tmp_path): | |||||
| file_creators = { | |||||
| "docx": (tmp_path / "ragflow_test.docx", create_docx_file), | |||||
| "excel": (tmp_path / "ragflow_test.xlsx", create_excel_file), | |||||
| "ppt": (tmp_path / "ragflow_test.pptx", create_ppt_file), | |||||
| "image": (tmp_path / "ragflow_test.png", create_image_file), | |||||
| "pdf": (tmp_path / "ragflow_test.pdf", create_pdf_file), | |||||
| "txt": (tmp_path / "ragflow_test.txt", create_txt_file), | |||||
| "md": (tmp_path / "ragflow_test.md", create_md_file), | |||||
| "json": (tmp_path / "ragflow_test.json", create_json_file), | |||||
| "eml": (tmp_path / "ragflow_test.eml", create_eml_file), | |||||
| "html": (tmp_path / "ragflow_test.html", create_html_file), | |||||
| } | |||||
| files = {} | |||||
| for file_type, (file_path, creator_func) in file_creators.items(): | |||||
| if request.param in ["", file_type]: | |||||
| creator_func(file_path) | |||||
| files[file_type] = file_path | |||||
| return files | |||||
| @pytest.fixture(scope="class") | |||||
| def ragflow_tmp_dir(request, tmp_path_factory) -> Path: | |||||
| class_name = request.cls.__name__ | |||||
| return tmp_path_factory.mktemp(class_name) | |||||
| @pytest.fixture(scope="session") | @pytest.fixture(scope="session") | ||||
| def client(token): | |||||
| def client(token) -> RAGFlow: | |||||
| return RAGFlow(api_key=token, base_url=HOST_ADDRESS, version=VERSION) | return RAGFlow(api_key=token, base_url=HOST_ADDRESS, version=VERSION) | ||||
| @pytest.fixture(scope="function") | @pytest.fixture(scope="function") | ||||
| def clear_datasets(request, client): | |||||
| def clear_datasets(request: FixtureRequest, client: RAGFlow): | |||||
| def cleanup(): | def cleanup(): | ||||
| client.delete_datasets(ids=None) | client.delete_datasets(ids=None) | ||||
| request.addfinalizer(cleanup) | request.addfinalizer(cleanup) | ||||
| @pytest.fixture(scope="function") | |||||
| def add_dataset_func(request, client): | |||||
| @pytest.fixture(scope="class") | |||||
| def add_dataset(request: FixtureRequest, client: RAGFlow): | |||||
| def cleanup(): | def cleanup(): | ||||
| client.delete_datasets(ids=None) | client.delete_datasets(ids=None) | ||||
| request.addfinalizer(cleanup) | request.addfinalizer(cleanup) | ||||
| dataset_ids = batch_create_datasets(client, 1) | |||||
| return dataset_ids[0] | |||||
| @pytest.fixture(scope="function") | |||||
| def add_dataset_func(request: FixtureRequest, client: RAGFlow) -> DataSet: | |||||
| def cleanup(): | |||||
| client.delete_datasets(ids=None) | |||||
| request.addfinalizer(cleanup) | |||||
| return batch_create_datasets(client, 1)[0] | return batch_create_datasets(client, 1)[0] | ||||
| @pytest.fixture(scope="class") | |||||
| def add_document(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir): | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir) | |||||
| def cleanup(): | |||||
| dataset.delete_documents(ids=None) | |||||
| request.addfinalizer(cleanup) | |||||
| return dataset, documents[0] |
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import pytest | |||||
| from common import bulk_upload_documents | |||||
| from pytest import FixtureRequest | |||||
| from ragflow_sdk import DataSet, Document | |||||
| @pytest.fixture(scope="function") | |||||
| def add_document_func(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, Document]: | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir) | |||||
| def cleanup(): | |||||
| dataset.delete_documents(ids=None) | |||||
| request.addfinalizer(cleanup) | |||||
| return dataset, documents[0] | |||||
| @pytest.fixture(scope="class") | |||||
| def add_documents(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]: | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, 5, ragflow_tmp_dir) | |||||
| def cleanup(): | |||||
| dataset.delete_documents(ids=None) | |||||
| request.addfinalizer(cleanup) | |||||
| return dataset, documents | |||||
| @pytest.fixture(scope="function") | |||||
| def add_documents_func(request: FixtureRequest, add_dataset_func: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]: | |||||
| dataset = add_dataset_func | |||||
| documents = bulk_upload_documents(dataset, 3, ragflow_tmp_dir) | |||||
| def cleanup(): | |||||
| dataset.delete_documents(ids=None) | |||||
| request.addfinalizer(cleanup) | |||||
| return dataset, documents |
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | |||||
| from common import bulk_upload_documents | |||||
| class TestDocumentsDeletion: | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "payload, expected_message, remaining", | |||||
| [ | |||||
| ({"ids": None}, "", 0), | |||||
| ({"ids": []}, "", 0), | |||||
| ({"ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", 3), | |||||
| ({"ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", 3), | |||||
| ("not json", "must be a mapping", 3), | |||||
| (lambda r: {"ids": r[:1]}, "", 2), | |||||
| (lambda r: {"ids": r}, "", 0), | |||||
| ], | |||||
| ) | |||||
| def test_basic_scenarios( | |||||
| self, | |||||
| add_documents_func, | |||||
| payload, | |||||
| expected_message, | |||||
| remaining, | |||||
| ): | |||||
| dataset, documents = add_documents_func | |||||
| if callable(payload): | |||||
| payload = payload([document.id for document in documents]) | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.delete_documents(**payload) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| dataset.delete_documents(**payload) | |||||
| documents = dataset.list_documents() | |||||
| assert len(documents) == remaining, str(documents) | |||||
| @pytest.mark.p2 | |||||
| @pytest.mark.parametrize( | |||||
| "payload", | |||||
| [ | |||||
| lambda r: {"ids": ["invalid_id"] + r}, | |||||
| lambda r: {"ids": r[:1] + ["invalid_id"] + r[1:3]}, | |||||
| lambda r: {"ids": r + ["invalid_id"]}, | |||||
| ], | |||||
| ) | |||||
| def test_delete_partial_invalid_id(self, add_documents_func, payload): | |||||
| dataset, documents = add_documents_func | |||||
| payload = payload([document.id for document in documents]) | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.delete_documents(**payload) | |||||
| assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value) | |||||
| documents = dataset.list_documents() | |||||
| assert len(documents) == 0, str(documents) | |||||
| @pytest.mark.p2 | |||||
| def test_repeated_deletion(self, add_documents_func): | |||||
| dataset, documents = add_documents_func | |||||
| document_ids = [document.id for document in documents] | |||||
| dataset.delete_documents(ids=document_ids) | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.delete_documents(ids=document_ids) | |||||
| assert "Documents not found" in str(excinfo.value), str(excinfo.value) | |||||
| @pytest.mark.p2 | |||||
| def test_duplicate_deletion(self, add_documents_func): | |||||
| dataset, documents = add_documents_func | |||||
| document_ids = [document.id for document in documents] | |||||
| dataset.delete_documents(ids=document_ids + document_ids) | |||||
| assert len(dataset.list_documents()) == 0, str(dataset.list_documents()) | |||||
| @pytest.mark.p3 | |||||
| def test_concurrent_deletion(add_dataset, tmp_path): | |||||
| count = 100 | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, count, tmp_path) | |||||
| def delete_doc(doc_id): | |||||
| dataset.delete_documents(ids=[doc_id]) | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | |||||
| futures = [executor.submit(delete_doc, doc.id) for doc in documents] | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| @pytest.mark.p3 | |||||
| def test_delete_1k(add_dataset, tmp_path): | |||||
| count = 1_000 | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, count, tmp_path) | |||||
| assert len(dataset.list_documents(page_size=count * 2)) == count | |||||
| dataset.delete_documents(ids=[doc.id for doc in documents]) | |||||
| assert len(dataset.list_documents()) == 0 |
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | |||||
| from common import bulk_upload_documents | |||||
| from utils import compare_by_hash | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "generate_test_files", | |||||
| [ | |||||
| "docx", | |||||
| "excel", | |||||
| "ppt", | |||||
| "image", | |||||
| "pdf", | |||||
| "txt", | |||||
| "md", | |||||
| "json", | |||||
| "eml", | |||||
| "html", | |||||
| ], | |||||
| indirect=True, | |||||
| ) | |||||
| def test_file_type_validation(add_dataset, generate_test_files, request): | |||||
| dataset = add_dataset | |||||
| fp = generate_test_files[request.node.callspec.params["generate_test_files"]] | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| for document in documents: | |||||
| with fp.with_stem("ragflow_test_download").open("wb") as f: | |||||
| f.write(document.download()) | |||||
| assert compare_by_hash(fp, fp.with_stem("ragflow_test_download")) | |||||
| class TestDocumentDownload: | |||||
| @pytest.mark.p3 | |||||
| def test_same_file_repeat(self, add_documents, tmp_path, ragflow_tmp_dir): | |||||
| num = 5 | |||||
| _, documents = add_documents | |||||
| for i in range(num): | |||||
| download_path = tmp_path / f"ragflow_test_download_{i}.txt" | |||||
| with download_path.open("wb") as f: | |||||
| f.write(documents[0].download()) | |||||
| assert compare_by_hash(ragflow_tmp_dir / "ragflow_test_upload_0.txt", download_path), f"Downloaded file {i} does not match original" | |||||
| @pytest.mark.p3 | |||||
| def test_concurrent_download(add_dataset, tmp_path): | |||||
| count = 20 | |||||
| dataset = add_dataset | |||||
| documents = bulk_upload_documents(dataset, count, tmp_path) | |||||
| def download_doc(document, i): | |||||
| download_path = tmp_path / f"ragflow_test_download_{i}.txt" | |||||
| with download_path.open("wb") as f: | |||||
| f.write(document.download()) | |||||
| # assert compare_by_hash(tmp_path / f"ragflow_test_upload_{i}.txt", download_path), str(download_path) | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | |||||
| futures = [executor.submit(download_doc, documents[i], i) for i in range(count)] | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| for i in range(count): | |||||
| assert compare_by_hash( | |||||
| tmp_path / f"ragflow_test_upload_{i}.txt", | |||||
| tmp_path / f"ragflow_test_download_{i}.txt", | |||||
| ) |
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | |||||
| class TestDocumentsList: | |||||
| @pytest.mark.p1 | |||||
| def test_default(self, add_documents): | |||||
| dataset, _ = add_documents | |||||
| documents = dataset.list_documents() | |||||
| assert len(documents) == 5, str(documents) | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_page_size, expected_message", | |||||
| [ | |||||
| ({"page": None, "page_size": 2}, 2, "not instance of"), | |||||
| ({"page": 0, "page_size": 2}, 2, ""), | |||||
| ({"page": 2, "page_size": 2}, 2, ""), | |||||
| ({"page": 3, "page_size": 2}, 1, ""), | |||||
| ({"page": "3", "page_size": 2}, 1, "not instance of"), | |||||
| pytest.param( | |||||
| {"page": -1, "page_size": 2}, | |||||
| 0, | |||||
| "Invalid page number", | |||||
| marks=pytest.mark.skip(reason="issues/5851"), | |||||
| ), | |||||
| pytest.param( | |||||
| {"page": "a", "page_size": 2}, | |||||
| 0, | |||||
| "Invalid page value", | |||||
| marks=pytest.mark.skip(reason="issues/5851"), | |||||
| ), | |||||
| ], | |||||
| ) | |||||
| def test_page(self, add_documents, params, expected_page_size, expected_message): | |||||
| dataset, _ = add_documents | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_page_size, str(documents) | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_page_size, expected_message", | |||||
| [ | |||||
| ({"page_size": None}, 5, "not instance of"), | |||||
| ({"page_size": 0}, 0, ""), | |||||
| ({"page_size": 1}, 1, ""), | |||||
| ({"page_size": 6}, 5, ""), | |||||
| ({"page_size": "1"}, 1, "not instance of"), | |||||
| pytest.param( | |||||
| {"page_size": -1}, | |||||
| 0, | |||||
| "Invalid page size", | |||||
| marks=pytest.mark.skip(reason="issues/5851"), | |||||
| ), | |||||
| pytest.param( | |||||
| {"page_size": "a"}, | |||||
| 0, | |||||
| "Invalid page size value", | |||||
| marks=pytest.mark.skip(reason="issues/5851"), | |||||
| ), | |||||
| ], | |||||
| ) | |||||
| def test_page_size(self, add_documents, params, expected_page_size, expected_message): | |||||
| dataset, _ = add_documents | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_page_size, str(documents) | |||||
| @pytest.mark.p3 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_message", | |||||
| [ | |||||
| ({"orderby": None}, "not instance of"), | |||||
| ({"orderby": "create_time"}, ""), | |||||
| ({"orderby": "update_time"}, ""), | |||||
| pytest.param({"orderby": "name", "desc": "False"}, "", marks=pytest.mark.skip(reason="issues/5851")), | |||||
| pytest.param({"orderby": "unknown"}, "orderby should be create_time or update_time", marks=pytest.mark.skip(reason="issues/5851")), | |||||
| ], | |||||
| ) | |||||
| def test_orderby(self, add_documents, params, expected_message): | |||||
| dataset, _ = add_documents | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| dataset.list_documents(**params) | |||||
| @pytest.mark.p3 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_message", | |||||
| [ | |||||
| ({"desc": None}, "not instance of"), | |||||
| ({"desc": "true"}, "not instance of"), | |||||
| ({"desc": "True"}, "not instance of"), | |||||
| ({"desc": True}, ""), | |||||
| pytest.param({"desc": "false"}, "", marks=pytest.mark.skip(reason="issues/5851")), | |||||
| ({"desc": "False"}, "not instance of"), | |||||
| ({"desc": False}, ""), | |||||
| ({"desc": "False", "orderby": "update_time"}, "not instance of"), | |||||
| pytest.param({"desc": "unknown"}, "desc should be true or false", marks=pytest.mark.skip(reason="issues/5851")), | |||||
| ], | |||||
| ) | |||||
| def test_desc(self, add_documents, params, expected_message): | |||||
| dataset, _ = add_documents | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| dataset.list_documents(**params) | |||||
| @pytest.mark.p2 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_num", | |||||
| [ | |||||
| ({"keywords": None}, 5), | |||||
| ({"keywords": ""}, 5), | |||||
| ({"keywords": "0"}, 1), | |||||
| ({"keywords": "ragflow_test_upload"}, 5), | |||||
| ({"keywords": "unknown"}, 0), | |||||
| ], | |||||
| ) | |||||
| def test_keywords(self, add_documents, params, expected_num): | |||||
| dataset, _ = add_documents | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_num, str(documents) | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "params, expected_num, expected_message", | |||||
| [ | |||||
| ({"name": None}, 5, ""), | |||||
| ({"name": ""}, 5, ""), | |||||
| ({"name": "ragflow_test_upload_0.txt"}, 1, ""), | |||||
| ({"name": "unknown.txt"}, 0, "You don't own the document unknown.txt"), | |||||
| ], | |||||
| ) | |||||
| def test_name(self, add_documents, params, expected_num, expected_message): | |||||
| dataset, _ = add_documents | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_num, str(documents) | |||||
| if params["name"] not in [None, ""]: | |||||
| assert documents[0].name == params["name"], str(documents) | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "document_id, expected_num, expected_message", | |||||
| [ | |||||
| (None, 5, ""), | |||||
| ("", 5, ""), | |||||
| (lambda docs: docs[0].id, 1, ""), | |||||
| ("unknown.txt", 0, "You don't own the document unknown.txt"), | |||||
| ], | |||||
| ) | |||||
| def test_id(self, add_documents, document_id, expected_num, expected_message): | |||||
| dataset, documents = add_documents | |||||
| if callable(document_id): | |||||
| params = {"id": document_id(documents)} | |||||
| else: | |||||
| params = {"id": document_id} | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_num, str(documents) | |||||
| if params["id"] not in [None, ""]: | |||||
| assert documents[0].id == params["id"], str(documents) | |||||
| @pytest.mark.p3 | |||||
| @pytest.mark.parametrize( | |||||
| "document_id, name, expected_num, expected_message", | |||||
| [ | |||||
| (lambda docs: docs[0].id, "ragflow_test_upload_0.txt", 1, ""), | |||||
| (lambda docs: docs[0].id, "ragflow_test_upload_1.txt", 0, ""), | |||||
| (lambda docs: docs[0].id, "unknown", 0, "You don't own the document unknown"), | |||||
| ("invalid_id", "ragflow_test_upload_0.txt", 0, "You don't own the document invalid_id"), | |||||
| ], | |||||
| ) | |||||
| def test_name_and_id(self, add_documents, document_id, name, expected_num, expected_message): | |||||
| dataset, documents = add_documents | |||||
| params = {"id": document_id(documents) if callable(document_id) else document_id, "name": name} | |||||
| if expected_message: | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert expected_message in str(excinfo.value), str(excinfo.value) | |||||
| else: | |||||
| documents = dataset.list_documents(**params) | |||||
| assert len(documents) == expected_num, str(documents) | |||||
| @pytest.mark.p3 | |||||
| def test_concurrent_list(self, add_documents): | |||||
| dataset, _ = add_documents | |||||
| count = 100 | |||||
| def list_docs(): | |||||
| return dataset.list_documents() | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | |||||
| futures = [executor.submit(list_docs) for _ in range(count)] | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| for future in futures: | |||||
| docs = future.result() | |||||
| assert len(docs) == 5, str(docs) | |||||
| @pytest.mark.p3 | |||||
| def test_invalid_params(self, add_documents): | |||||
| dataset, _ = add_documents | |||||
| params = {"a": "b"} | |||||
| with pytest.raises(TypeError) as excinfo: | |||||
| dataset.list_documents(**params) | |||||
| assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value) |
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | |||||
| from common import bulk_upload_documents | |||||
| from ragflow_sdk import DataSet | |||||
| from utils import wait_for | |||||
@wait_for(30, 1, "Document parsing timeout")
def condition(_dataset: DataSet, _document_ids=None):
    """Poll helper: return True once parsing has finished.

    With ``_document_ids`` None every document in the dataset must have
    ``run == "DONE"``; otherwise only the listed documents are checked.
    A target id absent from the listing is treated as done (same behavior
    as the original loop, which simply never inspected it).
    """
    documents = _dataset.list_documents(page_size=1000)
    if _document_ids is None:
        # Idiomatic all() replaces the manual early-return loop.
        return all(document.run == "DONE" for document in documents)
    target_ids = set(_document_ids)
    return all(document.run == "DONE" for document in documents if document.id in target_ids)
def validate_document_details(dataset, document_ids):
    """Assert that each document in `document_ids` reports a completed parse."""
    for document in dataset.list_documents(page_size=1000):
        if document.id not in document_ids:
            continue
        assert document.run == "DONE"
        assert len(document.process_begin_at) > 0
        # `process_duation` mirrors the SDK's attribute spelling (sic).
        assert document.process_duation > 0
        assert document.progress > 0
        assert "Task done" in document.progress_msg
class TestDocumentsParse:
    """Tests for DataSet.async_parse_documents (happy path and bad ids)."""

    @pytest.mark.parametrize(
        "payload, expected_message",
        [
            pytest.param(None, "AttributeError", marks=pytest.mark.skip),
            pytest.param({"document_ids": []}, "`document_ids` is required", marks=pytest.mark.p1),
            pytest.param({"document_ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", marks=pytest.mark.p3),
            pytest.param({"document_ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", marks=pytest.mark.p3),
            pytest.param("not json", "AttributeError", marks=pytest.mark.skip),
            pytest.param(lambda r: {"document_ids": r[:1]}, "", marks=pytest.mark.p1),
            pytest.param(lambda r: {"document_ids": r}, "", marks=pytest.mark.p1),
        ],
    )
    def test_basic_scenarios(self, add_documents_func, payload, expected_message):
        """Parse with valid/invalid payloads; callables build ids from uploads."""
        dataset, documents = add_documents_func
        if callable(payload):
            payload = payload([doc.id for doc in documents])
        if expected_message:
            with pytest.raises(Exception) as excinfo:
                dataset.async_parse_documents(**payload)
            assert expected_message in str(excinfo.value), str(excinfo.value)
        else:
            dataset.async_parse_documents(**payload)
            # Wait for parsing, then verify per-document completion details.
            condition(dataset, payload["document_ids"])
            validate_document_details(dataset, payload["document_ids"])

    @pytest.mark.parametrize(
        "payload",
        [
            pytest.param(lambda r: {"document_ids": ["invalid_id"] + r}, marks=pytest.mark.p3),
            pytest.param(lambda r: {"document_ids": r[:1] + ["invalid_id"] + r[1:3]}, marks=pytest.mark.p1),
            pytest.param(lambda r: {"document_ids": r + ["invalid_id"]}, marks=pytest.mark.p3),
        ],
    )
    def test_parse_partial_invalid_document_id(self, add_documents_func, payload):
        """A bad id anywhere in the list raises, but valid docs still parse."""
        dataset, documents = add_documents_func
        document_ids = [doc.id for doc in documents]
        payload = payload(document_ids)
        with pytest.raises(Exception) as excinfo:
            dataset.async_parse_documents(**payload)
        assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value)
        condition(dataset, document_ids)
        validate_document_details(dataset, document_ids)

    @pytest.mark.p3
    def test_repeated_parse(self, add_documents_func):
        """Re-parsing already-parsed documents must not raise."""
        dataset, documents = add_documents_func
        document_ids = [doc.id for doc in documents]
        dataset.async_parse_documents(document_ids=document_ids)
        condition(dataset, document_ids)
        dataset.async_parse_documents(document_ids=document_ids)

    @pytest.mark.p3
    def test_duplicate_parse(self, add_documents_func):
        """Duplicate ids in a single request are tolerated."""
        dataset, documents = add_documents_func
        document_ids = [doc.id for doc in documents]
        dataset.async_parse_documents(document_ids=document_ids + document_ids)
        condition(dataset, document_ids)
        validate_document_details(dataset, document_ids)
@pytest.mark.p3
def test_parse_100_files(add_dataset_func, tmp_path):
    """Bulk-upload 100 files and parse them all in a single request."""
    dataset = add_dataset_func
    uploaded = bulk_upload_documents(dataset, 100, tmp_path)
    ids = [document.id for document in uploaded]
    dataset.async_parse_documents(document_ids=ids)
    condition(dataset, ids)
    validate_document_details(dataset, ids)
@pytest.mark.p3
def test_concurrent_parse(add_dataset_func, tmp_path):
    """Parse 100 uploaded files concurrently, one request per document."""
    total = 100
    dataset = add_dataset_func
    uploaded = bulk_upload_documents(dataset, total, tmp_path)
    ids = [document.id for document in uploaded]

    def parse_one(doc_id):
        dataset.async_parse_documents(document_ids=[doc_id])

    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(parse_one, document.id) for document in uploaded]
        finished = list(as_completed(futures))
        assert len(finished) == total, finished
    condition(dataset, ids)
    validate_document_details(dataset, ids)
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import pytest | |||||
def validate_document_parse_done(dataset, document_ids):
    """Assert every document in `document_ids` finished parsing successfully."""
    for document in dataset.list_documents(page_size=1000):
        if document.id not in document_ids:
            continue
        assert document.run == "DONE"
        assert len(document.process_begin_at) > 0
        # `process_duation` is the SDK's attribute spelling (sic).
        assert document.process_duation > 0
        assert document.progress > 0
        assert "Task done" in document.progress_msg
def validate_document_parse_cancel(dataset, document_ids):
    """Assert each document in `document_ids` was cancelled mid-parse.

    Fix: the `document_ids` parameter was previously ignored, so the checks
    ran against every document in the dataset and failed whenever unrelated
    documents existed. Now only the requested ids are validated, mirroring
    validate_document_parse_done above.
    """
    documents = dataset.list_documents(page_size=1000)
    for document in documents:
        if document.id not in document_ids:
            continue
        assert document.run == "CANCEL"
        assert len(document.process_begin_at) > 0
        assert document.progress == 0.0
@pytest.mark.skip
class TestDocumentsParseStop:
    # Placeholder: stop-parsing scenarios are not implemented yet; the
    # validate_document_parse_* helpers above are staged for these tests.
    pass
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import pytest | |||||
| from configs import DOCUMENT_NAME_LIMIT | |||||
| from ragflow_sdk import DataSet | |||||
class TestDocumentsUpdated:
    """Tests for Document.update: rename, meta_fields, chunk_method, and
    rejection of read-only fields."""

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "name, expected_message",
        [
            ("new_name.txt", ""),
            (f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt", "The name should be less than 128 bytes"),
            (0, "AttributeError"),
            (None, "AttributeError"),
            ("", "The extension of file can't be changed"),
            ("ragflow_test_upload_0", "The extension of file can't be changed"),
            ("ragflow_test_upload_1.txt", "Duplicated document name in the same dataset"),
            ("RAGFLOW_TEST_UPLOAD_1.TXT", ""),
        ],
    )
    def test_name(self, add_documents, name, expected_message):
        """Rename validation: length, extension-preservation and uniqueness."""
        dataset, documents = add_documents
        document = documents[0]
        if expected_message:
            with pytest.raises(Exception) as excinfo:
                document.update({"name": name})
            assert expected_message in str(excinfo.value), str(excinfo.value)
        else:
            document.update({"name": name})
            # Re-fetch to confirm the rename was persisted server-side.
            updated_doc = dataset.list_documents(id=document.id)[0]
            assert updated_doc.name == name, str(updated_doc)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "meta_fields, expected_message",
        [
            ({"test": "test"}, ""),
            ("test", "meta_fields must be a dictionary"),
        ],
    )
    def test_meta_fields(self, add_documents, meta_fields, expected_message):
        """meta_fields must be a dict; any other type is rejected."""
        _, documents = add_documents
        document = documents[0]
        if expected_message:
            with pytest.raises(Exception) as excinfo:
                document.update({"meta_fields": meta_fields})
            assert expected_message in str(excinfo.value), str(excinfo.value)
        else:
            document.update({"meta_fields": meta_fields})

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "chunk_method, expected_message",
        [
            ("naive", ""),
            ("manual", ""),
            ("qa", ""),
            ("table", ""),
            ("paper", ""),
            ("book", ""),
            ("laws", ""),
            ("presentation", ""),
            ("picture", ""),
            ("one", ""),
            ("knowledge_graph", ""),
            ("email", ""),
            ("tag", ""),
            ("", "`chunk_method` doesn't exist"),
            ("other_chunk_method", "`chunk_method` other_chunk_method doesn't exist"),
        ],
    )
    def test_chunk_method(self, add_documents, chunk_method, expected_message):
        """Only the documented chunk methods are accepted on update."""
        dataset, documents = add_documents
        document = documents[0]
        if expected_message:
            with pytest.raises(Exception) as excinfo:
                document.update({"chunk_method": chunk_method})
            assert expected_message in str(excinfo.value), str(excinfo.value)
        else:
            document.update({"chunk_method": chunk_method})
            updated_doc = dataset.list_documents(id=document.id)[0]
            assert updated_doc.chunk_method == chunk_method, str(updated_doc)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "payload, expected_message",
        [
            ({"chunk_count": 1}, "Can't change `chunk_count`"),
            pytest.param(
                {"create_date": "Fri, 14 Mar 2025 16:53:42 GMT"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"create_time": 1},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"created_by": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"dataset_id": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"id": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"location": "ragflow_test.txt"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"process_begin_at": 1},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            # `process_duation` matches the SDK's attribute spelling (sic).
            pytest.param(
                {"process_duation": 1.0},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            ({"progress": 1.0}, "Can't change `progress`"),
            pytest.param(
                {"progress_msg": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"run": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"size": 1},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"source_type": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"thumbnail": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            ({"token_count": 1}, "Can't change `token_count`"),
            pytest.param(
                {"type": "ragflow_test"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"update_date": "Fri, 14 Mar 2025 16:33:17 GMT"},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
            pytest.param(
                {"update_time": 1},
                "The input parameters are invalid",
                marks=pytest.mark.skip(reason="issues/6104"),
            ),
        ],
    )
    def test_invalid_field(self, add_documents, payload, expected_message):
        """Read-only and server-managed fields cannot be updated."""
        _, documents = add_documents
        document = documents[0]
        with pytest.raises(Exception) as excinfo:
            document.update(payload)
        assert expected_message in str(excinfo.value), str(excinfo.value)
class TestUpdateDocumentParserConfig:
    """Tests for updating a document's parser_config alongside chunk_method."""

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "chunk_method, parser_config, expected_message",
        [
            ("naive", {}, ""),
            (
                "naive",
                {
                    "chunk_token_num": 128,
                    "layout_recognize": "DeepDOC",
                    "html4excel": False,
                    "delimiter": r"\n",
                    "task_page_size": 12,
                    "raptor": {"use_raptor": False},
                },
                "",
            ),
            pytest.param(
                "naive",
                {"chunk_token_num": -1},
                "chunk_token_num should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"chunk_token_num": 0},
                "chunk_token_num should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"chunk_token_num": 100000000},
                "chunk_token_num should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"chunk_token_num": 3.14},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"chunk_token_num": "1024"},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            ("naive", {"layout_recognize": "DeepDOC"}, ""),
            ("naive", {"layout_recognize": "Naive"}, ""),
            ("naive", {"html4excel": True}, ""),
            ("naive", {"html4excel": False}, ""),
            pytest.param(
                "naive",
                {"html4excel": 1},
                "html4excel should be True or False",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            ("naive", {"delimiter": ""}, ""),
            ("naive", {"delimiter": "`##`"}, ""),
            pytest.param(
                "naive",
                {"delimiter": 1},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"task_page_size": -1},
                "task_page_size should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"task_page_size": 0},
                "task_page_size should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"task_page_size": 100000000},
                "task_page_size should be in range from 1 to 100000000",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"task_page_size": 3.14},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"task_page_size": "1024"},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            ("naive", {"raptor": {"use_raptor": True}}, ""),
            ("naive", {"raptor": {"use_raptor": False}}, ""),
            pytest.param(
                "naive",
                {"invalid_key": "invalid_value"},
                "Abnormal 'parser_config'. Invalid key: invalid_key",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_keywords": -1},
                "auto_keywords should be in range from 0 to 32",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_keywords": 32},
                "auto_keywords should be in range from 0 to 32",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_keywords": 3.14},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_keywords": "1024"},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_questions": -1},
                "auto_questions should be in range from 0 to 10",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_questions": 10},
                "auto_questions should be in range from 0 to 10",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_questions": 3.14},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"auto_questions": "1024"},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"topn_tags": -1},
                "topn_tags should be in range from 0 to 10",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"topn_tags": 10},
                "topn_tags should be in range from 0 to 10",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"topn_tags": 3.14},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
            pytest.param(
                "naive",
                {"topn_tags": "1024"},
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
        ],
    )
    def test_parser_config(self, client, add_documents, chunk_method, parser_config, expected_message):
        """Set parser_config values and verify they round-trip; an empty
        config resets the document to the server-side defaults."""
        dataset, documents = add_documents
        document = documents[0]
        # Local import: attrgetter is only needed by this test's assertions.
        from operator import attrgetter
        update_data = {"chunk_method": chunk_method, "parser_config": parser_config}
        if expected_message:
            with pytest.raises(Exception) as excinfo:
                document.update(update_data)
            assert expected_message in str(excinfo.value), str(excinfo.value)
        else:
            document.update(update_data)
            updated_doc = dataset.list_documents(id=document.id)[0]
            if parser_config:
                # Every submitted key (including nested dicts like `raptor`)
                # must be reflected verbatim in the stored parser_config.
                for k, v in parser_config.items():
                    if isinstance(v, dict):
                        for kk, vv in v.items():
                            assert attrgetter(f"{k}.{kk}")(updated_doc.parser_config) == vv, str(updated_doc)
                    else:
                        assert attrgetter(k)(updated_doc.parser_config) == v, str(updated_doc)
            else:
                # An empty parser_config falls back to the documented defaults.
                expected_config = DataSet.ParserConfig(
                    client,
                    {
                        "chunk_token_num": 128,
                        "delimiter": r"\n",
                        "html4excel": False,
                        "layout_recognize": "DeepDOC",
                        "raptor": {"use_raptor": False},
                    },
                )
                assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| import string | |||||
| from concurrent.futures import ThreadPoolExecutor, as_completed | |||||
| import pytest | |||||
| from configs import DOCUMENT_NAME_LIMIT | |||||
| from utils.file_utils import create_txt_file | |||||
| class TestDocumentsUpload: | |||||
| @pytest.mark.p1 | |||||
| def test_valid_single_upload(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = create_txt_file(tmp_path / "ragflow_test.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| for document in documents: | |||||
| assert document.dataset_id == dataset.id, str(document) | |||||
| assert document.name == fp.name, str(document) | |||||
| @pytest.mark.p1 | |||||
| @pytest.mark.parametrize( | |||||
| "generate_test_files", | |||||
| [ | |||||
| "docx", | |||||
| "excel", | |||||
| "ppt", | |||||
| "image", | |||||
| "pdf", | |||||
| "txt", | |||||
| "md", | |||||
| "json", | |||||
| "eml", | |||||
| "html", | |||||
| ], | |||||
| indirect=True, | |||||
| ) | |||||
| def test_file_type_validation(self, add_dataset_func, generate_test_files, request): | |||||
| dataset = add_dataset_func | |||||
| fp = generate_test_files[request.node.callspec.params["generate_test_files"]] | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| for document in documents: | |||||
| assert document.dataset_id == dataset.id, str(document) | |||||
| assert document.name == fp.name, str(document) | |||||
| @pytest.mark.p2 | |||||
| @pytest.mark.parametrize( | |||||
| "file_type", | |||||
| ["exe", "unknown"], | |||||
| ) | |||||
| def test_unsupported_file_type(self, add_dataset_func, tmp_path, file_type): | |||||
| dataset = add_dataset_func | |||||
| fp = tmp_path / f"ragflow_test.{file_type}" | |||||
| fp.touch() | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| assert str(excinfo.value) == f"ragflow_test.{file_type}: This type of file has not been supported yet!", str(excinfo.value) | |||||
| @pytest.mark.p2 | |||||
| def test_missing_file(self, add_dataset_func): | |||||
| dataset = add_dataset_func | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.upload_documents([]) | |||||
| assert str(excinfo.value) == "No file part!", str(excinfo.value) | |||||
| @pytest.mark.p3 | |||||
| def test_empty_file(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = tmp_path / "empty.txt" | |||||
| fp.touch() | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| for document in documents: | |||||
| assert document.size == 0, str(document) | |||||
| @pytest.mark.p3 | |||||
| def test_filename_empty(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = create_txt_file(tmp_path / "ragflow_test.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.upload_documents([{"display_name": "", "blob": blob}]) | |||||
| assert str(excinfo.value) == "No file selected!", str(excinfo.value) | |||||
| @pytest.mark.p2 | |||||
| def test_filename_exceeds_max_length(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| with pytest.raises(Exception) as excinfo: | |||||
| dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| assert str(excinfo.value) == "File name should be less than 128 bytes.", str(excinfo.value) | |||||
| @pytest.mark.p2 | |||||
| def test_duplicate_files(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = create_txt_file(tmp_path / "ragflow_test.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}, {"display_name": fp.name, "blob": blob}]) | |||||
| assert len(documents) == 2, str(documents) | |||||
| for i, document in enumerate(documents): | |||||
| assert document.dataset_id == dataset.id, str(document) | |||||
| expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}" | |||||
| assert document.name == expected_name, str(document) | |||||
| @pytest.mark.p2 | |||||
| def test_same_file_repeat(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| fp = create_txt_file(tmp_path / "ragflow_test.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| for i in range(3): | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| assert len(documents) == 1, str(documents) | |||||
| document = documents[0] | |||||
| assert document.dataset_id == dataset.id, str(document) | |||||
| expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}" | |||||
| assert document.name == expected_name, str(document) | |||||
| @pytest.mark.p3 | |||||
| def test_filename_special_characters(self, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| illegal_chars = '<>:"/\\|?*' | |||||
| translation_table = str.maketrans({char: "_" for char in illegal_chars}) | |||||
| safe_filename = string.punctuation.translate(translation_table) | |||||
| fp = tmp_path / f"{safe_filename}.txt" | |||||
| fp.write_text("Sample text content") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| assert len(documents) == 1, str(documents) | |||||
| document = documents[0] | |||||
| assert document.dataset_id == dataset.id, str(document) | |||||
| assert document.name == fp.name, str(document) | |||||
| @pytest.mark.p1 | |||||
| def test_multiple_files(self, client, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| expected_document_count = 20 | |||||
| document_infos = [] | |||||
| for i in range(expected_document_count): | |||||
| fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt") | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| document_infos.append({"display_name": fp.name, "blob": blob}) | |||||
| documents = dataset.upload_documents(document_infos) | |||||
| assert len(documents) == expected_document_count, str(documents) | |||||
| retrieved_dataset = client.get_dataset(name=dataset.name) | |||||
| assert retrieved_dataset.document_count == expected_document_count, str(retrieved_dataset) | |||||
| @pytest.mark.p3 | |||||
| def test_concurrent_upload(self, client, add_dataset_func, tmp_path): | |||||
| dataset = add_dataset_func | |||||
| count = 20 | |||||
| fps = [create_txt_file(tmp_path / f"ragflow_test_{i}.txt") for i in range(count)] | |||||
| def upload_file(fp): | |||||
| with fp.open("rb") as f: | |||||
| blob = f.read() | |||||
| return dataset.upload_documents([{"display_name": fp.name, "blob": blob}]) | |||||
| with ThreadPoolExecutor(max_workers=5) as executor: | |||||
| futures = [executor.submit(upload_file, fp) for fp in fps] | |||||
| responses = list(as_completed(futures)) | |||||
| assert len(responses) == count, responses | |||||
| retrieved_dataset = client.get_dataset(name=dataset.name) | |||||
| assert retrieved_dataset.document_count == count, str(retrieved_dataset) |