Browse Source

Test: add sdk Document test cases (#8094)

### What problem does this PR solve?

Add sdk document test cases

### Type of change

- [x] Add test cases
tags/v0.19.1
Liu An 4 months ago
parent
commit
cc1b2c8f09
No account linked to committer's email address
20 changed files with 1450 additions and 27 deletions
  1. 1
    1
      sdk/python/ragflow_sdk/modules/document.py
  2. 1
    0
      test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
  3. 1
    0
      test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
  4. 1
    0
      test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
  5. 1
    0
      test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
  6. 3
    3
      test/testcases/test_http_api/test_file_management_within_dataset/conftest.py
  7. 8
    6
      test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py
  8. 6
    4
      test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py
  9. 8
    7
      test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py
  10. 1
    0
      test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py
  11. 19
    1
      test/testcases/test_sdk_api/common.py
  12. 82
    5
      test/testcases/test_sdk_api/conftest.py
  13. 57
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py
  14. 118
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_delete_documents.py
  15. 89
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_download_document.py
  16. 247
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py
  17. 145
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_parse_documents.py
  18. 41
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_stop_parse_documents.py
  19. 411
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
  20. 210
    0
      test/testcases/test_sdk_api/test_file_management_within_dataset/test_upload_documents.py

+ 1
- 1
sdk/python/ragflow_sdk/modules/document.py View File

@@ -41,7 +41,7 @@ class Document(Base):
self.progress = 0.0
self.progress_msg = ""
self.process_begin_at = None
self.process_duration = 0.0
self.process_duation = 0.0
self.run = "0"
self.status = "1"
for k in list(res_dict.keys()):

+ 1
- 0
test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py View File

@@ -85,6 +85,7 @@ class TestCapability:
futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)


@pytest.mark.usefixtures("clear_datasets")

+ 1
- 0
test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py View File

@@ -93,6 +93,7 @@ class TestCapability:
futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)


class TestDatasetsDelete:

+ 1
- 0
test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py View File

@@ -49,6 +49,7 @@ class TestCapability:
futures = [executor.submit(list_datasets, api_key) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)


@pytest.mark.usefixtures("add_datasets")

+ 1
- 0
test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py View File

@@ -95,6 +95,7 @@ class TestCapability:
futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)


class TestDatasetUpdate:

+ 3
- 3
test/testcases/test_http_api/test_file_management_within_dataset/conftest.py View File

@@ -25,7 +25,7 @@ def add_document_func(request, api_key, add_dataset, ragflow_tmp_dir):
document_ids = bulk_upload_documents(api_key, dataset_id, 1, ragflow_tmp_dir)

def cleanup():
delete_documents(api_key, dataset_id, {"ids": document_ids})
delete_documents(api_key, dataset_id, {"ids": None})

request.addfinalizer(cleanup)
return dataset_id, document_ids[0]
@@ -37,7 +37,7 @@ def add_documents(request, api_key, add_dataset, ragflow_tmp_dir):
document_ids = bulk_upload_documents(api_key, dataset_id, 5, ragflow_tmp_dir)

def cleanup():
delete_documents(api_key, dataset_id, {"ids": document_ids})
delete_documents(api_key, dataset_id, {"ids": None})

request.addfinalizer(cleanup)
return dataset_id, document_ids
@@ -49,7 +49,7 @@ def add_documents_func(request, api_key, add_dataset_func, ragflow_tmp_dir):
document_ids = bulk_upload_documents(api_key, dataset_id, 3, ragflow_tmp_dir)

def cleanup():
delete_documents(api_key, dataset_id, {"ids": document_ids})
delete_documents(api_key, dataset_id, {"ids": None})

request.addfinalizer(cleanup)
return dataset_id, document_ids

+ 8
- 6
test/testcases/test_http_api/test_file_management_within_dataset/test_delete_documents.py View File

@@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed


import pytest
from common import INVALID_API_TOKEN, bulk_upload_documents, delete_documents, list_documents
@@ -148,9 +149,9 @@ class TestDocumentsDeletion:

@pytest.mark.p3
def test_concurrent_deletion(api_key, add_dataset, tmp_path):
documents_num = 100
count = 100
dataset_id = add_dataset
document_ids = bulk_upload_documents(api_key, dataset_id, documents_num, tmp_path)
document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path)

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [
@@ -160,10 +161,11 @@ def test_concurrent_deletion(api_key, add_dataset, tmp_path):
dataset_id,
{"ids": document_ids[i : i + 1]},
)
for i in range(documents_num)
for i in range(count)
]
responses = [f.result() for f in futures]
assert all(r["code"] == 0 for r in responses)
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)


@pytest.mark.p3

+ 6
- 4
test/testcases/test_http_api/test_file_management_within_dataset/test_list_documents.py View File

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from common import INVALID_API_TOKEN, list_documents
@@ -342,11 +342,13 @@ class TestDocumentsList:
@pytest.mark.p3
def test_concurrent_list(self, api_key, add_documents):
dataset_id, _ = add_documents
count = 100

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(list_documents, api_key, dataset_id) for i in range(100)]
responses = [f.result() for f in futures]
assert all(r["code"] == 0 for r in responses)
futures = [executor.submit(list_documents, api_key, dataset_id) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)

@pytest.mark.p3
def test_invalid_params(self, api_key, add_documents):

+ 8
- 7
test/testcases/test_http_api/test_file_management_within_dataset/test_parse_documents.py View File

@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from common import INVALID_API_TOKEN, bulk_upload_documents, list_documents, parse_documents
@@ -195,9 +195,9 @@ def test_concurrent_parse(api_key, add_dataset_func, tmp_path):
return False
return True

document_num = 100
count = 100
dataset_id = add_dataset_func
document_ids = bulk_upload_documents(api_key, dataset_id, document_num, tmp_path)
document_ids = bulk_upload_documents(api_key, dataset_id, count, tmp_path)

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [
@@ -207,11 +207,12 @@ def test_concurrent_parse(api_key, add_dataset_func, tmp_path):
dataset_id,
{"document_ids": document_ids[i : i + 1]},
)
for i in range(document_num)
for i in range(count)
]
responses = [f.result() for f in futures]
assert all(r["code"] == 0 for r in responses)
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)

condition(api_key, dataset_id, document_num)
condition(api_key, dataset_id, count)

validate_document_details(api_key, dataset_id, document_ids)

+ 1
- 0
test/testcases/test_http_api/test_file_management_within_dataset/test_upload_documents.py View File

@@ -213,6 +213,7 @@ class TestDocumentsUpload:
futures = [executor.submit(upload_documents, api_key, dataset_id, fps[i : i + 1]) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
assert all(futures.result()["code"] == 0 for futures in futures)

res = list_datasets(api_key, {"id": dataset_id})
assert res["data"][0]["document_count"] == count

+ 19
- 1
test/testcases/test_sdk_api/common.py View File

@@ -14,10 +14,28 @@
# limitations under the License.
#

from pathlib import Path

from ragflow_sdk import DataSet, Document, RAGFlow
from utils.file_utils import create_txt_file


# DATASET MANAGEMENT
def batch_create_datasets(client, num):
def batch_create_datasets(client: RAGFlow, num: int) -> list[DataSet]:
datasets = []
for i in range(num):
dataset = client.create_dataset(name=f"dataset_{i}")
datasets.append(dataset)
return datasets


# FILE MANAGEMENT WITHIN DATASET
def bulk_upload_documents(dataset: DataSet, num: int, tmp_path: Path) -> list[Document]:
document_infos = []
for i in range(num):
fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt")
with fp.open("rb") as f:
blob = f.read()
document_infos.append({"display_name": fp.name, "blob": blob})

return dataset.upload_documents(document_infos)

+ 82
- 5
test/testcases/test_sdk_api/conftest.py View File

@@ -14,32 +14,109 @@
# limitations under the License.
#

from pathlib import Path

import pytest
from common import (
batch_create_datasets,
bulk_upload_documents,
)
from configs import HOST_ADDRESS, VERSION
from ragflow_sdk import RAGFlow
from pytest import FixtureRequest
from ragflow_sdk import DataSet, RAGFlow
from utils import wait_for
from utils.file_utils import (
create_docx_file,
create_eml_file,
create_excel_file,
create_html_file,
create_image_file,
create_json_file,
create_md_file,
create_pdf_file,
create_ppt_file,
create_txt_file,
)


@wait_for(30, 1, "Document parsing timeout")
def condition(_dataset: DataSet):
documents = DataSet.list_documents(page_size=1000)
for document in documents:
if document.run != "DONE":
return False
return True


@pytest.fixture
def generate_test_files(request, tmp_path):
file_creators = {
"docx": (tmp_path / "ragflow_test.docx", create_docx_file),
"excel": (tmp_path / "ragflow_test.xlsx", create_excel_file),
"ppt": (tmp_path / "ragflow_test.pptx", create_ppt_file),
"image": (tmp_path / "ragflow_test.png", create_image_file),
"pdf": (tmp_path / "ragflow_test.pdf", create_pdf_file),
"txt": (tmp_path / "ragflow_test.txt", create_txt_file),
"md": (tmp_path / "ragflow_test.md", create_md_file),
"json": (tmp_path / "ragflow_test.json", create_json_file),
"eml": (tmp_path / "ragflow_test.eml", create_eml_file),
"html": (tmp_path / "ragflow_test.html", create_html_file),
}

files = {}
for file_type, (file_path, creator_func) in file_creators.items():
if request.param in ["", file_type]:
creator_func(file_path)
files[file_type] = file_path
return files


@pytest.fixture(scope="class")
def ragflow_tmp_dir(request, tmp_path_factory) -> Path:
class_name = request.cls.__name__
return tmp_path_factory.mktemp(class_name)


@pytest.fixture(scope="session")
def client(token):
def client(token) -> RAGFlow:
return RAGFlow(api_key=token, base_url=HOST_ADDRESS, version=VERSION)


@pytest.fixture(scope="function")
def clear_datasets(request, client):
def clear_datasets(request: FixtureRequest, client: RAGFlow):
def cleanup():
client.delete_datasets(ids=None)

request.addfinalizer(cleanup)


@pytest.fixture(scope="function")
def add_dataset_func(request, client):
@pytest.fixture(scope="class")
def add_dataset(request: FixtureRequest, client: RAGFlow):
def cleanup():
client.delete_datasets(ids=None)

request.addfinalizer(cleanup)

dataset_ids = batch_create_datasets(client, 1)
return dataset_ids[0]


@pytest.fixture(scope="function")
def add_dataset_func(request: FixtureRequest, client: RAGFlow) -> DataSet:
def cleanup():
client.delete_datasets(ids=None)

request.addfinalizer(cleanup)
return batch_create_datasets(client, 1)[0]


@pytest.fixture(scope="class")
def add_document(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir):
dataset = add_dataset
documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir)

def cleanup():
dataset.delete_documents(ids=None)

request.addfinalizer(cleanup)
return dataset, documents[0]

+ 57
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/conftest.py View File

@@ -0,0 +1,57 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


import pytest
from common import bulk_upload_documents
from pytest import FixtureRequest
from ragflow_sdk import DataSet, Document


@pytest.fixture(scope="function")
def add_document_func(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, Document]:
dataset = add_dataset
documents = bulk_upload_documents(dataset, 1, ragflow_tmp_dir)

def cleanup():
dataset.delete_documents(ids=None)

request.addfinalizer(cleanup)
return dataset, documents[0]


@pytest.fixture(scope="class")
def add_documents(request: FixtureRequest, add_dataset: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]:
dataset = add_dataset
documents = bulk_upload_documents(dataset, 5, ragflow_tmp_dir)

def cleanup():
dataset.delete_documents(ids=None)

request.addfinalizer(cleanup)
return dataset, documents


@pytest.fixture(scope="function")
def add_documents_func(request: FixtureRequest, add_dataset_func: DataSet, ragflow_tmp_dir) -> tuple[DataSet, list[Document]]:
dataset = add_dataset_func
documents = bulk_upload_documents(dataset, 3, ragflow_tmp_dir)

def cleanup():
dataset.delete_documents(ids=None)

request.addfinalizer(cleanup)
return dataset, documents

+ 118
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_delete_documents.py View File

@@ -0,0 +1,118 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from common import bulk_upload_documents


class TestDocumentsDeletion:
@pytest.mark.p1
@pytest.mark.parametrize(
"payload, expected_message, remaining",
[
({"ids": None}, "", 0),
({"ids": []}, "", 0),
({"ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", 3),
({"ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", 3),
("not json", "must be a mapping", 3),
(lambda r: {"ids": r[:1]}, "", 2),
(lambda r: {"ids": r}, "", 0),
],
)
def test_basic_scenarios(
self,
add_documents_func,
payload,
expected_message,
remaining,
):
dataset, documents = add_documents_func
if callable(payload):
payload = payload([document.id for document in documents])

if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.delete_documents(**payload)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
dataset.delete_documents(**payload)

documents = dataset.list_documents()
assert len(documents) == remaining, str(documents)

@pytest.mark.p2
@pytest.mark.parametrize(
"payload",
[
lambda r: {"ids": ["invalid_id"] + r},
lambda r: {"ids": r[:1] + ["invalid_id"] + r[1:3]},
lambda r: {"ids": r + ["invalid_id"]},
],
)
def test_delete_partial_invalid_id(self, add_documents_func, payload):
dataset, documents = add_documents_func
payload = payload([document.id for document in documents])

with pytest.raises(Exception) as excinfo:
dataset.delete_documents(**payload)
assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value)

documents = dataset.list_documents()
assert len(documents) == 0, str(documents)

@pytest.mark.p2
def test_repeated_deletion(self, add_documents_func):
dataset, documents = add_documents_func
document_ids = [document.id for document in documents]
dataset.delete_documents(ids=document_ids)
with pytest.raises(Exception) as excinfo:
dataset.delete_documents(ids=document_ids)
assert "Documents not found" in str(excinfo.value), str(excinfo.value)

@pytest.mark.p2
def test_duplicate_deletion(self, add_documents_func):
dataset, documents = add_documents_func
document_ids = [document.id for document in documents]
dataset.delete_documents(ids=document_ids + document_ids)
assert len(dataset.list_documents()) == 0, str(dataset.list_documents())


@pytest.mark.p3
def test_concurrent_deletion(add_dataset, tmp_path):
count = 100
dataset = add_dataset
documents = bulk_upload_documents(dataset, count, tmp_path)

def delete_doc(doc_id):
dataset.delete_documents(ids=[doc_id])

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(delete_doc, doc.id) for doc in documents]

responses = list(as_completed(futures))
assert len(responses) == count, responses


@pytest.mark.p3
def test_delete_1k(add_dataset, tmp_path):
count = 1_000
dataset = add_dataset
documents = bulk_upload_documents(dataset, count, tmp_path)
assert len(dataset.list_documents(page_size=count * 2)) == count

dataset.delete_documents(ids=[doc.id for doc in documents])
assert len(dataset.list_documents()) == 0

+ 89
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_download_document.py View File

@@ -0,0 +1,89 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from common import bulk_upload_documents
from utils import compare_by_hash


@pytest.mark.p1
@pytest.mark.parametrize(
"generate_test_files",
[
"docx",
"excel",
"ppt",
"image",
"pdf",
"txt",
"md",
"json",
"eml",
"html",
],
indirect=True,
)
def test_file_type_validation(add_dataset, generate_test_files, request):
dataset = add_dataset
fp = generate_test_files[request.node.callspec.params["generate_test_files"]]
with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])

for document in documents:
with fp.with_stem("ragflow_test_download").open("wb") as f:
f.write(document.download())

assert compare_by_hash(fp, fp.with_stem("ragflow_test_download"))


class TestDocumentDownload:
@pytest.mark.p3
def test_same_file_repeat(self, add_documents, tmp_path, ragflow_tmp_dir):
num = 5
_, documents = add_documents

for i in range(num):
download_path = tmp_path / f"ragflow_test_download_{i}.txt"
with download_path.open("wb") as f:
f.write(documents[0].download())
assert compare_by_hash(ragflow_tmp_dir / "ragflow_test_upload_0.txt", download_path), f"Downloaded file {i} does not match original"


@pytest.mark.p3
def test_concurrent_download(add_dataset, tmp_path):
count = 20
dataset = add_dataset
documents = bulk_upload_documents(dataset, count, tmp_path)

def download_doc(document, i):
download_path = tmp_path / f"ragflow_test_download_{i}.txt"
with download_path.open("wb") as f:
f.write(document.download())
# assert compare_by_hash(tmp_path / f"ragflow_test_upload_{i}.txt", download_path), str(download_path)

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(download_doc, documents[i], i) for i in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses

for i in range(count):
assert compare_by_hash(
tmp_path / f"ragflow_test_upload_{i}.txt",
tmp_path / f"ragflow_test_download_{i}.txt",
)

+ 247
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_list_documents.py View File

@@ -0,0 +1,247 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest


class TestDocumentsList:
@pytest.mark.p1
def test_default(self, add_documents):
dataset, _ = add_documents
documents = dataset.list_documents()
assert len(documents) == 5, str(documents)

@pytest.mark.p1
@pytest.mark.parametrize(
"params, expected_page_size, expected_message",
[
({"page": None, "page_size": 2}, 2, "not instance of"),
({"page": 0, "page_size": 2}, 2, ""),
({"page": 2, "page_size": 2}, 2, ""),
({"page": 3, "page_size": 2}, 1, ""),
({"page": "3", "page_size": 2}, 1, "not instance of"),
pytest.param(
{"page": -1, "page_size": 2},
0,
"Invalid page number",
marks=pytest.mark.skip(reason="issues/5851"),
),
pytest.param(
{"page": "a", "page_size": 2},
0,
"Invalid page value",
marks=pytest.mark.skip(reason="issues/5851"),
),
],
)
def test_page(self, add_documents, params, expected_page_size, expected_message):
dataset, _ = add_documents
if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
documents = dataset.list_documents(**params)
assert len(documents) == expected_page_size, str(documents)

@pytest.mark.p1
@pytest.mark.parametrize(
"params, expected_page_size, expected_message",
[
({"page_size": None}, 5, "not instance of"),
({"page_size": 0}, 0, ""),
({"page_size": 1}, 1, ""),
({"page_size": 6}, 5, ""),
({"page_size": "1"}, 1, "not instance of"),
pytest.param(
{"page_size": -1},
0,
"Invalid page size",
marks=pytest.mark.skip(reason="issues/5851"),
),
pytest.param(
{"page_size": "a"},
0,
"Invalid page size value",
marks=pytest.mark.skip(reason="issues/5851"),
),
],
)
def test_page_size(self, add_documents, params, expected_page_size, expected_message):
dataset, _ = add_documents
if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
documents = dataset.list_documents(**params)
assert len(documents) == expected_page_size, str(documents)

@pytest.mark.p3
@pytest.mark.parametrize(
"params, expected_message",
[
({"orderby": None}, "not instance of"),
({"orderby": "create_time"}, ""),
({"orderby": "update_time"}, ""),
pytest.param({"orderby": "name", "desc": "False"}, "", marks=pytest.mark.skip(reason="issues/5851")),
pytest.param({"orderby": "unknown"}, "orderby should be create_time or update_time", marks=pytest.mark.skip(reason="issues/5851")),
],
)
def test_orderby(self, add_documents, params, expected_message):
dataset, _ = add_documents
if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
dataset.list_documents(**params)

@pytest.mark.p3
@pytest.mark.parametrize(
"params, expected_message",
[
({"desc": None}, "not instance of"),
({"desc": "true"}, "not instance of"),
({"desc": "True"}, "not instance of"),
({"desc": True}, ""),
pytest.param({"desc": "false"}, "", marks=pytest.mark.skip(reason="issues/5851")),
({"desc": "False"}, "not instance of"),
({"desc": False}, ""),
({"desc": "False", "orderby": "update_time"}, "not instance of"),
pytest.param({"desc": "unknown"}, "desc should be true or false", marks=pytest.mark.skip(reason="issues/5851")),
],
)
def test_desc(self, add_documents, params, expected_message):
dataset, _ = add_documents
if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
dataset.list_documents(**params)

@pytest.mark.p2
@pytest.mark.parametrize(
"params, expected_num",
[
({"keywords": None}, 5),
({"keywords": ""}, 5),
({"keywords": "0"}, 1),
({"keywords": "ragflow_test_upload"}, 5),
({"keywords": "unknown"}, 0),
],
)
def test_keywords(self, add_documents, params, expected_num):
dataset, _ = add_documents
documents = dataset.list_documents(**params)
assert len(documents) == expected_num, str(documents)

@pytest.mark.p1
@pytest.mark.parametrize(
"params, expected_num, expected_message",
[
({"name": None}, 5, ""),
({"name": ""}, 5, ""),
({"name": "ragflow_test_upload_0.txt"}, 1, ""),
({"name": "unknown.txt"}, 0, "You don't own the document unknown.txt"),
],
)
def test_name(self, add_documents, params, expected_num, expected_message):
dataset, _ = add_documents
if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
documents = dataset.list_documents(**params)
assert len(documents) == expected_num, str(documents)
if params["name"] not in [None, ""]:
assert documents[0].name == params["name"], str(documents)

@pytest.mark.p1
@pytest.mark.parametrize(
"document_id, expected_num, expected_message",
[
(None, 5, ""),
("", 5, ""),
(lambda docs: docs[0].id, 1, ""),
("unknown.txt", 0, "You don't own the document unknown.txt"),
],
)
def test_id(self, add_documents, document_id, expected_num, expected_message):
dataset, documents = add_documents
if callable(document_id):
params = {"id": document_id(documents)}
else:
params = {"id": document_id}

if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
documents = dataset.list_documents(**params)
assert len(documents) == expected_num, str(documents)
if params["id"] not in [None, ""]:
assert documents[0].id == params["id"], str(documents)

@pytest.mark.p3
@pytest.mark.parametrize(
"document_id, name, expected_num, expected_message",
[
(lambda docs: docs[0].id, "ragflow_test_upload_0.txt", 1, ""),
(lambda docs: docs[0].id, "ragflow_test_upload_1.txt", 0, ""),
(lambda docs: docs[0].id, "unknown", 0, "You don't own the document unknown"),
("invalid_id", "ragflow_test_upload_0.txt", 0, "You don't own the document invalid_id"),
],
)
def test_name_and_id(self, add_documents, document_id, name, expected_num, expected_message):
dataset, documents = add_documents
params = {"id": document_id(documents) if callable(document_id) else document_id, "name": name}

if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.list_documents(**params)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
documents = dataset.list_documents(**params)
assert len(documents) == expected_num, str(documents)

@pytest.mark.p3
def test_concurrent_list(self, add_documents):
dataset, _ = add_documents
count = 100

def list_docs():
return dataset.list_documents()

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(list_docs) for _ in range(count)]
responses = list(as_completed(futures))
assert len(responses) == count, responses
for future in futures:
docs = future.result()
assert len(docs) == 5, str(docs)

@pytest.mark.p3
def test_invalid_params(self, add_documents):
dataset, _ = add_documents
params = {"a": "b"}
with pytest.raises(TypeError) as excinfo:
dataset.list_documents(**params)
assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)

+ 145
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_parse_documents.py View File

@@ -0,0 +1,145 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from common import bulk_upload_documents
from ragflow_sdk import DataSet
from utils import wait_for


@wait_for(30, 1, "Document parsing timeout")
def condition(_dataset: DataSet, _document_ids=None):
documents = _dataset.list_documents(page_size=1000)

if _document_ids is None:
for document in documents:
if document.run != "DONE":
return False
return True

target_ids = set(_document_ids)
for document in documents:
if document.id in target_ids:
if document.run != "DONE":
return False
return True


def validate_document_details(dataset, document_ids):
documents = dataset.list_documents(page_size=1000)
for document in documents:
if document.id in document_ids:
assert document.run == "DONE"
assert len(document.process_begin_at) > 0
assert document.process_duation > 0
assert document.progress > 0
assert "Task done" in document.progress_msg


class TestDocumentsParse:
@pytest.mark.parametrize(
"payload, expected_message",
[
pytest.param(None, "AttributeError", marks=pytest.mark.skip),
pytest.param({"document_ids": []}, "`document_ids` is required", marks=pytest.mark.p1),
pytest.param({"document_ids": ["invalid_id"]}, "Documents not found: ['invalid_id']", marks=pytest.mark.p3),
pytest.param({"document_ids": ["\n!?。;!?\"'"]}, "Documents not found: ['\\n!?。;!?\"\\'']", marks=pytest.mark.p3),
pytest.param("not json", "AttributeError", marks=pytest.mark.skip),
pytest.param(lambda r: {"document_ids": r[:1]}, "", marks=pytest.mark.p1),
pytest.param(lambda r: {"document_ids": r}, "", marks=pytest.mark.p1),
],
)
def test_basic_scenarios(self, add_documents_func, payload, expected_message):
dataset, documents = add_documents_func
if callable(payload):
payload = payload([doc.id for doc in documents])

if expected_message:
with pytest.raises(Exception) as excinfo:
dataset.async_parse_documents(**payload)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
dataset.async_parse_documents(**payload)
condition(dataset, payload["document_ids"])
validate_document_details(dataset, payload["document_ids"])

@pytest.mark.parametrize(
"payload",
[
pytest.param(lambda r: {"document_ids": ["invalid_id"] + r}, marks=pytest.mark.p3),
pytest.param(lambda r: {"document_ids": r[:1] + ["invalid_id"] + r[1:3]}, marks=pytest.mark.p1),
pytest.param(lambda r: {"document_ids": r + ["invalid_id"]}, marks=pytest.mark.p3),
],
)
def test_parse_partial_invalid_document_id(self, add_documents_func, payload):
dataset, documents = add_documents_func
document_ids = [doc.id for doc in documents]
payload = payload(document_ids)

with pytest.raises(Exception) as excinfo:
dataset.async_parse_documents(**payload)
assert "Documents not found: ['invalid_id']" in str(excinfo.value), str(excinfo.value)

condition(dataset, document_ids)
validate_document_details(dataset, document_ids)

@pytest.mark.p3
def test_repeated_parse(self, add_documents_func):
dataset, documents = add_documents_func
document_ids = [doc.id for doc in documents]
dataset.async_parse_documents(document_ids=document_ids)
condition(dataset, document_ids)
dataset.async_parse_documents(document_ids=document_ids)

@pytest.mark.p3
def test_duplicate_parse(self, add_documents_func):
dataset, documents = add_documents_func
document_ids = [doc.id for doc in documents]
dataset.async_parse_documents(document_ids=document_ids + document_ids)
condition(dataset, document_ids)
validate_document_details(dataset, document_ids)


@pytest.mark.p3
def test_parse_100_files(add_dataset_func, tmp_path):
dataset = add_dataset_func
documents = bulk_upload_documents(dataset, 100, tmp_path)
document_ids = [doc.id for doc in documents]

dataset.async_parse_documents(document_ids=document_ids)
condition(dataset, document_ids)
validate_document_details(dataset, document_ids)


@pytest.mark.p3
def test_concurrent_parse(add_dataset_func, tmp_path):
count = 100
dataset = add_dataset_func
documents = bulk_upload_documents(dataset, count, tmp_path)
document_ids = [doc.id for doc in documents]

def parse_doc(doc_id):
dataset.async_parse_documents(document_ids=[doc_id])

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(parse_doc, doc.id) for doc in documents]

responses = list(as_completed(futures))
assert len(responses) == count, responses

condition(dataset, document_ids)
validate_document_details(dataset, document_ids)

+ 41
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_stop_parse_documents.py View File

@@ -0,0 +1,41 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest


def validate_document_parse_done(dataset, document_ids):
documents = dataset.list_documents(page_size=1000)
for document in documents:
if document.id in document_ids:
assert document.run == "DONE"
assert len(document.process_begin_at) > 0
assert document.process_duation > 0
assert document.progress > 0
assert "Task done" in document.progress_msg


def validate_document_parse_cancel(dataset, document_ids):
documents = dataset.list_documents(page_size=1000)
for document in documents:
assert document.run == "CANCEL"
assert len(document.process_begin_at) > 0
assert document.progress == 0.0


@pytest.mark.skip
class TestDocumentsParseStop:
pass

+ 411
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py View File

@@ -0,0 +1,411 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from configs import DOCUMENT_NAME_LIMIT
from ragflow_sdk import DataSet


class TestDocumentsUpdated:
@pytest.mark.p1
@pytest.mark.parametrize(
"name, expected_message",
[
("new_name.txt", ""),
(f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt", "The name should be less than 128 bytes"),
(0, "AttributeError"),
(None, "AttributeError"),
("", "The extension of file can't be changed"),
("ragflow_test_upload_0", "The extension of file can't be changed"),
("ragflow_test_upload_1.txt", "Duplicated document name in the same dataset"),
("RAGFLOW_TEST_UPLOAD_1.TXT", ""),
],
)
def test_name(self, add_documents, name, expected_message):
dataset, documents = add_documents
document = documents[0]

if expected_message:
with pytest.raises(Exception) as excinfo:
document.update({"name": name})
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
document.update({"name": name})
updated_doc = dataset.list_documents(id=document.id)[0]
assert updated_doc.name == name, str(updated_doc)

@pytest.mark.p3
@pytest.mark.parametrize(
"meta_fields, expected_message",
[
({"test": "test"}, ""),
("test", "meta_fields must be a dictionary"),
],
)
def test_meta_fields(self, add_documents, meta_fields, expected_message):
_, documents = add_documents
document = documents[0]

if expected_message:
with pytest.raises(Exception) as excinfo:
document.update({"meta_fields": meta_fields})
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
document.update({"meta_fields": meta_fields})

@pytest.mark.p2
@pytest.mark.parametrize(
"chunk_method, expected_message",
[
("naive", ""),
("manual", ""),
("qa", ""),
("table", ""),
("paper", ""),
("book", ""),
("laws", ""),
("presentation", ""),
("picture", ""),
("one", ""),
("knowledge_graph", ""),
("email", ""),
("tag", ""),
("", "`chunk_method` doesn't exist"),
("other_chunk_method", "`chunk_method` other_chunk_method doesn't exist"),
],
)
def test_chunk_method(self, add_documents, chunk_method, expected_message):
dataset, documents = add_documents
document = documents[0]

if expected_message:
with pytest.raises(Exception) as excinfo:
document.update({"chunk_method": chunk_method})
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
document.update({"chunk_method": chunk_method})
updated_doc = dataset.list_documents(id=document.id)[0]
assert updated_doc.chunk_method == chunk_method, str(updated_doc)

@pytest.mark.p3
@pytest.mark.parametrize(
"payload, expected_message",
[
({"chunk_count": 1}, "Can't change `chunk_count`"),
pytest.param(
{"create_date": "Fri, 14 Mar 2025 16:53:42 GMT"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"create_time": 1},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"created_by": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"dataset_id": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"id": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"location": "ragflow_test.txt"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"process_begin_at": 1},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"process_duation": 1.0},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
({"progress": 1.0}, "Can't change `progress`"),
pytest.param(
{"progress_msg": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"run": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"size": 1},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"source_type": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"thumbnail": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
({"token_count": 1}, "Can't change `token_count`"),
pytest.param(
{"type": "ragflow_test"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"update_date": "Fri, 14 Mar 2025 16:33:17 GMT"},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
pytest.param(
{"update_time": 1},
"The input parameters are invalid",
marks=pytest.mark.skip(reason="issues/6104"),
),
],
)
def test_invalid_field(self, add_documents, payload, expected_message):
_, documents = add_documents
document = documents[0]

with pytest.raises(Exception) as excinfo:
document.update(payload)
assert expected_message in str(excinfo.value), str(excinfo.value)


class TestUpdateDocumentParserConfig:
@pytest.mark.p2
@pytest.mark.parametrize(
"chunk_method, parser_config, expected_message",
[
("naive", {}, ""),
(
"naive",
{
"chunk_token_num": 128,
"layout_recognize": "DeepDOC",
"html4excel": False,
"delimiter": r"\n",
"task_page_size": 12,
"raptor": {"use_raptor": False},
},
"",
),
pytest.param(
"naive",
{"chunk_token_num": -1},
"chunk_token_num should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"chunk_token_num": 0},
"chunk_token_num should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"chunk_token_num": 100000000},
"chunk_token_num should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"chunk_token_num": 3.14},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"chunk_token_num": "1024"},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
("naive", {"layout_recognize": "DeepDOC"}, ""),
("naive", {"layout_recognize": "Naive"}, ""),
("naive", {"html4excel": True}, ""),
("naive", {"html4excel": False}, ""),
pytest.param(
"naive",
{"html4excel": 1},
"html4excel should be True or False",
marks=pytest.mark.skip(reason="issues/6098"),
),
("naive", {"delimiter": ""}, ""),
("naive", {"delimiter": "`##`"}, ""),
pytest.param(
"naive",
{"delimiter": 1},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"task_page_size": -1},
"task_page_size should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"task_page_size": 0},
"task_page_size should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"task_page_size": 100000000},
"task_page_size should be in range from 1 to 100000000",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"task_page_size": 3.14},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"task_page_size": "1024"},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
("naive", {"raptor": {"use_raptor": True}}, ""),
("naive", {"raptor": {"use_raptor": False}}, ""),
pytest.param(
"naive",
{"invalid_key": "invalid_value"},
"Abnormal 'parser_config'. Invalid key: invalid_key",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_keywords": -1},
"auto_keywords should be in range from 0 to 32",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_keywords": 32},
"auto_keywords should be in range from 0 to 32",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_keywords": 3.14},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_keywords": "1024"},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_questions": -1},
"auto_questions should be in range from 0 to 10",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_questions": 10},
"auto_questions should be in range from 0 to 10",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_questions": 3.14},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"auto_questions": "1024"},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"topn_tags": -1},
"topn_tags should be in range from 0 to 10",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"topn_tags": 10},
"topn_tags should be in range from 0 to 10",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"topn_tags": 3.14},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
pytest.param(
"naive",
{"topn_tags": "1024"},
"",
marks=pytest.mark.skip(reason="issues/6098"),
),
],
)
def test_parser_config(self, client, add_documents, chunk_method, parser_config, expected_message):
dataset, documents = add_documents
document = documents[0]
from operator import attrgetter

update_data = {"chunk_method": chunk_method, "parser_config": parser_config}

if expected_message:
with pytest.raises(Exception) as excinfo:
document.update(update_data)
assert expected_message in str(excinfo.value), str(excinfo.value)
else:
document.update(update_data)
updated_doc = dataset.list_documents(id=document.id)[0]
if parser_config:
for k, v in parser_config.items():
if isinstance(v, dict):
for kk, vv in v.items():
assert attrgetter(f"{k}.{kk}")(updated_doc.parser_config) == vv, str(updated_doc)
else:
assert attrgetter(k)(updated_doc.parser_config) == v, str(updated_doc)
else:
expected_config = DataSet.ParserConfig(
client,
{
"chunk_token_num": 128,
"delimiter": r"\n",
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
},
)
assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)

+ 210
- 0
test/testcases/test_sdk_api/test_file_management_within_dataset/test_upload_documents.py View File

@@ -0,0 +1,210 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import string
from concurrent.futures import ThreadPoolExecutor, as_completed

import pytest
from configs import DOCUMENT_NAME_LIMIT
from utils.file_utils import create_txt_file


class TestDocumentsUpload:
@pytest.mark.p1
def test_valid_single_upload(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = create_txt_file(tmp_path / "ragflow_test.txt")
with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
for document in documents:
assert document.dataset_id == dataset.id, str(document)
assert document.name == fp.name, str(document)

@pytest.mark.p1
@pytest.mark.parametrize(
"generate_test_files",
[
"docx",
"excel",
"ppt",
"image",
"pdf",
"txt",
"md",
"json",
"eml",
"html",
],
indirect=True,
)
def test_file_type_validation(self, add_dataset_func, generate_test_files, request):
dataset = add_dataset_func
fp = generate_test_files[request.node.callspec.params["generate_test_files"]]

with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
for document in documents:
assert document.dataset_id == dataset.id, str(document)
assert document.name == fp.name, str(document)

@pytest.mark.p2
@pytest.mark.parametrize(
"file_type",
["exe", "unknown"],
)
def test_unsupported_file_type(self, add_dataset_func, tmp_path, file_type):
dataset = add_dataset_func
fp = tmp_path / f"ragflow_test.{file_type}"
fp.touch()

with fp.open("rb") as f:
blob = f.read()

with pytest.raises(Exception) as excinfo:
dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
assert str(excinfo.value) == f"ragflow_test.{file_type}: This type of file has not been supported yet!", str(excinfo.value)

@pytest.mark.p2
def test_missing_file(self, add_dataset_func):
dataset = add_dataset_func
with pytest.raises(Exception) as excinfo:
dataset.upload_documents([])
assert str(excinfo.value) == "No file part!", str(excinfo.value)

@pytest.mark.p3
def test_empty_file(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = tmp_path / "empty.txt"
fp.touch()

with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
for document in documents:
assert document.size == 0, str(document)

@pytest.mark.p3
def test_filename_empty(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = create_txt_file(tmp_path / "ragflow_test.txt")

with fp.open("rb") as f:
blob = f.read()

with pytest.raises(Exception) as excinfo:
dataset.upload_documents([{"display_name": "", "blob": blob}])
assert str(excinfo.value) == "No file selected!", str(excinfo.value)

@pytest.mark.p2
def test_filename_exceeds_max_length(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt")

with fp.open("rb") as f:
blob = f.read()

with pytest.raises(Exception) as excinfo:
dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
assert str(excinfo.value) == "File name should be less than 128 bytes.", str(excinfo.value)

@pytest.mark.p2
def test_duplicate_files(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = create_txt_file(tmp_path / "ragflow_test.txt")

with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}, {"display_name": fp.name, "blob": blob}])

assert len(documents) == 2, str(documents)
for i, document in enumerate(documents):
assert document.dataset_id == dataset.id, str(document)
expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}"
assert document.name == expected_name, str(document)

@pytest.mark.p2
def test_same_file_repeat(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
fp = create_txt_file(tmp_path / "ragflow_test.txt")

with fp.open("rb") as f:
blob = f.read()

for i in range(3):
documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
assert len(documents) == 1, str(documents)
document = documents[0]
assert document.dataset_id == dataset.id, str(document)
expected_name = fp.name if i == 0 else f"{fp.stem}({i}){fp.suffix}"
assert document.name == expected_name, str(document)

@pytest.mark.p3
def test_filename_special_characters(self, add_dataset_func, tmp_path):
dataset = add_dataset_func
illegal_chars = '<>:"/\\|?*'
translation_table = str.maketrans({char: "_" for char in illegal_chars})
safe_filename = string.punctuation.translate(translation_table)
fp = tmp_path / f"{safe_filename}.txt"
fp.write_text("Sample text content")

with fp.open("rb") as f:
blob = f.read()

documents = dataset.upload_documents([{"display_name": fp.name, "blob": blob}])
assert len(documents) == 1, str(documents)
document = documents[0]
assert document.dataset_id == dataset.id, str(document)
assert document.name == fp.name, str(document)

@pytest.mark.p1
def test_multiple_files(self, client, add_dataset_func, tmp_path):
dataset = add_dataset_func
expected_document_count = 20
document_infos = []
for i in range(expected_document_count):
fp = create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt")
with fp.open("rb") as f:
blob = f.read()
document_infos.append({"display_name": fp.name, "blob": blob})
documents = dataset.upload_documents(document_infos)
assert len(documents) == expected_document_count, str(documents)

retrieved_dataset = client.get_dataset(name=dataset.name)
assert retrieved_dataset.document_count == expected_document_count, str(retrieved_dataset)

@pytest.mark.p3
def test_concurrent_upload(self, client, add_dataset_func, tmp_path):
dataset = add_dataset_func
count = 20
fps = [create_txt_file(tmp_path / f"ragflow_test_{i}.txt") for i in range(count)]

def upload_file(fp):
with fp.open("rb") as f:
blob = f.read()
return dataset.upload_documents([{"display_name": fp.name, "blob": blob}])

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(upload_file, fp) for fp in fps]
responses = list(as_completed(futures))
assert len(responses) == count, responses

retrieved_dataset = client.get_dataset(name=dataset.name)
assert retrieved_dataset.document_count == count, str(retrieved_dataset)

Loading…
Cancel
Save