### What problem does this PR solve?

Add SDK dataset test cases.

### Type of change

- [x] Add test case

4 months ago · ee52000870
--- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
@@ -80,10 +80,11 @@ class TestCapability:

    @pytest.mark.p3
    def test_create_dataset_concurrent(self, api_key):
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(100)]
            futures = [executor.submit(create_dataset, api_key, {"name": f"dataset_{i}"}) for i in range(count)]
        responses = list(as_completed(futures))
        assert all(r["code"] == 0 for r in responses), responses
        assert len(responses) == count, responses


@pytest.mark.usefixtures("clear_datasets")
--- a/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_delete_datasets.py
@@ -14,7 +14,7 @@
 #  limitations under the License.
 #
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import (
@@ -86,13 +86,13 @@ class TestCapability:

    @pytest.mark.p3
    def test_concurrent_deletion(self, api_key):
        dataset_num = 1_000
        ids = batch_create_datasets(api_key, dataset_num)
        count = 1_000
        ids = batch_create_datasets(api_key, count)

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(dataset_num)]
        responses = [f.result() for f in futures]
        assert all(r["code"] == 0 for r in responses), responses
            futures = [executor.submit(delete_datasets, api_key, {"ids": ids[i : i + 1]}) for i in range(count)]
        responses = list(as_completed(futures))
        assert len(responses) == count, responses


 class TestDatasetsDelete:
--- a/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_list_datasets.py
@@ -14,7 +14,7 @@
 #  limitations under the License.
 #
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import INVALID_API_TOKEN, list_datasets
@@ -44,10 +44,11 @@ class TestAuthorization:
 class TestCapability:
    @pytest.mark.p3
    def test_concurrent_list(self, api_key):
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(list_datasets, api_key) for i in range(100)]
        responses = [f.result() for f in futures]
        assert all(r["code"] == 0 for r in responses), responses
            futures = [executor.submit(list_datasets, api_key) for i in range(count)]
        responses = list(as_completed(futures))
        assert len(responses) == count, responses


@pytest.mark.usefixtures("add_datasets")
@@ -173,7 +174,7 @@ class TestDatasetsList:

    @pytest.mark.p3
    def test_orderby_none(self, api_key):
        params = {"order_by": None}
        params = {"orderby": None}
        res = list_datasets(api_key, params)
        assert res["code"] == 0, res
        assert is_sorted(res["data"], "create_time", True), res
--- a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
@@ -14,7 +14,7 @@
 #  limitations under the License.
 #
 import uuid
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import DATASET_NAME_LIMIT, INVALID_API_TOKEN, list_datasets, update_dataset
@@ -90,10 +90,11 @@ class TestCapability:
    @pytest.mark.p3
    def test_update_dateset_concurrent(self, api_key, add_dataset_func):
        dataset_id = add_dataset_func
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(100)]
        responses = [f.result() for f in futures]
        assert all(r["code"] == 0 for r in responses), responses
            futures = [executor.submit(update_dataset, api_key, dataset_id, {"name": f"dataset_{i}"}) for i in range(count)]
        responses = list(as_completed(futures))
        assert len(responses) == count, responses


 class TestDatasetUpdate:
@@ -811,10 +812,4 @@ class TestDatasetUpdate:
        assert res["data"][0]["permission"] == original_data["permission"], res
        assert res["data"][0]["chunk_method"] == original_data["chunk_method"], res
        assert res["data"][0]["pagerank"] == original_data["pagerank"], res
        assert res["data"][0]["parser_config"] == {
            "chunk_token_num": 128,
            "delimiter": r"\n",
            "html4excel": False,
            "layout_recognize": "DeepDOC",
            "raptor": {"use_raptor": False},
        }, res
        assert res["data"][0]["parser_config"] == original_data["parser_config"], res
--- a/test/testcases/test_sdk_api/common.py
+++ b/test/testcases/test_sdk_api/common.py
@@ -0,0 +1,23 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #

 # DATASET MANAGEMENT
 def batch_create_datasets(client, num):
    """Create ``num`` datasets named ``dataset_0`` .. ``dataset_{num - 1}``.

    Every dataset is created through ``client.create_dataset`` and the
    returned objects are collected in creation order.
    """
    return [client.create_dataset(name=f"dataset_{i}") for i in range(num)]
--- a/test/testcases/test_sdk_api/conftest.py
+++ b/test/testcases/test_sdk_api/conftest.py
@@ -0,0 +1,45 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #

 import pytest
 from common import (
    batch_create_datasets,
 )
 from configs import HOST_ADDRESS, VERSION
 from ragflow_sdk import RAGFlow


@pytest.fixture(scope="session")
 def client(token):
    """Build the SDK client used by the tests.

    The client authenticates with ``token`` and targets ``HOST_ADDRESS``
    using API version ``VERSION``.
    """
    sdk_client = RAGFlow(api_key=token, base_url=HOST_ADDRESS, version=VERSION)
    return sdk_client


@pytest.fixture(scope="function")
 def clear_datasets(request, client):
    """Register a teardown that calls ``client.delete_datasets(ids=None)``.

    Nothing is created up front; the fixture only schedules the cleanup.
    NOTE(review): ``ids=None`` presumably deletes every dataset - confirm
    against the SDK.
    """
    request.addfinalizer(lambda: client.delete_datasets(ids=None))


@pytest.fixture(scope="function")
 def add_dataset_func(request, client):
    """Provide a single freshly created dataset and schedule its cleanup.

    The teardown is registered before the dataset is created.
    """
    request.addfinalizer(lambda: client.delete_datasets(ids=None))

    created = batch_create_datasets(client, 1)
    return created[0]
--- a/test/testcases/test_sdk_api/test_dataset_mangement/conftest.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/conftest.py
@@ -0,0 +1,39 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #


 import pytest
 from common import batch_create_datasets


@pytest.fixture(scope="class")
 def add_datasets(client, request):
    """Provide five freshly created datasets and schedule their cleanup.

    The teardown is registered before the datasets are created.
    """
    request.addfinalizer(lambda: client.delete_datasets(ids=None))

    created = batch_create_datasets(client, 5)
    return created


@pytest.fixture(scope="function")
 def add_datasets_func(client, request):
    """Provide three freshly created datasets and schedule their cleanup.

    The teardown is registered before the datasets are created.
    """
    request.addfinalizer(lambda: client.delete_datasets(ids=None))

    created = batch_create_datasets(client, 3)
    return created
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py
@@ -0,0 +1,698 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from operator import attrgetter

 import pytest
 from configs import DATASET_NAME_LIMIT, HOST_ADDRESS, INVALID_API_TOKEN
 from hypothesis import example, given, settings
 from ragflow_sdk import DataSet, RAGFlow
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names


@pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
    """Dataset creation must fail when the API key is missing or invalid."""

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "invalid_auth, expected_message",
        [
            (None, "Authentication error: API key is invalid!"),
            (INVALID_API_TOKEN, "Authentication error: API key is invalid!"),
        ],
        ids=["empty_auth", "invalid_api_token"],
    )
    def test_auth_invalid(self, invalid_auth, expected_message):
        """A client built with a bad key raises the expected auth error."""
        unauthorized = RAGFlow(invalid_auth, HOST_ADDRESS)
        with pytest.raises(Exception) as err:
            unauthorized.create_dataset(name="auth_test")
        assert str(err.value) == expected_message


@pytest.mark.usefixtures("clear_datasets")
 class TestCapability:
    """Volume and concurrency checks for dataset creation through the SDK."""

    @pytest.mark.p3
    def test_create_dataset_1k(self, client):
        """Create 1,000 datasets sequentially and confirm all of them are listed."""
        count = 1_000
        for i in range(count):
            client.create_dataset(name=f"dataset_{i}")
        # page_size exceeds ``count`` so one page can hold every dataset.
        assert len(client.list_datasets(page_size=2000)) == count

    @pytest.mark.p3
    def test_create_dataset_concurrent(self, client):
        """Create datasets from five worker threads; every call must succeed."""
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(client.create_dataset, name=f"dataset_{i}") for i in range(count)]
        # ``as_completed`` yields Future objects, not return values. Calling
        # ``result()`` re-raises any exception from the worker, so a failed
        # creation fails the test instead of being counted as a response.
        responses = [future.result() for future in as_completed(futures)]
        assert len(responses) == count, responses


@pytest.mark.usefixtures("clear_datasets")
 class TestDatasetCreate:
    @pytest.mark.p1
    @given(name=valid_names())
    @example("a" * 128)
    @settings(max_examples=20)
    def test_name(self, client, name):
        """A valid name is reported unchanged on the created dataset."""
        created = client.create_dataset(name=name)
        assert created.name == name, str(created)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, expected_message",
        [
            ("", "String should have at least 1 character"),
            (" ", "String should have at least 1 character"),
            ("a" * (DATASET_NAME_LIMIT + 1), "String should have at most 128 characters"),
            (0, "not instance of"),
            (None, "not instance of"),
        ],
        ids=["empty_name", "space_name", "too_long_name", "invalid_name", "None_name"],
    )
    def test_name_invalid(self, client, name, expected_message):
        """An invalid name raises an error containing ``expected_message``."""
        with pytest.raises(Exception) as err:
            client.create_dataset(name=name)
        message = str(err.value)
        assert expected_message in message, message

    @pytest.mark.p3
    def test_name_duplicated(self, client):
        """Creating a second dataset with an existing name raises an error."""
        name = "duplicated_name"
        client.create_dataset(name=name)

        with pytest.raises(Exception) as err:
            client.create_dataset(name=name)
        message = str(err.value)
        assert message == f"Dataset name '{name}' already exists", message

    @pytest.mark.p3
    def test_name_case_insensitive(self, client):
        """Names that differ only in letter case are rejected as duplicates."""
        name = "CaseInsensitive"
        client.create_dataset(name=name.upper())

        with pytest.raises(Exception) as err:
            client.create_dataset(name=name.lower())
        message = str(err.value)
        assert message == f"Dataset name '{name.lower()}' already exists", message

    @pytest.mark.p2
    def test_avatar(self, client, tmp_path):
        """A ``data:image/png;base64,...`` avatar is accepted without raising."""
        fn = create_image_file(tmp_path / "ragflow_test.png")
        avatar = f"data:image/png;base64,{encode_avatar(fn)}"
        client.create_dataset(name="avatar", avatar=avatar)

    @pytest.mark.p2
    def test_avatar_exceeds_limit_length(self, client):
        """A 65,536-character avatar string is rejected as too long."""
        oversized = "a" * 65536
        with pytest.raises(Exception) as err:
            client.create_dataset(name="avatar_exceeds_limit_length", avatar=oversized)
        message = str(err.value)
        assert "String should have at most 65535 characters" in message, message

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "name, prefix, expected_message",
        [
            ("empty_prefix", "", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
            ("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
            ("unsupported_mine_type", "invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"),
            ("invalid_mine_type", "data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"),
        ],
        ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"],
    )
    def test_avatar_invalid_prefix(self, client, tmp_path, name, prefix, expected_message):
        """An avatar with a malformed prefix raises an error containing ``expected_message``."""
        fn = create_image_file(tmp_path / "ragflow_test.png")
        avatar = f"{prefix}{encode_avatar(fn)}"
        with pytest.raises(Exception) as err:
            client.create_dataset(name=name, avatar=avatar)
        message = str(err.value)
        assert expected_message in message, message

    @pytest.mark.p3
    def test_avatar_unset(self, client):
        """A dataset created without an avatar reports ``avatar`` as None."""
        created = client.create_dataset(name="avatar_unset")
        assert created.avatar is None, str(created)

    @pytest.mark.p2
    def test_description(self, client):
        """The supplied description is reported back on the created dataset."""
        created = client.create_dataset(name="description", description="description")
        assert created.description == "description", str(created)

    @pytest.mark.p2
    def test_description_exceeds_limit_length(self, client):
        """A 65,536-character description is rejected as too long."""
        oversized = "a" * 65536
        with pytest.raises(Exception) as err:
            client.create_dataset(name="description_exceeds_limit_length", description=oversized)
        message = str(err.value)
        assert "String should have at most 65535 characters" in message, message

    @pytest.mark.p3
    def test_description_unset(self, client):
        """A dataset created without a description reports ``description`` as None."""
        created = client.create_dataset(name="description_unset")
        assert created.description is None, str(created)

    @pytest.mark.p3
    def test_description_none(self, client):
        """Passing ``description=None`` explicitly is accepted and reported as None."""
        created = client.create_dataset(name="description_none", description=None)
        assert created.description is None, str(created)

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "name, embedding_model",
        [
            ("BAAI/bge-large-zh-v1.5@BAAI", "BAAI/bge-large-zh-v1.5@BAAI"),
            ("maidalun1020/bce-embedding-base_v1@Youdao", "maidalun1020/bce-embedding-base_v1@Youdao"),
            ("embedding-3@ZHIPU-AI", "embedding-3@ZHIPU-AI"),
        ],
        ids=["builtin_baai", "builtin_youdao", "tenant_zhipu"],
    )
    def test_embedding_model(self, client, name, embedding_model):
        """The requested embedding model is reported back on the created dataset."""
        created = client.create_dataset(name=name, embedding_model=embedding_model)
        assert created.embedding_model == embedding_model, str(created)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, embedding_model",
        [
            ("unknown_llm_name", "unknown@ZHIPU-AI"),
            ("unknown_llm_factory", "embedding-3@unknown"),
            ("tenant_no_auth_default_tenant_llm", "text-embedding-v3@Tongyi-Qianwen"),
            ("tenant_no_auth", "text-embedding-3-small@OpenAI"),
        ],
        ids=["unknown_llm_name", "unknown_llm_factory", "tenant_no_auth_default_tenant_llm", "tenant_no_auth"],
    )
    def test_embedding_model_invalid(self, client, name, embedding_model):
        """Unknown or unauthorized embedding models are rejected with a specific error.

        Cases whose ``name`` contains ``tenant_no_auth`` expect the
        "Unauthorized model" message; all others expect "Unsupported model".
        """
        if "tenant_no_auth" in name:
            expected = f"Unauthorized model: <{embedding_model}>"
        else:
            expected = f"Unsupported model: <{embedding_model}>"

        with pytest.raises(Exception) as err:
            client.create_dataset(name=name, embedding_model=embedding_model)
        message = str(err.value)
        assert message == expected, message

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, embedding_model",
        [
            ("missing_at", "BAAI/bge-large-zh-v1.5BAAI"),
            ("missing_model_name", "@BAAI"),
            ("missing_provider", "BAAI/bge-large-zh-v1.5@"),
            ("whitespace_only_model_name", " @BAAI"),
            ("whitespace_only_provider", "BAAI/bge-large-zh-v1.5@ "),
        ],
        ids=["missing_at", "empty_model_name", "empty_provider", "whitespace_only_model_name", "whitespace_only_provider"],
    )
    def test_embedding_model_format(self, client, name, embedding_model):
        """Malformed ``<model_name>@<provider>`` identifiers are rejected.

        The ``missing_at`` case expects the format error; every other case
        expects the non-empty-parts error.
        """
        if name == "missing_at":
            expected = "Embedding model identifier must follow <model_name>@<provider> format"
        else:
            expected = "Both model_name and provider must be non-empty strings"

        with pytest.raises(Exception) as err:
            client.create_dataset(name=name, embedding_model=embedding_model)
        message = str(err.value)
        assert expected in message, message

    @pytest.mark.p2
    def test_embedding_model_unset(self, client):
        """Without an explicit model the dataset reports ``BAAI/bge-large-zh-v1.5@BAAI``."""
        created = client.create_dataset(name="embedding_model_unset")
        assert created.embedding_model == "BAAI/bge-large-zh-v1.5@BAAI", str(created)

    @pytest.mark.p2
    def test_embedding_model_none(self, client):
        """Passing ``embedding_model=None`` explicitly is rejected as a non-string."""
        with pytest.raises(Exception) as err:
            client.create_dataset(name="embedding_model_none", embedding_model=None)
        message = str(err.value)
        assert "Input should be a valid string" in message, message

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "name, permission",
        [
            ("me", "me"),
            ("team", "team"),
            ("me_upercase", "ME"),
            ("team_upercase", "TEAM"),
            ("whitespace", " ME "),
        ],
        ids=["me", "team", "me_upercase", "team_upercase", "whitespace"],
    )
    def test_permission(self, client, name, permission):
        """The reported permission equals the input lower-cased and stripped."""
        expected = permission.lower().strip()
        created = client.create_dataset(name=name, permission=permission)
        assert created.permission == expected, str(created)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, permission",
        [
            ("empty", ""),
            ("unknown", "unknown"),
        ],
        ids=["empty", "unknown"],
    )
    def test_permission_invalid(self, client, name, permission):
        """Permissions other than ``me``/``team`` are rejected."""
        payload = {"name": name, "permission": permission}
        with pytest.raises(Exception) as excinfo:
            client.create_dataset(**payload)
        # Include the actual error text on failure, matching the sibling tests.
        assert "Input should be 'me' or 'team'" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_permission_unset(self, client):
        """Without an explicit permission the dataset reports ``me``."""
        created = client.create_dataset(name="permission_unset")
        assert created.permission == "me", str(created)

    @pytest.mark.p3
    def test_permission_none(self, client):
        """Passing ``permission=None`` explicitly raises a type-check error."""
        with pytest.raises(Exception) as err:
            client.create_dataset(name="permission_none", permission=None)
        message = str(err.value)
        assert "not instance of" in message, message

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "name, chunk_method",
        [
            ("naive", "naive"),
            ("book", "book"),
            ("email", "email"),
            ("laws", "laws"),
            ("manual", "manual"),
            ("one", "one"),
            ("paper", "paper"),
            ("picture", "picture"),
            ("presentation", "presentation"),
            ("qa", "qa"),
            ("table", "table"),
            ("tag", "tag"),
        ],
        ids=["naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag"],
    )
    def test_chunk_method(self, client, name, chunk_method):
        """The requested chunk method is reported back on the created dataset."""
        created = client.create_dataset(name=name, chunk_method=chunk_method)
        assert created.chunk_method == chunk_method, str(created)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, chunk_method",
        [
            ("empty", ""),
            ("unknown", "unknown"),
        ],
        ids=["empty", "unknown"],
    )
    def test_chunk_method_invalid(self, client, name, chunk_method):
        """An unrecognised chunk method is rejected with the list of allowed values."""
        expected = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table' or 'tag'"
        with pytest.raises(Exception) as err:
            client.create_dataset(name=name, chunk_method=chunk_method)
        message = str(err.value)
        assert expected in message, message

    @pytest.mark.p2
    def test_chunk_method_unset(self, client):
        """Without an explicit chunk method the dataset reports ``naive``."""
        created = client.create_dataset(name="chunk_method_unset")
        assert created.chunk_method == "naive", str(created)

    @pytest.mark.p3
    def test_chunk_method_none(self, client):
        """Passing ``chunk_method=None`` explicitly raises a type-check error."""
        with pytest.raises(Exception) as err:
            client.create_dataset(name="chunk_method_none", chunk_method=None)
        message = str(err.value)
        assert "not instance of" in message, message

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, pagerank",
        [
            ("pagerank_min", 0),
            ("pagerank_mid", 50),
            ("pagerank_max", 100),
        ],
        ids=["min", "mid", "max"],
    )
    def test_pagerank(self, client, name, pagerank):
        """The requested pagerank is reported back on the created dataset."""
        created = client.create_dataset(name=name, pagerank=pagerank)
        assert created.pagerank == pagerank, str(created)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "name, pagerank, expected_message",
        [
            ("pagerank_min_limit", -1, "Input should be greater than or equal to 0"),
            ("pagerank_max_limit", 101, "Input should be less than or equal to 100"),
        ],
        ids=["min_limit", "max_limit"],
    )
    def test_pagerank_invalid(self, client, name, pagerank, expected_message):
        """An out-of-range pagerank raises an error containing ``expected_message``."""
        with pytest.raises(Exception) as err:
            client.create_dataset(name=name, pagerank=pagerank)
        message = str(err.value)
        assert expected_message in message, message

    @pytest.mark.p3
    def test_pagerank_unset(self, client):
        """Without an explicit pagerank the dataset reports ``0``."""
        created = client.create_dataset(name="pagerank_unset")
        assert created.pagerank == 0, str(created)

    @pytest.mark.p3
    def test_pagerank_none(self, client):
        """Passing ``pagerank=None`` explicitly raises a type-check error."""
        # The dataset name follows the ``<field>_none`` convention used by the
        # other ``*_none`` tests; it was previously a copy of the ``_unset`` name.
        payload = {"name": "pagerank_none", "pagerank": None}
        with pytest.raises(Exception) as excinfo:
            client.create_dataset(**payload)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "name, parser_config",
        [
            ("auto_keywords_min", {"auto_keywords": 0}),
            ("auto_keywords_mid", {"auto_keywords": 16}),
            ("auto_keywords_max", {"auto_keywords": 32}),
            ("auto_questions_min", {"auto_questions": 0}),
            ("auto_questions_mid", {"auto_questions": 5}),
            ("auto_questions_max", {"auto_questions": 10}),
            ("chunk_token_num_min", {"chunk_token_num": 1}),
            ("chunk_token_num_mid", {"chunk_token_num": 1024}),
            ("chunk_token_num_max", {"chunk_token_num": 2048}),
            ("delimiter", {"delimiter": "\n"}),
            ("delimiter_space", {"delimiter": " "}),
            ("html4excel_true", {"html4excel": True}),
            ("html4excel_false", {"html4excel": False}),
            ("layout_recognize_DeepDOC", {"layout_recognize": "DeepDOC"}),
            ("layout_recognize_navie", {"layout_recognize": "Plain Text"}),
            ("tag_kb_ids", {"tag_kb_ids": ["1", "2"]}),
            ("topn_tags_min", {"topn_tags": 1}),
            ("topn_tags_mid", {"topn_tags": 5}),
            ("topn_tags_max", {"topn_tags": 10}),
            ("filename_embd_weight_min", {"filename_embd_weight": 0.1}),
            ("filename_embd_weight_mid", {"filename_embd_weight": 0.5}),
            ("filename_embd_weight_max", {"filename_embd_weight": 1.0}),
            ("task_page_size_min", {"task_page_size": 1}),
            ("task_page_size_None", {"task_page_size": None}),
            ("pages", {"pages": [[1, 100]]}),
            ("pages_none", {"pages": None}),
            ("graphrag_true", {"graphrag": {"use_graphrag": True}}),
            ("graphrag_false", {"graphrag": {"use_graphrag": False}}),
            ("graphrag_entity_types", {"graphrag": {"entity_types": ["age", "sex", "height", "weight"]}}),
            ("graphrag_method_general", {"graphrag": {"method": "general"}}),
            ("graphrag_method_light", {"graphrag": {"method": "light"}}),
            ("graphrag_community_true", {"graphrag": {"community": True}}),
            ("graphrag_community_false", {"graphrag": {"community": False}}),
            ("graphrag_resolution_true", {"graphrag": {"resolution": True}}),
            ("graphrag_resolution_false", {"graphrag": {"resolution": False}}),
            ("raptor_true", {"raptor": {"use_raptor": True}}),
            ("raptor_false", {"raptor": {"use_raptor": False}}),
            ("raptor_prompt", {"raptor": {"prompt": "Who are you?"}}),
            ("raptor_max_token_min", {"raptor": {"max_token": 1}}),
            ("raptor_max_token_mid", {"raptor": {"max_token": 1024}}),
            ("raptor_max_token_max", {"raptor": {"max_token": 2048}}),
            ("raptor_threshold_min", {"raptor": {"threshold": 0.0}}),
            ("raptor_threshold_mid", {"raptor": {"threshold": 0.5}}),
            ("raptor_threshold_max", {"raptor": {"threshold": 1.0}}),
            ("raptor_max_cluster_min", {"raptor": {"max_cluster": 1}}),
            ("raptor_max_cluster_mid", {"raptor": {"max_cluster": 512}}),
            ("raptor_max_cluster_max", {"raptor": {"max_cluster": 1024}}),
            ("raptor_random_seed_min", {"raptor": {"random_seed": 0}}),
        ],
        ids=[
            "auto_keywords_min",
            "auto_keywords_mid",
            "auto_keywords_max",
            "auto_questions_min",
            "auto_questions_mid",
            "auto_questions_max",
            "chunk_token_num_min",
            "chunk_token_num_mid",
            "chunk_token_num_max",
            "delimiter",
            "delimiter_space",
            "html4excel_true",
            "html4excel_false",
            "layout_recognize_DeepDOC",
            "layout_recognize_navie",
            "tag_kb_ids",
            "topn_tags_min",
            "topn_tags_mid",
            "topn_tags_max",
            "filename_embd_weight_min",
            "filename_embd_weight_mid",
            "filename_embd_weight_max",
            "task_page_size_min",
            "task_page_size_None",
            "pages",
            "pages_none",
            "graphrag_true",
            "graphrag_false",
            "graphrag_entity_types",
            "graphrag_method_general",
            "graphrag_method_light",
            "graphrag_community_true",
            "graphrag_community_false",
            "graphrag_resolution_true",
            "graphrag_resolution_false",
            "raptor_true",
            "raptor_false",
            "raptor_prompt",
            "raptor_max_token_min",
            "raptor_max_token_mid",
            "raptor_max_token_max",
            "raptor_threshold_min",
            "raptor_threshold_mid",
            "raptor_threshold_max",
            "raptor_max_cluster_min",
            "raptor_max_cluster_mid",
            "raptor_max_cluster_max",
            "raptor_random_seed_min",
        ],
    )
    def test_parser_config(self, client, name, parser_config):
        """Each supplied parser-config value is reported back on the created dataset.

        Top-level keys are compared directly; a dict value is compared
        key by key via the dotted attribute path ``<key>.<subkey>``.
        """
        wrapped = DataSet.ParserConfig(client, parser_config)
        dataset = client.create_dataset(name=name, parser_config=wrapped)
        for k, v in parser_config.items():
            if not isinstance(v, dict):
                assert attrgetter(k)(dataset.parser_config) == v, str(dataset)
                continue
            for kk, vv in v.items():
                assert attrgetter(f"{k}.{kk}")(dataset.parser_config) == vv, str(dataset)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, parser_config, expected_message",
        [
            ("auto_keywords_min_limit", {"auto_keywords": -1}, "Input should be greater than or equal to 0"),
            ("auto_keywords_max_limit", {"auto_keywords": 33}, "Input should be less than or equal to 32"),
            ("auto_keywords_float_not_allowed", {"auto_keywords": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
            ("auto_keywords_type_invalid", {"auto_keywords": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
            ("auto_questions_min_limit", {"auto_questions": -1}, "Input should be greater than or equal to 0"),
            ("auto_questions_max_limit", {"auto_questions": 11}, "Input should be less than or equal to 10"),
            ("auto_questions_float_not_allowed", {"auto_questions": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
            ("auto_questions_type_invalid", {"auto_questions": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
            ("chunk_token_num_min_limit", {"chunk_token_num": 0}, "Input should be greater than or equal to 1"),
            ("chunk_token_num_max_limit", {"chunk_token_num": 2049}, "Input should be less than or equal to 2048"),
            ("chunk_token_num_float_not_allowed", {"chunk_token_num": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
            ("chunk_token_num_type_invalid", {"chunk_token_num": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
            ("delimiter_empty", {"delimiter": ""}, "String should have at least 1 character"),
            ("html4excel_type_invalid", {"html4excel": "string"}, "Input should be a valid boolean, unable to interpret input"),
            ("tag_kb_ids_not_list", {"tag_kb_ids": "1,2"}, "Input should be a valid list"),
            ("tag_kb_ids_int_in_list", {"tag_kb_ids": [1, 2]}, "Input should be a valid string"),
            ("topn_tags_min_limit", {"topn_tags": 0}, "Input should be greater than or equal to 1"),
            ("topn_tags_max_limit", {"topn_tags": 11}, "Input should be less than or equal to 10"),
            ("topn_tags_float_not_allowed", {"topn_tags": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
            ("topn_tags_type_invalid", {"topn_tags": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
            ("filename_embd_weight_min_limit", {"filename_embd_weight": -1}, "Input should be greater than or equal to 0"),
            ("filename_embd_weight_max_limit", {"filename_embd_weight": 1.1}, "Input should be less than or equal to 1"),
            ("filename_embd_weight_type_invalid", {"filename_embd_weight": "string"}, "Input should be a valid number, unable to parse string as a number"),
            ("task_page_size_min_limit", {"task_page_size": 0}, "Input should be greater than or equal to 1"),
            ("task_page_size_float_not_allowed", {"task_page_size": 3.14}, "Input should be a valid integer, got a number with a fractional part"),
            ("task_page_size_type_invalid", {"task_page_size": "string"}, "Input should be a valid integer, unable to parse string as an integer"),
            ("pages_not_list", {"pages": "1,2"}, "Input should be a valid list"),
            ("pages_not_list_in_list", {"pages": ["1,2"]}, "Input should be a valid list"),
            ("pages_not_int_list", {"pages": [["string1", "string2"]]}, "Input should be a valid integer, unable to parse string as an integer"),
            ("graphrag_type_invalid", {"graphrag": {"use_graphrag": "string"}}, "Input should be a valid boolean, unable to interpret input"),
            ("graphrag_entity_types_not_list", {"graphrag": {"entity_types": "1,2"}}, "Input should be a valid list"),
            # Full message restored (was missing its leading "I").
            ("graphrag_entity_types_not_str_in_list", {"graphrag": {"entity_types": [1, 2]}}, "Input should be a valid string"),
            ("graphrag_method_unknown", {"graphrag": {"method": "unknown"}}, "Input should be 'light' or 'general'"),
            ("graphrag_method_none", {"graphrag": {"method": None}}, "Input should be 'light' or 'general'"),
            ("graphrag_community_type_invalid", {"graphrag": {"community": "string"}}, "Input should be a valid boolean, unable to interpret input"),
            ("graphrag_resolution_type_invalid", {"graphrag": {"resolution": "string"}}, "Input should be a valid boolean, unable to interpret input"),
            ("raptor_type_invalid", {"raptor": {"use_raptor": "string"}}, "Input should be a valid boolean, unable to interpret input"),
            ("raptor_prompt_empty", {"raptor": {"prompt": ""}}, "String should have at least 1 character"),
            ("raptor_prompt_space", {"raptor": {"prompt": " "}}, "String should have at least 1 character"),
            ("raptor_max_token_min_limit", {"raptor": {"max_token": 0}}, "Input should be greater than or equal to 1"),
            ("raptor_max_token_max_limit", {"raptor": {"max_token": 2049}}, "Input should be less than or equal to 2048"),
            ("raptor_max_token_float_not_allowed", {"raptor": {"max_token": 3.14}}, "Input should be a valid integer, got a number with a fractional part"),
            ("raptor_max_token_type_invalid", {"raptor": {"max_token": "string"}}, "Input should be a valid integer, unable to parse string as an integer"),
            ("raptor_threshold_min_limit", {"raptor": {"threshold": -0.1}}, "Input should be greater than or equal to 0"),
            ("raptor_threshold_max_limit", {"raptor": {"threshold": 1.1}}, "Input should be less than or equal to 1"),
            ("raptor_threshold_type_invalid", {"raptor": {"threshold": "string"}}, "Input should be a valid number, unable to parse string as a number"),
            ("raptor_max_cluster_min_limit", {"raptor": {"max_cluster": 0}}, "Input should be greater than or equal to 1"),
            ("raptor_max_cluster_max_limit", {"raptor": {"max_cluster": 1025}}, "Input should be less than or equal to 1024"),
            # Full message restored (was truncated to "...fractional par").
            ("raptor_max_cluster_float_not_allowed", {"raptor": {"max_cluster": 3.14}}, "Input should be a valid integer, got a number with a fractional part"),
            ("raptor_max_cluster_type_invalid", {"raptor": {"max_cluster": "string"}}, "Input should be a valid integer, unable to parse string as an integer"),
            ("raptor_random_seed_min_limit", {"raptor": {"random_seed": -1}}, "Input should be greater than or equal to 0"),
            ("raptor_random_seed_float_not_allowed", {"raptor": {"random_seed": 3.14}}, "Input should be a valid integer, got a number with a fractional part"),
            ("raptor_random_seed_type_invalid", {"raptor": {"random_seed": "string"}}, "Input should be a valid integer, unable to parse string as an integer"),
            ("parser_config_type_invalid", {"delimiter": "a" * 65536}, "Parser config exceeds size limit (max 65,535 characters)"),
        ],
        ids=[
            "auto_keywords_min_limit",
            "auto_keywords_max_limit",
            "auto_keywords_float_not_allowed",
            "auto_keywords_type_invalid",
            "auto_questions_min_limit",
            "auto_questions_max_limit",
            "auto_questions_float_not_allowed",
            "auto_questions_type_invalid",
            "chunk_token_num_min_limit",
            "chunk_token_num_max_limit",
            "chunk_token_num_float_not_allowed",
            "chunk_token_num_type_invalid",
            "delimiter_empty",
            "html4excel_type_invalid",
            "tag_kb_ids_not_list",
            "tag_kb_ids_int_in_list",
            "topn_tags_min_limit",
            "topn_tags_max_limit",
            "topn_tags_float_not_allowed",
            "topn_tags_type_invalid",
            "filename_embd_weight_min_limit",
            "filename_embd_weight_max_limit",
            "filename_embd_weight_type_invalid",
            "task_page_size_min_limit",
            "task_page_size_float_not_allowed",
            "task_page_size_type_invalid",
            "pages_not_list",
            "pages_not_list_in_list",
            "pages_not_int_list",
            "graphrag_type_invalid",
            "graphrag_entity_types_not_list",
            "graphrag_entity_types_not_str_in_list",
            "graphrag_method_unknown",
            "graphrag_method_none",
            "graphrag_community_type_invalid",
            "graphrag_resolution_type_invalid",
            "raptor_type_invalid",
            "raptor_prompt_empty",
            "raptor_prompt_space",
            "raptor_max_token_min_limit",
            "raptor_max_token_max_limit",
            "raptor_max_token_float_not_allowed",
            "raptor_max_token_type_invalid",
            "raptor_threshold_min_limit",
            "raptor_threshold_max_limit",
            "raptor_threshold_type_invalid",
            "raptor_max_cluster_min_limit",
            "raptor_max_cluster_max_limit",
            "raptor_max_cluster_float_not_allowed",
            "raptor_max_cluster_type_invalid",
            "raptor_random_seed_min_limit",
            "raptor_random_seed_float_not_allowed",
            "raptor_random_seed_type_invalid",
            "parser_config_type_invalid",
        ],
    )
    def test_parser_config_invalid(self, client, name, parser_config, expected_message):
        """Check that an invalid ``parser_config`` value makes ``create_dataset`` raise.

        Each case supplies the dataset name (which doubles as the test id), a
        parser_config dict holding one out-of-range or wrongly typed value, and
        a fragment that must appear in the text of the raised exception.
        """
        parser_config_o = DataSet.ParserConfig(client, parser_config)
        payload = {"name": name, "parser_config": parser_config_o}
        with pytest.raises(Exception) as excinfo:
            client.create_dataset(**payload)
        # Substring match: the raised text may carry extra context around the message.
        assert expected_message in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_parser_config_empty(self, client):
        """Create a dataset with an empty parser_config and compare it with the expected defaults.

        The comparison is done on the string form of the two ParserConfig objects.
        """
        default_settings = {
            "chunk_token_num": 128,
            "delimiter": r"\n",
            "html4excel": False,
            "layout_recognize": "DeepDOC",
            "raptor": {"use_raptor": False},
        }
        expected = DataSet.ParserConfig(client, default_settings)
        empty_config = DataSet.ParserConfig(client, {})
        created = client.create_dataset(name="parser_config_empty", parser_config=empty_config)
        assert str(created.parser_config) == str(expected), str(created)

    @pytest.mark.p2
    def test_parser_config_unset(self, client):
        """Create a dataset without passing parser_config and compare it with the expected defaults.

        The comparison is done on the string form of the two ParserConfig objects.
        """
        default_settings = {
            "chunk_token_num": 128,
            "delimiter": r"\n",
            "html4excel": False,
            "layout_recognize": "DeepDOC",
            "raptor": {"use_raptor": False},
        }
        expected = DataSet.ParserConfig(client, default_settings)
        created = client.create_dataset(name="parser_config_unset")
        assert str(created.parser_config) == str(expected), str(created)

    @pytest.mark.p3
    def test_parser_config_none(self, client):
        """Create a dataset with ``parser_config=None`` and compare it with the expected defaults.

        The comparison is done on the string form of the two ParserConfig objects.
        """
        excepted_value = DataSet.ParserConfig(
            client,
            {
                "chunk_token_num": 128,
                "delimiter": r"\n",
                "html4excel": False,
                "layout_recognize": "DeepDOC",
                "raptor": {"use_raptor": False},
            },
        )
        # Dedicated name: the previous copy-pasted "parser_config_empty" clashed
        # with the dataset name used by test_parser_config_empty.
        payload = {"name": "parser_config_none", "parser_config": None}
        dataset = client.create_dataset(**payload)
        assert str(dataset.parser_config) == str(excepted_value), str(dataset)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "payload",
        [
            {"name": "id", "id": "id"},
            {"name": "tenant_id", "tenant_id": "e57c1966f99211efb41e9e45646e0111"},
            {"name": "created_by", "created_by": "created_by"},
            {"name": "create_date", "create_date": "Tue, 11 Mar 2025 13:37:23 GMT"},
            {"name": "create_time", "create_time": 1741671443322},
            {"name": "update_date", "update_date": "Tue, 11 Mar 2025 13:37:23 GMT"},
            {"name": "update_time", "update_time": 1741671443339},
            {"name": "document_count", "document_count": 1},
            {"name": "chunk_count", "chunk_count": 1},
            {"name": "token_num", "token_num": 1},
            {"name": "status", "status": "1"},
            {"name": "unknown_field", "unknown_field": "unknown_field"},
        ],
    )
    def test_unsupported_field(self, client, payload):
        """Passing a keyword that ``create_dataset`` does not accept must raise.

        Every payload pairs a valid ``name`` with one extra keyword; the raised
        text is expected to mention the unexpected keyword argument.
        """
        with pytest.raises(Exception) as exc_info:
            client.create_dataset(**payload)
        error_text = str(exc_info.value)
        assert "got an unexpected keyword argument" in error_text, error_text
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_delete_datasets.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_delete_datasets.py
@@ -0,0 +1,178 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from common import batch_create_datasets
 from configs import HOST_ADDRESS, INVALID_API_TOKEN
 from ragflow_sdk import RAGFlow


 class TestAuthorization:
    """Authentication checks for ``RAGFlow.delete_datasets``."""

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "invalid_auth, expected_message",
        [
            (None, "Authentication error: API key is invalid!"),
            (INVALID_API_TOKEN, "Authentication error: API key is invalid!"),
        ],
    )
    def test_auth_invalid(self, invalid_auth, expected_message):
        """A client built with a missing or invalid API key must fail with the exact auth error."""
        unauthorized = RAGFlow(invalid_auth, HOST_ADDRESS)
        with pytest.raises(Exception) as exc_info:
            unauthorized.delete_datasets()
        error_text = str(exc_info.value)
        assert error_text == expected_message


 class TestCapability:
    """Bulk and concurrent deletion checks for ``RAGFlow.delete_datasets``."""

    @pytest.mark.p3
    def test_delete_dataset_1k(self, client):
        """Delete 1,000 datasets in a single call and expect none to be listed afterwards."""
        datasets = batch_create_datasets(client, 1_000)
        client.delete_datasets(**{"ids": [dataset.id for dataset in datasets]})

        datasets = client.list_datasets()
        assert len(datasets) == 0, datasets

    @pytest.mark.p3
    def test_concurrent_deletion(self, client):
        """Delete 1,000 datasets one id per call from 5 worker threads.

        Every call must complete without raising and no dataset may be listed
        afterwards.
        """
        count = 1_000
        datasets = batch_create_datasets(client, count)
        # Build the id list once instead of re-creating it for every submitted task.
        dataset_ids = [dataset.id for dataset in datasets]
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(client.delete_datasets, **{"ids": dataset_ids[i : i + 1]}) for i in range(count)]
        responses = list(as_completed(futures))
        assert len(responses) == count, responses
        # as_completed yields Future objects; result() re-raises any exception
        # a worker hit, which would otherwise go unnoticed.
        for future in futures:
            future.result()

        datasets = client.list_datasets()
        assert len(datasets) == 0, datasets


 class TestDatasetsDelete:
    """Behavioural checks for ``RAGFlow.delete_datasets`` and its ``ids`` argument."""

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "func, remaining",
        [
            (lambda r: {"ids": r[:1]}, 2),
            (lambda r: {"ids": r}, 0),
        ],
        ids=["single_dataset", "multiple_datasets"],
    )
    def test_ids(self, client, add_datasets_func, func, remaining):
        """Delete one or all fixture datasets and check how many are still listed."""
        # func is always a lambda here; it builds the payload from the fixture's ids.
        payload = func([dataset.id for dataset in add_datasets_func])
        client.delete_datasets(**payload)

        datasets = client.list_datasets()
        assert len(datasets) == remaining, str(datasets)

    @pytest.mark.p1
    @pytest.mark.usefixtures("add_dataset_func")
    def test_ids_empty(self, client):
        """An empty ``ids`` list is expected to leave the single fixture dataset in place."""
        payload = {"ids": []}
        client.delete_datasets(**payload)

        datasets = client.list_datasets()
        assert len(datasets) == 1, str(datasets)

    @pytest.mark.p1
    @pytest.mark.usefixtures("add_datasets_func")
    def test_ids_none(self, client):
        """``ids=None`` is expected to leave no datasets listed."""
        payload = {"ids": None}
        client.delete_datasets(**payload)

        datasets = client.list_datasets()
        assert len(datasets) == 0, str(datasets)

    @pytest.mark.p2
    @pytest.mark.usefixtures("add_dataset_func")
    def test_id_not_uuid(self, client):
        """A non-UUID id must be rejected and the existing dataset must survive."""
        payload = {"ids": ["not_uuid"]}
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "Invalid UUID1 format" in str(excinfo.value), str(excinfo.value)

        datasets = client.list_datasets()
        assert len(datasets) == 1, str(datasets)

    @pytest.mark.p3
    @pytest.mark.usefixtures("add_dataset_func")
    def test_id_not_uuid1(self, client):
        """A UUID4 hex string must be rejected with the UUID1 format error."""
        payload = {"ids": [uuid.uuid4().hex]}
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "Invalid UUID1 format" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    @pytest.mark.usefixtures("add_dataset_func")
    def test_id_wrong_uuid(self, client):
        """A hard-coded id that is not among the created datasets must raise a permission error."""
        payload = {"ids": ["d94a8dc02c9711f0930f7fbc369eab6d"]}
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

        datasets = client.list_datasets()
        assert len(datasets) == 1, str(datasets)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "func",
        [
            lambda r: {"ids": ["d94a8dc02c9711f0930f7fbc369eab6d"] + r},
            lambda r: {"ids": r[:1] + ["d94a8dc02c9711f0930f7fbc369eab6d"] + r[1:3]},
            lambda r: {"ids": r + ["d94a8dc02c9711f0930f7fbc369eab6d"]},
        ],
    )
    def test_ids_partial_invalid(self, client, add_datasets_func, func):
        """One foreign id at the start, middle or end of the list must fail the whole call.

        All three fixture datasets are expected to remain listed afterwards.
        """
        # func is always a lambda here; it mixes the foreign id into the fixture's ids.
        payload = func([dataset.id for dataset in add_datasets_func])
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

        datasets = client.list_datasets()
        assert len(datasets) == 3, str(datasets)

    @pytest.mark.p2
    def test_ids_duplicate(self, client, add_datasets_func):
        """Repeating every id in the list must be rejected and nothing must be deleted."""
        dataset_ids = [dataset.id for dataset in add_datasets_func]
        payload = {"ids": dataset_ids + dataset_ids}
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "Duplicate ids:" in str(excinfo.value), str(excinfo.value)

        datasets = client.list_datasets()
        assert len(datasets) == 3, str(datasets)

    @pytest.mark.p2
    def test_repeated_delete(self, client, add_datasets_func):
        """Deleting the same ids a second time must raise a permission error."""
        dataset_ids = [dataset.id for dataset in add_datasets_func]
        payload = {"ids": dataset_ids}
        client.delete_datasets(**payload)

        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    @pytest.mark.usefixtures("add_dataset_func")
    def test_field_unsupported(self, client):
        """An unknown keyword must raise and leave the fixture dataset untouched."""
        payload = {"unknown_field": "unknown_field"}
        with pytest.raises(Exception) as excinfo:
            client.delete_datasets(**payload)
        assert "got an unexpected keyword argument 'unknown_field'" in str(excinfo.value), str(excinfo.value)

        datasets = client.list_datasets()
        assert len(datasets) == 1, str(datasets)
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_list_datasets.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_list_datasets.py
@@ -0,0 +1,313 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 import uuid
 from concurrent.futures import ThreadPoolExecutor, as_completed

 import pytest
 from configs import HOST_ADDRESS, INVALID_API_TOKEN
 from ragflow_sdk import RAGFlow


 class TestAuthorization:
    """Authentication checks for ``RAGFlow.list_datasets``."""

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "invalid_auth, expected_message",
        [
            (None, "Authentication error: API key is invalid!"),
            (INVALID_API_TOKEN, "Authentication error: API key is invalid!"),
        ],
    )
    def test_auth_invalid(self, invalid_auth, expected_message):
        """A client built with a missing or invalid API key must fail with the auth error."""
        unauthorized = RAGFlow(invalid_auth, HOST_ADDRESS)
        with pytest.raises(Exception) as exc_info:
            unauthorized.list_datasets()
        error_text = str(exc_info.value)
        assert expected_message in error_text


 class TestCapability:
    """Concurrency check for ``RAGFlow.list_datasets``."""

    @pytest.mark.p3
    def test_concurrent_list(self, client):
        """Issue 100 list calls from 5 worker threads; every call must complete without raising."""
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(client.list_datasets) for _ in range(count)]
        responses = list(as_completed(futures))
        assert len(responses) == count, responses
        # as_completed yields Future objects; result() re-raises any exception
        # a worker hit, which would otherwise go unnoticed.
        for future in futures:
            future.result()


@pytest.mark.usefixtures("add_datasets")
class TestDatasetsList:
    """Behavioural checks for ``RAGFlow.list_datasets``.

    The tests below expect five datasets to be listed when no filter is
    applied (see ``test_params_unset``).
    """

    @pytest.mark.p1
    def test_params_unset(self, client):
        """Listing without arguments returns all five datasets."""
        datasets = client.list_datasets()
        assert len(datasets) == 5, str(datasets)

    @pytest.mark.p2
    def test_params_empty(self, client):
        """Listing with an empty keyword mapping returns all five datasets."""
        datasets = client.list_datasets(**{})
        assert len(datasets) == 5, str(datasets)

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "params, expected_page_size",
        [
            ({"page": 2, "page_size": 2}, 2),
            ({"page": 3, "page_size": 2}, 1),
            ({"page": 4, "page_size": 2}, 0),
            ({"page": 1, "page_size": 10}, 5),
        ],
        ids=["normal_middle_page", "normal_last_partial_page", "beyond_max_page", "full_data_single_page"],
    )
    def test_page(self, client, params, expected_page_size):
        """Check the number of datasets returned for several page/page_size combinations."""
        datasets = client.list_datasets(**params)
        assert len(datasets) == expected_page_size, str(datasets)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "params, expected_message",
        [
            ({"page": 0}, "Input should be greater than or equal to 1"),
            ({"page": "a"}, "not instance of"),
        ],
        ids=["page_0", "page_a"],
    )
    def test_page_invalid(self, client, params, expected_message):
        """A zero or non-integer ``page`` must raise with the expected message."""
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert expected_message in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_page_none(self, client):
        """``page=None`` must raise with a "not instance of" message."""
        params = {"page": None}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "params, expected_page_size",
        [
            ({"page_size": 1}, 1),
            ({"page_size": 3}, 3),
            ({"page_size": 5}, 5),
            ({"page_size": 6}, 5),
        ],
        ids=["min_valid_page_size", "medium_page_size", "page_size_equals_total", "page_size_exceeds_total"],
    )
    def test_page_size(self, client, params, expected_page_size):
        """Check the number of datasets returned for several ``page_size`` values."""
        datasets = client.list_datasets(**params)
        assert len(datasets) == expected_page_size, str(datasets)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "params, expected_message",
        [
            ({"page_size": 0}, "Input should be greater than or equal to 1"),
            ({"page_size": "a"}, "not instance of"),
        ],
        # Explicit ids, mirroring test_page_invalid.
        ids=["page_size_0", "page_size_a"],
    )
    def test_page_size_invalid(self, client, params, expected_message):
        """A zero or non-integer ``page_size`` must raise with the expected message."""
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert expected_message in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_page_size_none(self, client):
        """``page_size=None`` must raise with a "not instance of" message."""
        params = {"page_size": None}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "params",
        [
            {"orderby": "create_time"},
            {"orderby": "update_time"},
            {"orderby": "CREATE_TIME"},
            {"orderby": "UPDATE_TIME"},
            {"orderby": " create_time "},
        ],
        ids=["orderby_create_time", "orderby_update_time", "orderby_create_time_upper", "orderby_update_time_upper", "whitespace"],
    )
    def test_orderby(self, client, params):
        """Accepted ``orderby`` spellings; the test only checks that the call does not raise."""
        client.list_datasets(**params)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "params",
        [
            {"orderby": ""},
            {"orderby": "unknown"},
        ],
        ids=["empty", "unknown"],
    )
    def test_orderby_invalid(self, client, params):
        """An empty or unknown ``orderby`` must raise naming the two allowed values."""
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "Input should be 'create_time' or 'update_time'" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p3
    def test_orderby_none(self, client):
        """``orderby=None`` must raise with a "not instance of" message."""
        params = {"orderby": None}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "params",
        [
            {"desc": True},
            {"desc": False},
        ],
        ids=["desc=True", "desc=False"],
    )
    def test_desc(self, client, params):
        """Both boolean ``desc`` values; the test only checks that the call does not raise."""
        client.list_datasets(**params)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "params",
        [
            {"desc": 3.14},
            {"desc": "unknown"},
        ],
        ids=["float_value", "invalid_string"],
    )
    def test_desc_invalid(self, client, params):
        """A float or string ``desc`` must raise with a "not instance of" message."""
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p3
    def test_desc_none(self, client):
        """``desc=None`` must raise with a "not instance of" message."""
        params = {"desc": None}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "not instance of" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p1
    def test_name(self, client):
        """Filtering by name "dataset_1" returns exactly that dataset."""
        params = {"name": "dataset_1"}
        datasets = client.list_datasets(**params)
        assert len(datasets) == 1, str(datasets)
        assert datasets[0].name == "dataset_1", str(datasets)

    @pytest.mark.p2
    def test_name_wrong(self, client):
        """The name "wrong name" must raise a permission error."""
        params = {"name": "wrong name"}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_name_empty(self, client):
        """An empty name returns all five datasets."""
        params = {"name": ""}
        datasets = client.list_datasets(**params)
        assert len(datasets) == 5, str(datasets)

    @pytest.mark.p2
    def test_name_none(self, client):
        """``name=None`` returns all five datasets."""
        params = {"name": None}
        datasets = client.list_datasets(**params)
        assert len(datasets) == 5, str(datasets)

    @pytest.mark.p1
    def test_id(self, client, add_datasets):
        """Filtering by the first fixture dataset's id returns exactly that dataset."""
        dataset_ids = [dataset.id for dataset in add_datasets]
        params = {"id": dataset_ids[0]}
        datasets = client.list_datasets(**params)
        assert len(datasets) == 1, str(datasets)
        assert datasets[0].id == dataset_ids[0], str(datasets)

    @pytest.mark.p2
    def test_id_not_uuid(self, client):
        """A non-UUID id must raise with the UUID1 format error."""
        params = {"id": "not_uuid"}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "Invalid UUID1 format" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_id_not_uuid1(self, client):
        """A UUID4 hex string must raise with the UUID1 format error."""
        params = {"id": uuid.uuid4().hex}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "Invalid UUID1 format" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_id_wrong_uuid(self, client):
        """A hard-coded id that is not among the created datasets must raise a permission error."""
        params = {"id": "d94a8dc02c9711f0930f7fbc369eab6d"}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_id_empty(self, client):
        """An empty id must raise with the UUID1 format error."""
        params = {"id": ""}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "Invalid UUID1 format" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_id_none(self, client):
        """``id=None`` returns all five datasets."""
        params = {"id": None}
        datasets = client.list_datasets(**params)
        assert len(datasets) == 5, str(datasets)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "func, name, expected_num",
        [
            (lambda r: r[0].id, "dataset_0", 1),
            (lambda r: r[0].id, "dataset_1", 0),
        ],
        ids=["name_and_id_match", "name_and_id_mismatch"],
    )
    def test_name_and_id(self, client, add_datasets, func, name, expected_num):
        """Combine an id filter with a matching or mismatching name and check the result count."""
        # func is always a lambda here; it picks the id from the fixture's datasets.
        params = {"id": func(add_datasets), "name": name}
        datasets = client.list_datasets(**params)
        assert len(datasets) == expected_num, str(datasets)

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "dataset_id, name",
        [
            (lambda r: r[0].id, "wrong_name"),
            (uuid.uuid1().hex, "dataset_0"),
        ],
        ids=["name", "id"],
    )
    def test_name_and_id_wrong(self, client, add_datasets, dataset_id, name):
        """A wrong name or a freshly generated id combined with the other filter must raise a permission error."""
        # dataset_id is either a ready-made id string or a callable that
        # extracts one from the fixture's datasets.
        if callable(dataset_id):
            params = {"id": dataset_id(add_datasets), "name": name}
        else:
            params = {"id": dataset_id, "name": name}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "lacks permission for dataset" in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_field_unsupported(self, client):
        """An unknown keyword must raise mentioning the unexpected keyword argument."""
        params = {"unknown_field": "unknown_field"}
        with pytest.raises(Exception) as excinfo:
            client.list_datasets(**params)
        assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py
@@ -0,0 +1,724 @@
 #
 #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 #
 #  Licensed under the Apache License, Version 2.0 (the "License");
 #  you may not use this file except in compliance with the License.
 #  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from operator import attrgetter

 import pytest
 from configs import DATASET_NAME_LIMIT
 from hypothesis import HealthCheck, example, given, settings
 from ragflow_sdk import DataSet
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names


 class TestRquest:
    """Request-level checks for the dataset ``update`` call."""

    @pytest.mark.p2
    def test_payload_empty(self, add_dataset_func):
        """An empty update payload must be rejected."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({})
        message = str(excinfo.value)
        assert "No properties were modified" in message, message


 class TestCapability:
    """Capability checks for the dataset ``update`` call."""

    @pytest.mark.p3
    def test_update_dateset_concurrent(self, add_dataset_func):
        """Submit 100 renames of one dataset through a 5-worker pool; all must succeed."""
        dataset = add_dataset_func
        count = 100
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [executor.submit(dataset.update, {"name": f"dataset_{i}"}) for i in range(count)]
        # as_completed() yields Future objects, not return values. Calling
        # result() re-raises any exception from the worker, so a failed update
        # fails the test instead of being silently counted as a response.
        responses = [future.result() for future in as_completed(futures)]
        assert len(responses) == count, responses


 class TestDatasetUpdate:
    @pytest.mark.p1
    @given(name=valid_names())
    @example("a" * 128)
    @settings(max_examples=20, suppress_health_check=[HealthCheck.function_scoped_fixture])
    def test_name(self, client, add_dataset_func, name):
        """Renaming must be reflected on the local object and on a fresh lookup."""
        ds = add_dataset_func
        ds.update({"name": name})
        assert ds.name == name, str(ds)

        # Look the dataset up again by its (new) name to check what was stored.
        fetched = client.get_dataset(name=ds.name)
        assert fetched.name == name, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, expected_message",
        [
            pytest.param("", "String should have at least 1 character", id="empty_name"),
            pytest.param(" ", "String should have at least 1 character", id="space_name"),
            pytest.param("a" * (DATASET_NAME_LIMIT + 1), "String should have at most 128 characters", id="too_long_name"),
            pytest.param(0, "Input should be a valid string", id="invalid_name"),
            pytest.param(None, "Input should be a valid string", id="None_name"),
        ],
    )
    def test_name_invalid(self, add_dataset_func, name, expected_message):
        """Each invalid name must be rejected with the matching validation message."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"name": name})
        message = str(excinfo.value)
        assert expected_message in message, message

    @pytest.mark.p3
    def test_name_duplicated(self, add_datasets_func):
        """Renaming the first dataset to "dataset_1" must be rejected as already existing."""
        taken_name = "dataset_1"
        first = add_datasets_func[0]
        with pytest.raises(Exception) as excinfo:
            first.update({"name": taken_name})
        message = str(excinfo.value)
        assert f"Dataset name '{taken_name}' already exists" in message, message

    @pytest.mark.p3
    def test_name_case_insensitive(self, add_datasets_func):
        """Renaming to the upper-case "DATASET_1" must also be rejected as already existing."""
        upper_name = "DATASET_1"
        first = add_datasets_func[0]
        with pytest.raises(Exception) as excinfo:
            first.update({"name": upper_name})
        message = str(excinfo.value)
        assert f"Dataset name '{upper_name}' already exists" in message, message

    @pytest.mark.p2
    def test_avatar(self, client, add_dataset_func, tmp_path):
        """A PNG data-URI avatar must be stored and returned unchanged."""
        ds = add_dataset_func
        image_path = create_image_file(tmp_path / "ragflow_test.png")
        encoded = encode_avatar(image_path)
        avatar_data = f"data:image/png;base64,{encoded}"

        ds.update({"avatar": avatar_data})
        assert ds.avatar == avatar_data, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.avatar == avatar_data, str(fetched)

    @pytest.mark.p2
    def test_avatar_exceeds_limit_length(self, add_dataset_func):
        """A 65536-character avatar string must be rejected."""
        oversized = "a" * 65536
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"avatar": oversized})
        message = str(excinfo.value)
        assert "String should have at most 65535 characters" in message, message

    @pytest.mark.p3
    @pytest.mark.parametrize(
        "avatar_prefix, expected_message",
        [
            # Each id sits next to its case; the previous separate ``ids`` list
            # had the last two labels attached to the wrong cases.
            pytest.param("", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>", id="empty_prefix"),
            pytest.param("data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>", id="missing_comma"),
            pytest.param("invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'", id="invalid_mime_prefix"),
            pytest.param("data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']", id="unsupported_mime_type"),
        ],
    )
    def test_avatar_invalid_prefix(self, add_dataset_func, tmp_path, avatar_prefix, expected_message):
        """An avatar whose data-URI prefix is malformed must be rejected with the matching message."""
        dataset = add_dataset_func
        fn = create_image_file(tmp_path / "ragflow_test.png")
        # Build the payload outside the ``raises`` block so only the update
        # call itself can satisfy the expected exception.
        avatar = f"{avatar_prefix}{encode_avatar(fn)}"
        with pytest.raises(Exception) as excinfo:
            dataset.update({"avatar": avatar})
        assert expected_message in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p3
    def test_avatar_none(self, client, add_dataset_func):
        """Updating the avatar to ``None`` must leave it ``None`` locally and on lookup."""
        ds = add_dataset_func
        ds.update({"avatar": None})
        assert ds.avatar is None, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.avatar is None, str(fetched)

    @pytest.mark.p2
    def test_description(self, client, add_dataset_func):
        """A new description must be visible locally and on a fresh lookup."""
        ds = add_dataset_func
        text = "description"
        ds.update({"description": text})
        assert ds.description == text, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.description == text, str(fetched)

    @pytest.mark.p2
    def test_description_exceeds_limit_length(self, add_dataset_func):
        """A 65536-character description must be rejected."""
        oversized = "a" * 65536
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"description": oversized})
        message = str(excinfo.value)
        assert "String should have at most 65535 characters" in message, message

    @pytest.mark.p3
    def test_description_none(self, client, add_dataset_func):
        """Updating the description to ``None`` must leave it ``None`` locally and on lookup."""
        ds = add_dataset_func
        ds.update({"description": None})
        assert ds.description is None, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.description is None, str(fetched)

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "embedding_model",
        [
            pytest.param("BAAI/bge-large-zh-v1.5@BAAI", id="builtin_baai"),
            pytest.param("maidalun1020/bce-embedding-base_v1@Youdao", id="builtin_youdao"),
            pytest.param("embedding-3@ZHIPU-AI", id="tenant_zhipu"),
        ],
    )
    def test_embedding_model(self, client, add_dataset_func, embedding_model):
        """Each listed embedding model must be accepted and stored."""
        ds = add_dataset_func
        ds.update({"embedding_model": embedding_model})
        assert ds.embedding_model == embedding_model, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.embedding_model == embedding_model, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, embedding_model",
        [
            pytest.param("unknown_llm_name", "unknown@ZHIPU-AI", id="unknown_llm_name"),
            pytest.param("unknown_llm_factory", "embedding-3@unknown", id="unknown_llm_factory"),
            pytest.param("tenant_no_auth_default_tenant_llm", "text-embedding-v3@Tongyi-Qianwen", id="tenant_no_auth_default_tenant_llm"),
            pytest.param("tenant_no_auth", "text-embedding-3-small@OpenAI", id="tenant_no_auth"),
        ],
    )
    def test_embedding_model_invalid(self, add_dataset_func, name, embedding_model):
        """Unknown or unauthorized embedding models must be rejected with an exact message."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"name": name, "embedding_model": embedding_model})
        error_msg = str(excinfo.value)
        # The case name decides which of the two exact messages is expected.
        expected = f"Unauthorized model: <{embedding_model}>" if "tenant_no_auth" in name else f"Unsupported model: <{embedding_model}>"
        assert error_msg == expected, error_msg

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "name, embedding_model",
        [
            pytest.param("missing_at", "BAAI/bge-large-zh-v1.5BAAI", id="missing_at"),
            pytest.param("missing_model_name", "@BAAI", id="empty_model_name"),
            pytest.param("missing_provider", "BAAI/bge-large-zh-v1.5@", id="empty_provider"),
            pytest.param("whitespace_only_model_name", " @BAAI", id="whitespace_only_model_name"),
            pytest.param("whitespace_only_provider", "BAAI/bge-large-zh-v1.5@ ", id="whitespace_only_provider"),
        ],
    )
    def test_embedding_model_format(self, add_dataset_func, name, embedding_model):
        """Malformed ``<model_name>@<provider>`` identifiers must be rejected."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"name": name, "embedding_model": embedding_model})
        error_msg = str(excinfo.value)
        # Only the case without an "@" gets the format message; the others
        # trip the non-empty check.
        if name == "missing_at":
            expected_fragment = "Embedding model identifier must follow <model_name>@<provider> format"
        else:
            expected_fragment = "Both model_name and provider must be non-empty strings"
        assert expected_fragment in error_msg, error_msg

    @pytest.mark.p2
    def test_embedding_model_none(self, add_dataset_func):
        """``None`` as embedding model must be rejected as a non-string."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"embedding_model": None})
        message = str(excinfo.value)
        assert "Input should be a valid string" in message, message

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "permission",
        [
            pytest.param("me", id="me"),
            pytest.param("team", id="team"),
            pytest.param("ME", id="me_upercase"),
            pytest.param("TEAM", id="team_upercase"),
            pytest.param(" ME ", id="whitespace"),
        ],
    )
    def test_permission(self, client, add_dataset_func, permission):
        """The stored permission must equal the input lower-cased and stripped."""
        ds = add_dataset_func
        normalized = permission.lower().strip()

        ds.update({"permission": permission})
        assert ds.permission == normalized, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.permission == normalized, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "permission",
        [
            pytest.param("", id="empty"),
            pytest.param("unknown", id="unknown"),
            pytest.param([], id="type_error"),
        ],
    )
    def test_permission_invalid(self, add_dataset_func, permission):
        """Values other than 'me' or 'team' must be rejected."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"permission": permission})
        message = str(excinfo.value)
        assert "Input should be 'me' or 'team'" in message, message

    @pytest.mark.p3
    def test_permission_none(self, add_dataset_func):
        """``None`` as permission must be rejected."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"permission": None})
        message = str(excinfo.value)
        assert "Input should be 'me' or 'team'" in message, message

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "chunk_method",
        [
            # Every case uses its own value as the test id.
            pytest.param(method, id=method)
            for method in ("naive", "book", "email", "laws", "manual", "one", "paper", "picture", "presentation", "qa", "table", "tag")
        ],
    )
    def test_chunk_method(self, client, add_dataset_func, chunk_method):
        """Each listed chunk method must be accepted and stored."""
        ds = add_dataset_func
        ds.update({"chunk_method": chunk_method})
        assert ds.chunk_method == chunk_method, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.chunk_method == chunk_method, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "chunk_method",
        [
            pytest.param("", id="empty"),
            pytest.param("unknown", id="unknown"),
            pytest.param([], id="type_error"),
        ],
    )
    def test_chunk_method_invalid(self, add_dataset_func, chunk_method):
        """Values outside the chunk-method list must be rejected."""
        expected = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table' or 'tag'"
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"chunk_method": chunk_method})
        message = str(excinfo.value)
        assert expected in message, message

    @pytest.mark.p3
    def test_chunk_method_none(self, add_dataset_func):
        """``None`` as chunk method must be rejected."""
        expected = "Input should be 'naive', 'book', 'email', 'laws', 'manual', 'one', 'paper', 'picture', 'presentation', 'qa', 'table' or 'tag'"
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"chunk_method": None})
        message = str(excinfo.value)
        assert expected in message, message

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "pagerank",
        [
            pytest.param(0, id="min"),
            pytest.param(50, id="mid"),
            pytest.param(100, id="max"),
        ],
    )
    def test_pagerank(self, client, add_dataset_func, pagerank):
        """Pagerank values 0, 50 and 100 must be accepted and stored."""
        ds = add_dataset_func
        ds.update({"pagerank": pagerank})
        assert ds.pagerank == pagerank, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert fetched.pagerank == pagerank, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "pagerank, expected_message",
        [
            pytest.param(-1, "Input should be greater than or equal to 0", id="min_limit"),
            pytest.param(101, "Input should be less than or equal to 100", id="max_limit"),
        ],
    )
    def test_pagerank_invalid(self, add_dataset_func, pagerank, expected_message):
        """Pagerank values -1 and 101 must be rejected with the matching bound message."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"pagerank": pagerank})
        message = str(excinfo.value)
        assert expected_message in message, message

    @pytest.mark.p3
    def test_pagerank_none(self, add_dataset_func):
        """``None`` as pagerank must be rejected as a non-integer."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update({"pagerank": None})
        message = str(excinfo.value)
        assert "Input should be a valid integer" in message, message

    @pytest.mark.p1
    @pytest.mark.parametrize(
        "parser_config",
        [
            # Ids are attached to their cases so a label cannot drift away
            # from the value it describes.
            pytest.param({"auto_keywords": 0}, id="auto_keywords_min"),
            pytest.param({"auto_keywords": 16}, id="auto_keywords_mid"),
            pytest.param({"auto_keywords": 32}, id="auto_keywords_max"),
            pytest.param({"auto_questions": 0}, id="auto_questions_min"),
            pytest.param({"auto_questions": 5}, id="auto_questions_mid"),
            pytest.param({"auto_questions": 10}, id="auto_questions_max"),
            pytest.param({"chunk_token_num": 1}, id="chunk_token_num_min"),
            pytest.param({"chunk_token_num": 1024}, id="chunk_token_num_mid"),
            pytest.param({"chunk_token_num": 2048}, id="chunk_token_num_max"),
            pytest.param({"delimiter": "\n"}, id="delimiter"),
            pytest.param({"delimiter": " "}, id="delimiter_space"),
            pytest.param({"html4excel": True}, id="html4excel_true"),
            pytest.param({"html4excel": False}, id="html4excel_false"),
            pytest.param({"layout_recognize": "DeepDOC"}, id="layout_recognize_DeepDOC"),
            pytest.param({"layout_recognize": "Plain Text"}, id="layout_recognize_plain_text"),
            pytest.param({"tag_kb_ids": ["1", "2"]}, id="tag_kb_ids"),
            pytest.param({"topn_tags": 1}, id="topn_tags_min"),
            pytest.param({"topn_tags": 5}, id="topn_tags_mid"),
            pytest.param({"topn_tags": 10}, id="topn_tags_max"),
            pytest.param({"filename_embd_weight": 0.1}, id="filename_embd_weight_min"),
            pytest.param({"filename_embd_weight": 0.5}, id="filename_embd_weight_mid"),
            pytest.param({"filename_embd_weight": 1.0}, id="filename_embd_weight_max"),
            pytest.param({"task_page_size": 1}, id="task_page_size_min"),
            pytest.param({"task_page_size": None}, id="task_page_size_None"),
            pytest.param({"pages": [[1, 100]]}, id="pages"),
            pytest.param({"pages": None}, id="pages_none"),
            pytest.param({"graphrag": {"use_graphrag": True}}, id="graphrag_true"),
            pytest.param({"graphrag": {"use_graphrag": False}}, id="graphrag_false"),
            pytest.param({"graphrag": {"entity_types": ["age", "sex", "height", "weight"]}}, id="graphrag_entity_types"),
            pytest.param({"graphrag": {"method": "general"}}, id="graphrag_method_general"),
            pytest.param({"graphrag": {"method": "light"}}, id="graphrag_method_light"),
            pytest.param({"graphrag": {"community": True}}, id="graphrag_community_true"),
            pytest.param({"graphrag": {"community": False}}, id="graphrag_community_false"),
            pytest.param({"graphrag": {"resolution": True}}, id="graphrag_resolution_true"),
            pytest.param({"graphrag": {"resolution": False}}, id="graphrag_resolution_false"),
            pytest.param({"raptor": {"use_raptor": True}}, id="raptor_true"),
            pytest.param({"raptor": {"use_raptor": False}}, id="raptor_false"),
            pytest.param({"raptor": {"prompt": "Who are you?"}}, id="raptor_prompt"),
            pytest.param({"raptor": {"max_token": 1}}, id="raptor_max_token_min"),
            pytest.param({"raptor": {"max_token": 1024}}, id="raptor_max_token_mid"),
            pytest.param({"raptor": {"max_token": 2048}}, id="raptor_max_token_max"),
            pytest.param({"raptor": {"threshold": 0.0}}, id="raptor_threshold_min"),
            pytest.param({"raptor": {"threshold": 0.5}}, id="raptor_threshold_mid"),
            pytest.param({"raptor": {"threshold": 1.0}}, id="raptor_threshold_max"),
            pytest.param({"raptor": {"max_cluster": 1}}, id="raptor_max_cluster_min"),
            pytest.param({"raptor": {"max_cluster": 512}}, id="raptor_max_cluster_mid"),
            pytest.param({"raptor": {"max_cluster": 1024}}, id="raptor_max_cluster_max"),
            pytest.param({"raptor": {"random_seed": 0}}, id="raptor_random_seed_min"),
        ],
    )
    def test_parser_config(self, client, add_dataset_func, parser_config):
        """Each valid parser_config fragment must be stored and readable back.

        The check runs twice: on the object that was updated and on a dataset
        fetched again by name.
        """

        def assert_config_applied(target):
            # A dict value addresses a nested section, e.g.
            # {"raptor": {"max_token": 1}} is read back as raptor.max_token.
            for key, value in parser_config.items():
                if isinstance(value, dict):
                    for sub_key, sub_value in value.items():
                        assert attrgetter(f"{key}.{sub_key}")(target.parser_config) == sub_value, str(target)
                else:
                    assert attrgetter(key)(target.parser_config) == value, str(target)

        dataset = add_dataset_func
        dataset.update({"parser_config": parser_config})
        assert_config_applied(dataset)

        retrieved_dataset = client.get_dataset(name=dataset.name)
        assert_config_applied(retrieved_dataset)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "parser_config, expected_message",
        [
            # Ids are attached to their cases so a label cannot drift away
            # from the value it describes.
            pytest.param({"auto_keywords": -1}, "Input should be greater than or equal to 0", id="auto_keywords_min_limit"),
            pytest.param({"auto_keywords": 33}, "Input should be less than or equal to 32", id="auto_keywords_max_limit"),
            pytest.param({"auto_keywords": 3.14}, "Input should be a valid integer, got a number with a fractional part", id="auto_keywords_float_not_allowed"),
            pytest.param({"auto_keywords": "string"}, "Input should be a valid integer, unable to parse string as an integer", id="auto_keywords_type_invalid"),
            pytest.param({"auto_questions": -1}, "Input should be greater than or equal to 0", id="auto_questions_min_limit"),
            pytest.param({"auto_questions": 11}, "Input should be less than or equal to 10", id="auto_questions_max_limit"),
            pytest.param({"auto_questions": 3.14}, "Input should be a valid integer, got a number with a fractional part", id="auto_questions_float_not_allowed"),
            pytest.param({"auto_questions": "string"}, "Input should be a valid integer, unable to parse string as an integer", id="auto_questions_type_invalid"),
            pytest.param({"chunk_token_num": 0}, "Input should be greater than or equal to 1", id="chunk_token_num_min_limit"),
            pytest.param({"chunk_token_num": 2049}, "Input should be less than or equal to 2048", id="chunk_token_num_max_limit"),
            pytest.param({"chunk_token_num": 3.14}, "Input should be a valid integer, got a number with a fractional part", id="chunk_token_num_float_not_allowed"),
            pytest.param({"chunk_token_num": "string"}, "Input should be a valid integer, unable to parse string as an integer", id="chunk_token_num_type_invalid"),
            pytest.param({"delimiter": ""}, "String should have at least 1 character", id="delimiter_empty"),
            pytest.param({"html4excel": "string"}, "Input should be a valid boolean, unable to interpret input", id="html4excel_type_invalid"),
            pytest.param({"tag_kb_ids": "1,2"}, "Input should be a valid list", id="tag_kb_ids_not_list"),
            pytest.param({"tag_kb_ids": [1, 2]}, "Input should be a valid string", id="tag_kb_ids_int_in_list"),
            pytest.param({"topn_tags": 0}, "Input should be greater than or equal to 1", id="topn_tags_min_limit"),
            pytest.param({"topn_tags": 11}, "Input should be less than or equal to 10", id="topn_tags_max_limit"),
            pytest.param({"topn_tags": 3.14}, "Input should be a valid integer, got a number with a fractional part", id="topn_tags_float_not_allowed"),
            pytest.param({"topn_tags": "string"}, "Input should be a valid integer, unable to parse string as an integer", id="topn_tags_type_invalid"),
            pytest.param({"filename_embd_weight": -1}, "Input should be greater than or equal to 0", id="filename_embd_weight_min_limit"),
            pytest.param({"filename_embd_weight": 1.1}, "Input should be less than or equal to 1", id="filename_embd_weight_max_limit"),
            pytest.param({"filename_embd_weight": "string"}, "Input should be a valid number, unable to parse string as a number", id="filename_embd_weight_type_invalid"),
            pytest.param({"task_page_size": 0}, "Input should be greater than or equal to 1", id="task_page_size_min_limit"),
            pytest.param({"task_page_size": 3.14}, "Input should be a valid integer, got a number with a fractional part", id="task_page_size_float_not_allowed"),
            pytest.param({"task_page_size": "string"}, "Input should be a valid integer, unable to parse string as an integer", id="task_page_size_type_invalid"),
            pytest.param({"pages": "1,2"}, "Input should be a valid list", id="pages_not_list"),
            pytest.param({"pages": ["1,2"]}, "Input should be a valid list", id="pages_not_list_in_list"),
            pytest.param({"pages": [["string1", "string2"]]}, "Input should be a valid integer, unable to parse string as an integer", id="pages_not_int_list"),
            pytest.param({"graphrag": {"use_graphrag": "string"}}, "Input should be a valid boolean, unable to interpret input", id="graphrag_type_invalid"),
            pytest.param({"graphrag": {"entity_types": "1,2"}}, "Input should be a valid list", id="graphrag_entity_types_not_list"),
            # Expected message restored to its full text (the leading "I" was missing).
            pytest.param({"graphrag": {"entity_types": [1, 2]}}, "Input should be a valid string", id="graphrag_entity_types_not_str_in_list"),
            pytest.param({"graphrag": {"method": "unknown"}}, "Input should be 'light' or 'general'", id="graphrag_method_unknown"),
            pytest.param({"graphrag": {"method": None}}, "Input should be 'light' or 'general'", id="graphrag_method_none"),
            pytest.param({"graphrag": {"community": "string"}}, "Input should be a valid boolean, unable to interpret input", id="graphrag_community_type_invalid"),
            pytest.param({"graphrag": {"resolution": "string"}}, "Input should be a valid boolean, unable to interpret input", id="graphrag_resolution_type_invalid"),
            pytest.param({"raptor": {"use_raptor": "string"}}, "Input should be a valid boolean, unable to interpret input", id="raptor_type_invalid"),
            pytest.param({"raptor": {"prompt": ""}}, "String should have at least 1 character", id="raptor_prompt_empty"),
            pytest.param({"raptor": {"prompt": " "}}, "String should have at least 1 character", id="raptor_prompt_space"),
            pytest.param({"raptor": {"max_token": 0}}, "Input should be greater than or equal to 1", id="raptor_max_token_min_limit"),
            pytest.param({"raptor": {"max_token": 2049}}, "Input should be less than or equal to 2048", id="raptor_max_token_max_limit"),
            pytest.param({"raptor": {"max_token": 3.14}}, "Input should be a valid integer, got a number with a fractional part", id="raptor_max_token_float_not_allowed"),
            pytest.param({"raptor": {"max_token": "string"}}, "Input should be a valid integer, unable to parse string as an integer", id="raptor_max_token_type_invalid"),
            pytest.param({"raptor": {"threshold": -0.1}}, "Input should be greater than or equal to 0", id="raptor_threshold_min_limit"),
            pytest.param({"raptor": {"threshold": 1.1}}, "Input should be less than or equal to 1", id="raptor_threshold_max_limit"),
            pytest.param({"raptor": {"threshold": "string"}}, "Input should be a valid number, unable to parse string as a number", id="raptor_threshold_type_invalid"),
            pytest.param({"raptor": {"max_cluster": 0}}, "Input should be greater than or equal to 1", id="raptor_max_cluster_min_limit"),
            pytest.param({"raptor": {"max_cluster": 1025}}, "Input should be less than or equal to 1024", id="raptor_max_cluster_max_limit"),
            # Expected message restored to its full text (the trailing "t" was missing).
            pytest.param({"raptor": {"max_cluster": 3.14}}, "Input should be a valid integer, got a number with a fractional part", id="raptor_max_cluster_float_not_allowed"),
            pytest.param({"raptor": {"max_cluster": "string"}}, "Input should be a valid integer, unable to parse string as an integer", id="raptor_max_cluster_type_invalid"),
            pytest.param({"raptor": {"random_seed": -1}}, "Input should be greater than or equal to 0", id="raptor_random_seed_min_limit"),
            pytest.param({"raptor": {"random_seed": 3.14}}, "Input should be a valid integer, got a number with a fractional part", id="raptor_random_seed_float_not_allowed"),
            pytest.param({"raptor": {"random_seed": "string"}}, "Input should be a valid integer, unable to parse string as an integer", id="raptor_random_seed_type_invalid"),
            # This case exercises the overall size limit, not a type error, so the id says so.
            pytest.param({"delimiter": "a" * 65536}, "Parser config exceeds size limit (max 65,535 characters)", id="parser_config_exceeds_size_limit"),
        ],
    )
    def test_parser_config_invalid(self, add_dataset_func, parser_config, expected_message):
        """Each invalid parser_config fragment must be rejected with the matching message."""
        dataset = add_dataset_func
        with pytest.raises(Exception) as excinfo:
            dataset.update({"parser_config": parser_config})
        assert expected_message in str(excinfo.value), str(excinfo.value)

    @pytest.mark.p2
    def test_parser_config_empty(self, client, add_dataset_func):
        """Updating with an empty parser_config must produce the config spelled out below."""
        expected_fields = {
            "chunk_token_num": 128,
            "delimiter": r"\n",
            "html4excel": False,
            "layout_recognize": "DeepDOC",
            "raptor": {"use_raptor": False},
        }
        # Configs are compared through their string form.
        expected = str(DataSet.ParserConfig(client, expected_fields))

        ds = add_dataset_func
        ds.update({"parser_config": {}})
        assert str(ds.parser_config) == expected, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert str(fetched.parser_config) == expected, str(fetched)

    @pytest.mark.p3
    def test_parser_config_none(self, client, add_dataset_func):
        """Updating with ``parser_config=None`` must produce the config spelled out below."""
        expected_fields = {
            "chunk_token_num": 128,
            "delimiter": r"\n",
            "html4excel": False,
            "layout_recognize": "DeepDOC",
            "raptor": {"use_raptor": False},
        }
        # Configs are compared through their string form.
        expected = str(DataSet.ParserConfig(client, expected_fields))

        ds = add_dataset_func
        ds.update({"parser_config": None})
        assert str(ds.parser_config) == expected, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert str(fetched.parser_config) == expected, str(fetched)

    @pytest.mark.p3
    def test_parser_config_empty_with_chunk_method_change(self, client, add_dataset_func):
        """Switching to "qa" with an empty parser_config must leave only the raptor section."""
        # Configs are compared through their string form.
        expected = str(DataSet.ParserConfig(client, {"raptor": {"use_raptor": False}}))

        ds = add_dataset_func
        ds.update({"chunk_method": "qa", "parser_config": {}})
        assert str(ds.parser_config) == expected, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert str(fetched.parser_config) == expected, str(fetched)

    @pytest.mark.p3
    def test_parser_config_unset_with_chunk_method_change(self, client, add_dataset_func):
        """Switching to "qa" without sending parser_config must leave only the raptor section."""
        # Configs are compared through their string form.
        expected = str(DataSet.ParserConfig(client, {"raptor": {"use_raptor": False}}))

        ds = add_dataset_func
        ds.update({"chunk_method": "qa"})
        assert str(ds.parser_config) == expected, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert str(fetched.parser_config) == expected, str(fetched)

    @pytest.mark.p3
    def test_parser_config_none_with_chunk_method_change(self, client, add_dataset_func):
        """Switching to "qa" with ``parser_config=None`` must leave only the raptor section."""
        # Configs are compared through their string form.
        expected = str(DataSet.ParserConfig(client, {"raptor": {"use_raptor": False}}))

        ds = add_dataset_func
        ds.update({"chunk_method": "qa", "parser_config": None})
        assert str(ds.parser_config) == expected, str(ds)

        fetched = client.get_dataset(name=ds.name)
        assert str(fetched.parser_config) == expected, str(fetched)

    @pytest.mark.p2
    @pytest.mark.parametrize(
        "payload",
        [
            # One single-key payload per field that the update call must refuse.
            {field: value}
            for field, value in (
                ("id", "id"),
                ("tenant_id", "e57c1966f99211efb41e9e45646e0111"),
                ("created_by", "created_by"),
                ("create_date", "Tue, 11 Mar 2025 13:37:23 GMT"),
                ("create_time", 1741671443322),
                ("update_date", "Tue, 11 Mar 2025 13:37:23 GMT"),
                ("update_time", 1741671443339),
                ("document_count", 1),
                ("chunk_count", 1),
                ("token_num", 1),
                ("status", "1"),
                ("unknown_field", "unknown_field"),
            )
        ],
    )
    def test_field_unsupported(self, add_dataset_func, payload):
        """Payloads carrying any of the listed fields must be rejected as extra input."""
        with pytest.raises(Exception) as excinfo:
            add_dataset_func.update(payload)
        message = str(excinfo.value)
        assert "Extra inputs are not permitted" in message, message

    @pytest.mark.p2
    def test_field_unset(self, client, add_dataset_func):
        """Renaming alone must leave every other compared field unchanged."""
        ds = add_dataset_func
        before = client.get_dataset(name=ds.name)

        ds.update({"name": "default_unset"})

        after = client.get_dataset(name="default_unset")
        for field in ("avatar", "description", "embedding_model", "permission", "chunk_method", "pagerank"):
            assert getattr(after, field) == getattr(before, field), str(after)
        # parser_config is compared through its string form.
        assert str(after.parser_config) == str(before.parser_config), str(after)