Просмотр исходного кода

Fix(api): correct default value handling in dataset parser config (#7589)

### What problem does this PR solve?

Fix incorrect default-value handling for `parser_config` in the HTTP API dataset Create/Update endpoints.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.19.0
liu an 5 месяцев назад
Родитель
Сommit
f8cc557892
Аккаунт пользователя с таким Email не найден

+ 5
- 2
api/apps/sdk/dataset.py Просмотреть файл

if req.get("parser_config"): if req.get("parser_config"):
req["parser_config"] = deep_merge(kb.parser_config, req["parser_config"]) req["parser_config"] = deep_merge(kb.parser_config, req["parser_config"])


if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id and req.get("parser_config") is None:
req["parser_config"] = get_parser_config(chunk_method, None)
if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id:
if not req.get("parser_config"):
req["parser_config"] = get_parser_config(chunk_method, None)
elif "parser_config" in req and not req["parser_config"]:
del req["parser_config"]


if "name" in req and req["name"].lower() != kb.name.lower(): if "name" in req and req["name"].lower() != kb.name.lower():
try: try:

+ 34
- 4
api/utils/validation_utils.py Просмотреть файл

permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)] permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)]
chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")] chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")]
pagerank: int = Field(default=0, ge=0, le=100) pagerank: int = Field(default=0, ge=0, le=100)
parser_config: ParserConfig = Field(default_factory=dict)
parser_config: ParserConfig | None = Field(default=None)


@field_validator("avatar") @field_validator("avatar")
@classmethod @classmethod
""" """
return v.lower() if isinstance(v, str) else v return v.lower() if isinstance(v, str) else v


@field_validator("parser_config", mode="before")
@classmethod
def normalize_empty_parser_config(cls, v: Any) -> Any:
"""
Normalizes empty parser configuration by converting empty dictionaries to None.

This validator ensures consistent handling of empty parser configurations across
the application by converting empty dicts to None values.

Args:
v (Any): Raw input value for the parser config field

Returns:
Any: Returns None if input is an empty dict, otherwise returns the original value

Example:
>>> normalize_empty_parser_config({})
None

>>> normalize_empty_parser_config({"key": "value"})
{"key": "value"}
"""
if v == {}:
return None
return v

@field_validator("parser_config", mode="after") @field_validator("parser_config", mode="after")
@classmethod @classmethod
def validate_parser_config_json_length(cls, v: ParserConfig) -> ParserConfig:
def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None:
""" """
Validates serialized JSON length constraints for parser configuration. Validates serialized JSON length constraints for parser configuration.


Implements a two-stage validation workflow: Implements a two-stage validation workflow:
1. Model serialization - convert Pydantic model to JSON string
2. Size verification - enforce maximum allowed payload size
1. Null check - bypass validation for empty configurations
2. Model serialization - convert Pydantic model to JSON string
3. Size verification - enforce maximum allowed payload size


Args: Args:
v (ParserConfig | None): Raw parser configuration object v (ParserConfig | None): Raw parser configuration object
Raises: Raises:
ValueError: When serialized JSON exceeds 65,535 characters ValueError: When serialized JSON exceeds 65,535 characters
""" """
if v is None:
return None

if (json_str := v.model_dump_json()) and len(json_str) > 65535: if (json_str := v.model_dump_json()) and len(json_str) > 65535:
raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}") raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}")
return v return v

+ 28
- 30
sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py Просмотреть файл

def test_avatar(self, get_http_api_auth, tmp_path): def test_avatar(self, get_http_api_auth, tmp_path):
fn = create_image_file(tmp_path / "ragflow_test.png") fn = create_image_file(tmp_path / "ragflow_test.png")
payload = { payload = {
"name": "avatar_test",
"name": "avatar",
"avatar": f"data:image/png;base64,{encode_avatar(fn)}", "avatar": f"data:image/png;base64,{encode_avatar(fn)}",
} }
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)


@pytest.mark.p2 @pytest.mark.p2
def test_avatar_exceeds_limit_length(self, get_http_api_auth): def test_avatar_exceeds_limit_length(self, get_http_api_auth):
payload = {"name": "exceeds_limit_length_avatar", "avatar": "a" * 65536}
payload = {"name": "avatar_exceeds_limit_length", "avatar": "a" * 65536}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "String should have at most 65535 characters" in res["message"], res assert "String should have at most 65535 characters" in res["message"], res


@pytest.mark.p3 @pytest.mark.p3
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name, avatar_prefix, expected_message",
"name, prefix, expected_message",
[ [
("empty_prefix", "", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"), ("empty_prefix", "", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"), ("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
], ],
ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"], ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"],
) )
def test_avatar_invalid_prefix(self, get_http_api_auth, tmp_path, name, avatar_prefix, expected_message):
def test_avatar_invalid_prefix(self, get_http_api_auth, tmp_path, name, prefix, expected_message):
fn = create_image_file(tmp_path / "ragflow_test.png") fn = create_image_file(tmp_path / "ragflow_test.png")
payload = { payload = {
"name": name, "name": name,
"avatar": f"{avatar_prefix}{encode_avatar(fn)}",
"avatar": f"{prefix}{encode_avatar(fn)}",
} }
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res assert res["code"] == 101, res


@pytest.mark.p3 @pytest.mark.p3
def test_avatar_unset(self, get_http_api_auth): def test_avatar_unset(self, get_http_api_auth):
payload = {"name": "test_avatar_unset"}
payload = {"name": "avatar_unset"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["avatar"] is None, res assert res["data"]["avatar"] is None, res


@pytest.mark.p3 @pytest.mark.p3
def test_avatar_none(self, get_http_api_auth): def test_avatar_none(self, get_http_api_auth):
payload = {"name": "test_avatar_none", "avatar": None}
payload = {"name": "avatar_none", "avatar": None}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["avatar"] is None, res assert res["data"]["avatar"] is None, res


@pytest.mark.p2 @pytest.mark.p2
def test_description(self, get_http_api_auth): def test_description(self, get_http_api_auth):
payload = {"name": "test_description", "description": "description"}
payload = {"name": "description", "description": "description"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["description"] == "description", res assert res["data"]["description"] == "description", res


@pytest.mark.p2 @pytest.mark.p2
def test_description_exceeds_limit_length(self, get_http_api_auth): def test_description_exceeds_limit_length(self, get_http_api_auth):
payload = {"name": "exceeds_limit_length_description", "description": "a" * 65536}
payload = {"name": "description_exceeds_limit_length", "description": "a" * 65536}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "String should have at most 65535 characters" in res["message"], res assert "String should have at most 65535 characters" in res["message"], res


@pytest.mark.p3 @pytest.mark.p3
def test_description_unset(self, get_http_api_auth): def test_description_unset(self, get_http_api_auth):
payload = {"name": "test_description_unset"}
payload = {"name": "description_unset"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["description"] is None, res assert res["data"]["description"] is None, res


@pytest.mark.p3 @pytest.mark.p3
def test_description_none(self, get_http_api_auth): def test_description_none(self, get_http_api_auth):
payload = {"name": "test_description_none", "description": None}
payload = {"name": "description_none", "description": None}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["description"] is None, res assert res["data"]["description"] is None, res


@pytest.mark.p2 @pytest.mark.p2
def test_embedding_model_none(self, get_http_api_auth): def test_embedding_model_none(self, get_http_api_auth):
payload = {"name": "test_embedding_model_none", "embedding_model": None}
payload = {"name": "embedding_model_none", "embedding_model": None}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "Input should be a valid string" in res["message"], res assert "Input should be a valid string" in res["message"], res


@pytest.mark.p2 @pytest.mark.p2
def test_permission_unset(self, get_http_api_auth): def test_permission_unset(self, get_http_api_auth):
payload = {"name": "test_permission_unset"}
payload = {"name": "permission_unset"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["permission"] == "me", res assert res["data"]["permission"] == "me", res


@pytest.mark.p3 @pytest.mark.p3
def test_permission_none(self, get_http_api_auth): def test_permission_none(self, get_http_api_auth):
payload = {"name": "test_permission_none", "permission": None}
payload = {"name": "permission_none", "permission": None}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "Input should be 'me' or 'team'" in res["message"], res assert "Input should be 'me' or 'team'" in res["message"], res


@pytest.mark.p2 @pytest.mark.p2
def test_chunk_method_unset(self, get_http_api_auth): def test_chunk_method_unset(self, get_http_api_auth):
payload = {"name": "test_chunk_method_unset"}
payload = {"name": "chunk_method_unset"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["chunk_method"] == "naive", res assert res["data"]["chunk_method"] == "naive", res


@pytest.mark.p2 @pytest.mark.p2
def test_parser_config_empty(self, get_http_api_auth): def test_parser_config_empty(self, get_http_api_auth):
payload = {"name": "default_empty", "parser_config": {}}
payload = {"name": "parser_config_empty", "parser_config": {}}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {
"auto_keywords": 0,
"auto_questions": 0,
"chunk_token_num": 128, "chunk_token_num": 128,
"delimiter": r"\n", "delimiter": r"\n",
"filename_embd_weight": None,
"graphrag": None,
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"pages": None,
"raptor": None,
"tag_kb_ids": [],
"task_page_size": None,
"topn_tags": 1,
}
"raptor": {"use_raptor": False},
}, res


@pytest.mark.p2 @pytest.mark.p2
def test_parser_config_unset(self, get_http_api_auth): def test_parser_config_unset(self, get_http_api_auth):
payload = {"name": "default_unset"}
payload = {"name": "parser_config_unset"}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {


@pytest.mark.p3 @pytest.mark.p3
def test_parser_config_none(self, get_http_api_auth): def test_parser_config_none(self, get_http_api_auth):
payload = {"name": "default_none", "parser_config": None}
payload = {"name": "parser_config_none", "parser_config": None}
res = create_dataset(get_http_api_auth, payload) res = create_dataset(get_http_api_auth, payload)
assert res["code"] == 101, res
assert "Input should be a valid dictionary or instance of ParserConfig" in res["message"], res
assert res["code"] == 0, res
assert res["data"]["parser_config"] == {
"chunk_token_num": 128,
"delimiter": "\\n",
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
}, res


@pytest.mark.p2 @pytest.mark.p2
@pytest.mark.parametrize( @pytest.mark.parametrize(

+ 97
- 40
sdk/python/test/test_http_api/test_dataset_mangement/test_update_dataset.py Просмотреть файл

class TestDatasetUpdate: class TestDatasetUpdate:
@pytest.mark.p3 @pytest.mark.p3
def test_dataset_id_not_uuid(self, get_http_api_auth): def test_dataset_id_not_uuid(self, get_http_api_auth):
payload = {"name": "dataset_id_not_uuid"}
payload = {"name": "not_uuid"}
res = update_dataset(get_http_api_auth, "not_uuid", payload) res = update_dataset(get_http_api_auth, "not_uuid", payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "Input should be a valid UUID" in res["message"], res assert "Input should be a valid UUID" in res["message"], res


@pytest.mark.p3 @pytest.mark.p3
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name, avatar_prefix, expected_message",
"avatar_prefix, expected_message",
[ [
("empty_prefix", "", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
("unsupported_mine_type", "invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"),
("invalid_mine_type", "data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"),
("", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
("data:image/png;base64", "Missing MIME prefix. Expected format: data:<mime>;base64,<data>"),
("invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"),
("data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"),
], ],
ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"], ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"],
) )
def test_avatar_invalid_prefix(self, get_http_api_auth, add_dataset_func, tmp_path, name, avatar_prefix, expected_message):
def test_avatar_invalid_prefix(self, get_http_api_auth, add_dataset_func, tmp_path, avatar_prefix, expected_message):
dataset_id = add_dataset_func dataset_id = add_dataset_func
fn = create_image_file(tmp_path / "ragflow_test.png") fn = create_image_file(tmp_path / "ragflow_test.png")
payload = {
"name": name,
"avatar": f"{avatar_prefix}{encode_avatar(fn)}",
}
payload = {"avatar": f"{avatar_prefix}{encode_avatar(fn)}"}
res = update_dataset(get_http_api_auth, dataset_id, payload) res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert expected_message in res["message"], res assert expected_message in res["message"], res


@pytest.mark.p1 @pytest.mark.p1
@pytest.mark.parametrize( @pytest.mark.parametrize(
"name, permission",
"permission",
[ [
("me", "me"),
("team", "team"),
("me_upercase", "ME"),
("team_upercase", "TEAM"),
"me",
"team",
"ME",
"TEAM",
], ],
ids=["me", "team", "me_upercase", "team_upercase"], ids=["me", "team", "me_upercase", "team_upercase"],
) )
def test_permission(self, get_http_api_auth, add_dataset_func, name, permission):
def test_permission(self, get_http_api_auth, add_dataset_func, permission):
dataset_id = add_dataset_func dataset_id = add_dataset_func
payload = {"name": name, "permission": permission}
payload = {"permission": permission}
res = update_dataset(get_http_api_auth, dataset_id, payload) res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 0, res assert res["code"] == 0, res


@pytest.mark.p3 @pytest.mark.p3
def test_permission_none(self, get_http_api_auth, add_dataset_func): def test_permission_none(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func dataset_id = add_dataset_func
payload = {"name": "test_permission_none", "permission": None}
payload = {"permission": None}
res = update_dataset(get_http_api_auth, dataset_id, payload) res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "Input should be 'me' or 'team'" in res["message"], res assert "Input should be 'me' or 'team'" in res["message"], res


res = list_datasets(get_http_api_auth) res = list_datasets(get_http_api_auth)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == {}

# @pytest.mark.p2
# def test_parser_config_unset(self, get_http_api_auth, add_dataset_func):
# dataset_id = add_dataset_func
# payload = {"name": "default_unset"}
# res = update_dataset(get_http_api_auth, dataset_id, payload)
# assert res["code"] == 0, res

# res = list_datasets(get_http_api_auth)
# assert res["code"] == 0, res
# assert res["data"][0]["parser_config"] == {
# "chunk_token_num": 128,
# "delimiter": r"\n",
# "html4excel": False,
# "layout_recognize": "DeepDOC",
# "raptor": {"use_raptor": False},
# }, res
assert res["data"][0]["parser_config"] == {
"chunk_token_num": 128,
"delimiter": r"\n",
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
}, res


@pytest.mark.p3 @pytest.mark.p3
def test_parser_config_none(self, get_http_api_auth, add_dataset_func): def test_parser_config_none(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func dataset_id = add_dataset_func
payload = {"parser_config": None} payload = {"parser_config": None}
res = update_dataset(get_http_api_auth, dataset_id, payload) res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 101, res
assert "Input should be a valid dictionary or instance of ParserConfig" in res["message"], res
assert res["code"] == 0, res

res = list_datasets(get_http_api_auth, {"id": dataset_id})
assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == {
"chunk_token_num": 128,
"delimiter": r"\n",
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
}, res

@pytest.mark.p3
def test_parser_config_empty_with_chunk_method_change(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func
payload = {"chunk_method": "qa", "parser_config": {}}
res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 0, res

res = list_datasets(get_http_api_auth)
print(res)
assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res

@pytest.mark.p3
def test_parser_config_unset_with_chunk_method_change(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func
payload = {"chunk_method": "qa"}
res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 0, res

res = list_datasets(get_http_api_auth)
assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res

@pytest.mark.p3
def test_parser_config_none_with_chunk_method_change(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func
payload = {"chunk_method": "qa", "parser_config": None}
res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 0, res

res = list_datasets(get_http_api_auth, {"id": dataset_id})
print(res)
assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res


@pytest.mark.p2 @pytest.mark.p2
@pytest.mark.parametrize( @pytest.mark.parametrize(
{"unknown_field": "unknown_field"}, {"unknown_field": "unknown_field"},
], ],
) )
def test_unsupported_field(self, get_http_api_auth, add_dataset_func, payload):
def test_field_unsupported(self, get_http_api_auth, add_dataset_func, payload):
dataset_id = add_dataset_func dataset_id = add_dataset_func
res = update_dataset(get_http_api_auth, dataset_id, payload) res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 101, res assert res["code"] == 101, res
assert "Extra inputs are not permitted" in res["message"], res assert "Extra inputs are not permitted" in res["message"], res

@pytest.mark.p2
def test_field_unset(self, get_http_api_auth, add_dataset_func):
dataset_id = add_dataset_func
res = list_datasets(get_http_api_auth)
assert res["code"] == 0, res
original_data = res["data"][0]

payload = {"name": "default_unset"}
res = update_dataset(get_http_api_auth, dataset_id, payload)
assert res["code"] == 0, res

res = list_datasets(get_http_api_auth)
assert res["code"] == 0, res
assert res["data"][0]["avatar"] == original_data["avatar"], res
assert res["data"][0]["description"] == original_data["description"], res
assert res["data"][0]["embedding_model"] == original_data["embedding_model"], res
assert res["data"][0]["permission"] == original_data["permission"], res
assert res["data"][0]["chunk_method"] == original_data["chunk_method"], res
assert res["data"][0]["pagerank"] == original_data["pagerank"], res
assert res["data"][0]["parser_config"] == {
"chunk_token_num": 128,
"delimiter": r"\n",
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
}, res

Загрузка…
Отмена
Сохранить