Browse Source

Fix: add the validation for parser_config. (#5755)

### What problem does this PR solve?

#5719

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.17.1
Kevin Hu 7 months ago
parent
commit
da3f279495
No account linked to committer's email address

+ 3
- 1
api/apps/sdk/dataset.py View File

token_required, token_required,
get_error_data_result, get_error_data_result,
valid, valid,
get_parser_config,
get_parser_config, valid_parser_config,
) )




permission = req.get("permission") permission = req.get("permission")
chunk_method = req.get("chunk_method") chunk_method = req.get("chunk_method")
parser_config = req.get("parser_config") parser_config = req.get("parser_config")
valid_parser_config(parser_config)
valid_permission = ["me", "team"] valid_permission = ["me", "team"]
valid_chunk_method = [ valid_chunk_method = [
"naive", "naive",
permission = req.get("permission") permission = req.get("permission")
chunk_method = req.get("chunk_method") chunk_method = req.get("chunk_method")
parser_config = req.get("parser_config") parser_config = req.get("parser_config")
valid_parser_config(parser_config)
valid_permission = ["me", "team"] valid_permission = ["me", "team"]
valid_chunk_method = [ valid_chunk_method = [
"naive", "naive",

+ 29
- 0
api/utils/api_utils.py View File

"picture": None} "picture": None}
parser_config = key_mapping[chunk_method] parser_config = key_mapping[chunk_method]
return parser_config return parser_config


# Keys a caller is allowed to put into ``parser_config``.
_PARSER_CONFIG_KEYS = frozenset({
    "chunk_token_num",
    "delimiter",
    "raptor",
    "graphrag",
    "layout_recognize",
    "task_page_size",
    "pages",
    "html4excel",
    "auto_keywords",
    "auto_questions",
    "tag_kb_ids",
    "topn_tags",
    "filename_embd_weight",
})

# (key, inclusive lower bound, EXCLUSIVE upper bound, error message).
# A missing key is treated as if it held the lower bound, i.e. it is valid.
_PARSER_CONFIG_INT_RANGES = (
    ("chunk_token_num", 1, 100000000,
     "chunk_token_num should be in range from 1 to 100000000"),
    ("task_page_size", 1, 100000000,
     "task_page_size should be in range from 1 to 100000000"),
    ("auto_keywords", 0, 32,
     "auto_keywords should be in range from 0 to 32"),
    ("auto_questions", 0, 10,
     "auto_questions should be in range from 0 to 10"),
    ("topn_tags", 0, 10,
     "topn_tags should be in range from 0 to 10"),
)


def valid_parser_config(parser_config):
    """Validate a ``parser_config`` mapping.

    A falsy ``parser_config`` (``None``, ``{}``, ...) is accepted without
    any checks.  Otherwise every key must be one of the known configuration
    keys, the numeric options must be integers inside their allowed
    half-open ranges, and ``html4excel`` must be a ``bool``.

    Args:
        parser_config: The mapping to validate, or a falsy value.

    Returns:
        None.  The function only signals problems by raising.

    Raises:
        AssertionError: If ``parser_config`` is not a mapping, contains an
            unknown key, or holds an out-of-range / wrongly typed value.
            The exception is raised explicitly (not via ``assert``) so the
            validation still runs when Python is started with ``-O``.
    """
    if not parser_config:
        return

    from collections.abc import Mapping

    if not isinstance(parser_config, Mapping):
        raise AssertionError(
            "Abnormal 'parser_config'. Expected a mapping, got "
            f"{type(parser_config).__name__}"
        )

    for k in parser_config:
        if k not in _PARSER_CONFIG_KEYS:
            raise AssertionError(f"Abnormal 'parser_config'. Invalid key: {k}")

    for key, low, high, message in _PARSER_CONFIG_INT_RANGES:
        value = parser_config.get(key, low)
        # The isinstance guard rejects floats such as 3.14 and turns the
        # TypeError a str/None comparison would raise into the regular
        # validation error.  bool values pass because bool subclasses int.
        if not isinstance(value, int) or not low <= value < high:
            raise AssertionError(message)

    if not isinstance(parser_config.get("html4excel", False), bool):
        raise AssertionError("html4excel should be True or False")

+ 11
- 15
sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py View File



@pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [ @pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [
("naive_default", "naive", ("naive_default", "naive",
{"chunk_token_count": 128,
{"chunk_token_num": 128,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"html4excel": False, "html4excel": False,
"delimiter": "\n!?。;!?", "delimiter": "\n!?。;!?",
}, },
0), 0),
("naive_empty", "naive", {}, 0), ("naive_empty", "naive", {}, 0),
pytest.param("naive_chunk_token_count_negative", "naive",
{"chunk_token_count": -1},
pytest.param("naive_chunk_token_num_negative", "naive",
{"chunk_token_num": -1},
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
pytest.param("naive_chunk_token_count_zero", "naive",
{"chunk_token_count": 0},
pytest.param("naive_chunk_token_num_zero", "naive",
{"chunk_token_num": 0},
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
pytest.param("naive_chunk_token_count_float", "naive",
{"chunk_token_count": 3.14},
pytest.param("naive_chunk_token_num_float", "naive",
{"chunk_token_num": 3.14},
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
pytest.param("naive_chunk_token_count_max", "naive",
{"chunk_token_count": 1024*1024*1024},
pytest.param("naive_chunk_token_num_max", "naive",
{"chunk_token_num": 1024*1024*1024},
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
pytest.param("naive_chunk_token_count_str", "naive",
{"chunk_token_count": '1024'},
pytest.param("naive_chunk_token_num_str", "naive",
{"chunk_token_num": '1024'},
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
("naive_layout_recognize_DeepDOC", "naive", ("naive_layout_recognize_DeepDOC", "naive",
{"layout_recognize": "DeepDOC"}, 0), {"layout_recognize": "DeepDOC"}, 0),
102, marks=pytest.mark.xfail(reason='issue#5719')), 102, marks=pytest.mark.xfail(reason='issue#5719')),
("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0), ("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0),
("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0), ("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0),
("knowledge_graph_entity_types_default", "knowledge_graph", {
"entity_types": ["organization", "person", "location", "event", "time"]}, 0),
pytest.param("knowledge_graph_entity_types_not_list", "knowledge_graph", {
"entity_types": "organization,person,location,event,time"}, 102, marks=pytest.mark.xfail(reason='issue#5719'))
]) ])
def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code): def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code):
payload = { payload = {

Loading…
Cancel
Save