|
|
|
|
|
|
|
|
assert res["data"]["embedding_model"] == embedding_model |
|
|
assert res["data"]["embedding_model"] == embedding_model |
|
|
|
|
|
|
|
|
@pytest.mark.parametrize( |
|
|
@pytest.mark.parametrize( |
|
|
"name, chunk_method, parser_config, expected_code", |
|
|
|
|
|
|
|
|
"name, chunk_method, parser_config, expected_code, expected_message", |
|
|
[ |
|
|
[ |
|
|
( |
|
|
( |
|
|
"naive_default", |
|
|
"naive_default", |
|
|
|
|
|
|
|
|
"raptor": {"use_raptor": False}, |
|
|
"raptor": {"use_raptor": False}, |
|
|
}, |
|
|
}, |
|
|
0, |
|
|
0, |
|
|
|
|
|
"", |
|
|
), |
|
|
), |
|
|
("naive_empty", "naive", {}, 0), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
|
|
|
("naive_empty", "naive", {}, 0, ""), |
|
|
|
|
|
( |
|
|
"naive_chunk_token_num_negative", |
|
|
"naive_chunk_token_num_negative", |
|
|
"naive", |
|
|
"naive", |
|
|
{"chunk_token_num": -1}, |
|
|
{"chunk_token_num": -1}, |
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('chunk_token_num should be in range from 1 to 100000000')", |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
|
|
|
|
|
|
( |
|
|
"naive_chunk_token_num_zero", |
|
|
"naive_chunk_token_num_zero", |
|
|
"naive", |
|
|
"naive", |
|
|
{"chunk_token_num": 0}, |
|
|
{"chunk_token_num": 0}, |
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('chunk_token_num should be in range from 1 to 100000000')", |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
|
|
|
"naive_chunk_token_num_float", |
|
|
|
|
|
|
|
|
( |
|
|
|
|
|
"naive_chunk_token_num_max", |
|
|
"naive", |
|
|
"naive", |
|
|
{"chunk_token_num": 3.14}, |
|
|
|
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
{"chunk_token_num": 100000000}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('chunk_token_num should be in range from 1 to 100000000')", |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_chunk_token_num_max", |
|
|
|
|
|
|
|
|
"naive_chunk_token_num_float", |
|
|
"naive", |
|
|
"naive", |
|
|
{"chunk_token_num": 1024 * 1024 * 1024}, |
|
|
|
|
|
|
|
|
{"chunk_token_num": 3.14}, |
|
|
102, |
|
|
102, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_chunk_token_num_str", |
|
|
"naive_chunk_token_num_str", |
|
|
"naive", |
|
|
"naive", |
|
|
{"chunk_token_num": "1024"}, |
|
|
{"chunk_token_num": "1024"}, |
|
|
102, |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
( |
|
|
( |
|
|
|
|
|
|
|
|
"naive", |
|
|
"naive", |
|
|
{"layout_recognize": "DeepDOC"}, |
|
|
{"layout_recognize": "DeepDOC"}, |
|
|
0, |
|
|
0, |
|
|
|
|
|
"", |
|
|
), |
|
|
), |
|
|
("naive_layout_recognize_Naive", "naive", {"layout_recognize": "Naive"}, 0), |
|
|
|
|
|
("naive_html4excel_true", "naive", {"html4excel": True}, 0), |
|
|
|
|
|
("naive_html4excel_false", "naive", {"html4excel": False}, 0), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
|
|
|
( |
|
|
|
|
|
"naive_layout_recognize_Naive", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"layout_recognize": "Naive"}, |
|
|
|
|
|
0, |
|
|
|
|
|
"", |
|
|
|
|
|
), |
|
|
|
|
|
("naive_html4excel_true", "naive", {"html4excel": True}, 0, ""), |
|
|
|
|
|
("naive_html4excel_false", "naive", {"html4excel": False}, 0, ""), |
|
|
|
|
|
( |
|
|
"naive_html4excel_not_bool", |
|
|
"naive_html4excel_not_bool", |
|
|
"naive", |
|
|
"naive", |
|
|
{"html4excel": 1}, |
|
|
{"html4excel": 1}, |
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('html4excel should be True or False')", |
|
|
), |
|
|
), |
|
|
("naive_delimiter_empty", "naive", {"delimiter": ""}, 0), |
|
|
|
|
|
("naive_delimiter_backticks", "naive", {"delimiter": "`##`"}, 0), |
|
|
|
|
|
|
|
|
("naive_delimiter_empty", "naive", {"delimiter": ""}, 0, ""), |
|
|
|
|
|
("naive_delimiter_backticks", "naive", {"delimiter": "`##`"}, 0, ""), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_delimiterl_not_str", |
|
|
|
|
|
|
|
|
"naive_delimiter_not_str", |
|
|
"naive", |
|
|
"naive", |
|
|
{"delimiterl": 1}, |
|
|
|
|
|
102, |
|
|
|
|
|
|
|
|
{"delimiter": 1}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
|
|
|
|
|
|
( |
|
|
"naive_task_page_size_negative", |
|
|
"naive_task_page_size_negative", |
|
|
"naive", |
|
|
"naive", |
|
|
{"task_page_size": -1}, |
|
|
{"task_page_size": -1}, |
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('task_page_size should be in range from 1 to 100000000')", |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
|
|
|
|
|
|
( |
|
|
"naive_task_page_size_zero", |
|
|
"naive_task_page_size_zero", |
|
|
"naive", |
|
|
"naive", |
|
|
{"task_page_size": 0}, |
|
|
{"task_page_size": 0}, |
|
|
102, |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('task_page_size should be in range from 1 to 100000000')", |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_task_page_size_max", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"task_page_size": 100000000}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('task_page_size should be in range from 1 to 100000000')", |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_task_page_size_float", |
|
|
"naive_task_page_size_float", |
|
|
"naive", |
|
|
"naive", |
|
|
{"task_page_size": 3.14}, |
|
|
{"task_page_size": 3.14}, |
|
|
102, |
|
|
|
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_task_page_size_max", |
|
|
|
|
|
|
|
|
"naive_task_page_size_str", |
|
|
"naive", |
|
|
"naive", |
|
|
{"task_page_size": 1024 * 1024 * 1024}, |
|
|
|
|
|
102, |
|
|
|
|
|
|
|
|
{"task_page_size": "1024"}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
|
|
|
("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0, ""), |
|
|
|
|
|
("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0, ""), |
|
|
|
|
|
( |
|
|
|
|
|
"invalid_key", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"invalid_key": "invalid_value"}, |
|
|
|
|
|
100, |
|
|
|
|
|
"""AssertionError("Abnormal \'parser_config\'. Invalid key: invalid_key")""", |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_auto_keywords_negative", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_keywords": -1}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('auto_keywords should be in range from 0 to 32')", |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_auto_keywords_max", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_keywords": 32}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('auto_keywords should be in range from 0 to 32')", |
|
|
|
|
|
), |
|
|
pytest.param( |
|
|
pytest.param( |
|
|
"naive_task_page_size_str", |
|
|
|
|
|
|
|
|
"naive_auto_keywords_float", |
|
|
"naive", |
|
|
"naive", |
|
|
{"task_page_size": "1024"}, |
|
|
|
|
|
102, |
|
|
|
|
|
|
|
|
{"auto_questions": 3.14}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
"naive_auto_keywords_str", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_keywords": "1024"}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_auto_questions_negative", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_questions": -1}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('auto_questions should be in range from 0 to 10')", |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_auto_questions_max", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_questions": 10}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('auto_questions should be in range from 0 to 10')", |
|
|
|
|
|
), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
"naive_auto_questions_float", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_questions": 3.14}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
"naive_auto_questions_str", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"auto_questions": "1024"}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_topn_tags_negative", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"topn_tags": -1}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('topn_tags should be in range from 0 to 10')", |
|
|
|
|
|
), |
|
|
|
|
|
( |
|
|
|
|
|
"naive_topn_tags_max", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"topn_tags": 10}, |
|
|
|
|
|
100, |
|
|
|
|
|
"AssertionError('topn_tags should be in range from 0 to 10')", |
|
|
|
|
|
), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
"naive_topn_tags_float", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"topn_tags": 3.14}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
|
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
|
|
|
), |
|
|
|
|
|
pytest.param( |
|
|
|
|
|
"naive_topn_tags_str", |
|
|
|
|
|
"naive", |
|
|
|
|
|
{"topn_tags": "1024"}, |
|
|
|
|
|
100, |
|
|
|
|
|
"", |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
marks=pytest.mark.xfail(reason="issue#5719"), |
|
|
), |
|
|
), |
|
|
("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0), |
|
|
|
|
|
("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0), |
|
|
|
|
|
], |
|
|
], |
|
|
) |
|
|
) |
|
|
def test_parser_configs( |
|
|
def test_parser_configs( |
|
|
self, get_http_api_auth, name, chunk_method, parser_config, expected_code |
|
|
|
|
|
|
|
|
self, |
|
|
|
|
|
get_http_api_auth, |
|
|
|
|
|
name, |
|
|
|
|
|
chunk_method, |
|
|
|
|
|
parser_config, |
|
|
|
|
|
expected_code, |
|
|
|
|
|
expected_message, |
|
|
): |
|
|
): |
|
|
payload = { |
|
|
payload = { |
|
|
"name": name, |
|
|
"name": name, |
|
|
|
|
|
|
|
|
"parser_config": parser_config, |
|
|
"parser_config": parser_config, |
|
|
} |
|
|
} |
|
|
res = create_dataset(get_http_api_auth, payload) |
|
|
res = create_dataset(get_http_api_auth, payload) |
|
|
# print(res) |
|
|
|
|
|
assert res["code"] == expected_code |
|
|
assert res["code"] == expected_code |
|
|
if expected_code == 0 and parser_config != {}: |
|
|
if expected_code == 0 and parser_config != {}: |
|
|
for k, v in parser_config.items(): |
|
|
for k, v in parser_config.items(): |
|
|
assert res["data"]["parser_config"][k] == v |
|
|
assert res["data"]["parser_config"][k] == v |
|
|
|
|
|
if expected_code != 0 or expected_message: |
|
|
|
|
|
assert res["message"] == expected_message |
|
|
if parser_config == {}: |
|
|
if parser_config == {}: |
|
|
assert res["data"]["parser_config"] == { |
|
|
assert res["data"]["parser_config"] == { |
|
|
"chunk_token_num": 128, |
|
|
"chunk_token_num": 128, |