|
|
|
@@ -13,39 +13,49 @@ |
|
|
|
# See the License for the specific language governing permissions and
# limitations under the License.
#

import base64
from pathlib import Path

import pytest
from common import DATASET_NAME_LIMIT, INVALID_API_TOKEN, create_dataset
from libs.auth import RAGFlowHttpApiAuth


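# Requests with a missing or invalid API key must be rejected with a descriptive error.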
class TestAuthorization:
    @pytest.mark.parametrize(
        "auth, expected_code, expected_message",
        [
            (None, 0, "`Authorization` can't be empty"),
            (
                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
                109,
                "Authentication error: API key is invalid!",
            ),
        ],
    )
    def test_invalid_auth(self, auth, expected_code, expected_message):
        res = create_dataset(auth, {"name": "auth_test"})
        assert res["code"] == expected_code
        assert res["message"] == expected_message


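# Dataset-name validation: a valid name, an over-length name, a non-string name, an empty name, and (case-insensitive) duplicates.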
class TestDatasetCreation:
    @pytest.mark.parametrize(
        "payload, expected_code",
        [
            ({"name": "valid_name"}, 0),
            ({"name": "a" * (DATASET_NAME_LIMIT + 1)}, 102),
            ({"name": 0}, 100),
            ({"name": ""}, 102),
            ({"name": "duplicated_name"}, 102),
            ({"name": "case_insensitive"}, 102),
        ],
    )
    def test_basic_scenarios(self, get_http_api_auth, payload, expected_code):
        if payload["name"] == "duplicated_name":
            create_dataset(get_http_api_auth, payload)
        elif payload["name"] == "case_insensitive":
            create_dataset(get_http_api_auth, {"name": payload["name"].upper()})

        res = create_dataset(get_http_api_auth, payload)

@@ -58,7 +68,7 @@ class TestDatasetCreation: |
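    # Bulk creation smoke test: 10,000 datasets in sequence (marked slow).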
    @pytest.mark.slow
    def test_dataset_10k(self, get_http_api_auth):
        for i in range(10_000):
            payload = {"name": f"dataset_{i}"}
            res = create_dataset(get_http_api_auth, payload)
            assert res["code"] == 0, f"Failed to create dataset {i}"

@@ -74,33 +84,33 @@ class TestAdvancedConfigurations: |
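        # Avatar support: encode_avatar (imported outside this hunk) presumably base64-encodes the bundled logo for the payload.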
        payload = {
            "name": "avatar_test",
            "avatar": encode_avatar(
                Path(request.config.rootdir) / "test/data/logo.svg"
            ),
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == 0

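    # A 65,536-character description should be accepted.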
    def test_description(self, get_http_api_auth):
        payload = {"name": "description_test", "description": "a" * 65536}
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == 0

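    # Permission must be lowercase "me" or "team"; empty-string handling is xfailed pending issue#5709.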
    @pytest.mark.parametrize(
        "name, permission, expected_code",
        [
            ("me", "me", 0),
            ("team", "team", 0),
            pytest.param(
                "empty_permission", "", 0, marks=pytest.mark.xfail(reason="issue#5709")
            ),
            ("me_uppercase", "ME", 102),
            ("team_uppercase", "TEAM", 102),
            ("other_permission", "other_permission", 102),
        ],
    )
    def test_permission(self, get_http_api_auth, name, permission, expected_code):
        payload = {"name": name, "permission": permission}
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0 and permission != "":

@@ -108,29 +118,33 @@ class TestAdvancedConfigurations: |
|
|
|
        if permission == "":
            assert res["data"]["permission"] == "me"

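    # Every documented chunk_method is accepted; unknown values are rejected, and the empty string is xfailed (issue#5709).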
    @pytest.mark.parametrize(
        "name, chunk_method, expected_code",
        [
            ("naive", "naive", 0),
            ("manual", "manual", 0),
            ("qa", "qa", 0),
            ("table", "table", 0),
            ("paper", "paper", 0),
            ("book", "book", 0),
            ("laws", "laws", 0),
            ("presentation", "presentation", 0),
            ("picture", "picture", 0),
            ("one", "one", 0),
            ("knowledge_graph", "knowledge_graph", 0),
            ("email", "email", 0),
            ("tag", "tag", 0),
            pytest.param(
                "empty_chunk_method",
                "",
                0,
                marks=pytest.mark.xfail(reason="issue#5709"),
            ),
            ("other_chunk_method", "other_chunk_method", 102),
        ],
    )
    def test_chunk_method(self, get_http_api_auth, name, chunk_method, expected_code):
        payload = {"name": name, "chunk_method": chunk_method}
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0 and chunk_method != "":

@@ -138,105 +152,172 @@ class TestAdvancedConfigurations: |
|
|
|
        if chunk_method == "":
            assert res["data"]["chunk_method"] == "naive"

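    # Supported embedding model identifiers are accepted as-is; an unknown model name is rejected.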
    @pytest.mark.parametrize(
        "name, embedding_model, expected_code",
        [
            ("BAAI/bge-large-zh-v1.5", "BAAI/bge-large-zh-v1.5", 0),
            ("BAAI/bge-base-en-v1.5", "BAAI/bge-base-en-v1.5", 0),
            ("BAAI/bge-large-en-v1.5", "BAAI/bge-large-en-v1.5", 0),
            ("BAAI/bge-small-en-v1.5", "BAAI/bge-small-en-v1.5", 0),
            ("BAAI/bge-small-zh-v1.5", "BAAI/bge-small-zh-v1.5", 0),
            (
                "jinaai/jina-embeddings-v2-base-en",
                "jinaai/jina-embeddings-v2-base-en",
                0,
            ),
            (
                "jinaai/jina-embeddings-v2-small-en",
                "jinaai/jina-embeddings-v2-small-en",
                0,
            ),
            ("nomic-ai/nomic-embed-text-v1.5", "nomic-ai/nomic-embed-text-v1.5", 0),
            (
                "sentence-transformers/all-MiniLM-L6-v2",
                "sentence-transformers/all-MiniLM-L6-v2",
                0,
            ),
            ("text-embedding-v2", "text-embedding-v2", 0),
            ("text-embedding-v3", "text-embedding-v3", 0),
            (
                "maidalun1020/bce-embedding-base_v1",
                "maidalun1020/bce-embedding-base_v1",
                0,
            ),
            ("other_embedding_model", "other_embedding_model", 102),
        ],
    )
    def test_embedding_model(
        self, get_http_api_auth, name, embedding_model, expected_code
    ):
        payload = {"name": name, "embedding_model": embedding_model}
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0:
            assert res["data"]["embedding_model"] == embedding_model

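    # parser_config for the "naive" chunk method: valid settings succeed, while out-of-range or
    # wrongly typed values are xfailed pending the validation fixes tracked in issue#5719.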
    @pytest.mark.parametrize(
        "name, chunk_method, parser_config, expected_code",
        [
            (
                "naive_default",
                "naive",
                {
                    "chunk_token_num": 128,
                    "layout_recognize": "DeepDOC",
                    "html4excel": False,
                    "delimiter": "\n!?。;!?",
                    "task_page_size": 12,
                    "raptor": {"use_raptor": False},
                },
                0,
            ),
            ("naive_empty", "naive", {}, 0),
            pytest.param(
                "naive_chunk_token_num_negative",
                "naive",
                {"chunk_token_num": -1},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_chunk_token_num_zero",
                "naive",
                {"chunk_token_num": 0},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_chunk_token_num_float",
                "naive",
                {"chunk_token_num": 3.14},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_chunk_token_num_max",
                "naive",
                {"chunk_token_num": 1024 * 1024 * 1024},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_chunk_token_num_str",
                "naive",
                {"chunk_token_num": "1024"},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            (
                "naive_layout_recognize_DeepDOC",
                "naive",
                {"layout_recognize": "DeepDOC"},
                0,
            ),
            ("naive_layout_recognize_Naive", "naive", {"layout_recognize": "Naive"}, 0),
            ("naive_html4excel_true", "naive", {"html4excel": True}, 0),
            ("naive_html4excel_false", "naive", {"html4excel": False}, 0),
            pytest.param(
                "naive_html4excel_not_bool",
                "naive",
                {"html4excel": 1},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            ("naive_delimiter_empty", "naive", {"delimiter": ""}, 0),
            ("naive_delimiter_backticks", "naive", {"delimiter": "`##`"}, 0),
            pytest.param(
                "naive_delimiter_not_str",
                "naive",
                {"delimiter": 1},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_task_page_size_negative",
                "naive",
                {"task_page_size": -1},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_task_page_size_zero",
                "naive",
                {"task_page_size": 0},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_task_page_size_float",
                "naive",
                {"task_page_size": 3.14},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_task_page_size_max",
                "naive",
                {"task_page_size": 1024 * 1024 * 1024},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            pytest.param(
                "naive_task_page_size_str",
                "naive",
                {"task_page_size": "1024"},
                102,
                marks=pytest.mark.xfail(reason="issue#5719"),
            ),
            ("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0),
            ("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0),
        ],
    )
    def test_parser_configs(
        self, get_http_api_auth, name, chunk_method, parser_config, expected_code
    ):
        payload = {
            "name": name,
            "chunk_method": chunk_method,
            "parser_config": parser_config,
        }
        res = create_dataset(get_http_api_auth, payload)
        # print(res)

@@ -245,8 +326,10 @@ class TestAdvancedConfigurations: |
|
|
|
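            # The echoed parser_config must match what was sent; an empty config falls back to the server defaults asserted below.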
            for k, v in parser_config.items():
                assert res["data"]["parser_config"][k] == v
        if parser_config == {}:
            assert res["data"]["parser_config"] == {
                "chunk_token_num": 128,
                "delimiter": "\\n!?;。;!?",
                "html4excel": False,
                "layout_recognize": "DeepDOC",
                "raptor": {"use_raptor": False},
            }