### What problem does this PR solve?

- Update `get_parser_config` to merge provided configs with defaults (a sketch of the merge semantics follows below)
- Add GraphRAG configuration defaults for all chunk methods
- Make the raptor and graphrag fields non-nullable in the ParserConfig schema
- Update the related test cases to reflect the config changes
- Ensure backward compatibility while adding the new GraphRAG support
- #8396

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
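To make the review easier to follow, here is a minimal sketch of the intended merge semantics (see the `get_parser_config` hunk below). The `deep_merge` helper it calls is not shown in this diff; the behavior sketched here — a recursive dict merge in which caller-supplied values override the chunk-method defaults — is an assumption based on how the new tests expect user values to be preserved.

```python
# Hypothetical stand-in for the deep_merge helper used by get_parser_config;
# the real implementation lives elsewhere in the codebase.
# Assumed semantics: recurse into nested dicts, values from `override` win over `base`.
def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged


# A subset of the "naive" defaults from the diff, merged with a partial user config:
defaults = {"chunk_token_num": 512, "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}
user = {"chunk_token_num": 1024, "raptor": {"use_raptor": True}}
assert deep_merge(defaults, user) == {
    "chunk_token_num": 1024,              # user value preserved
    "raptor": {"use_raptor": True},       # user value preserved
    "graphrag": {"use_graphrag": False},  # default filled in
}
```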
@@ -351,28 +351,47 @@ def generate_confirmation_token(tenant_id):
 def get_parser_config(chunk_method, parser_config):
-    if parser_config:
-        return parser_config
     if not chunk_method:
         chunk_method = "naive"
+    # Define default configurations for each chunk method
     key_mapping = {
-        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
-        "qa": {"raptor": {"use_raptor": False}},
+        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "tag": None,
         "resume": None,
-        "manual": {"raptor": {"use_raptor": False}},
+        "manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "table": None,
-        "paper": {"raptor": {"use_raptor": False}},
-        "book": {"raptor": {"use_raptor": False}},
-        "laws": {"raptor": {"use_raptor": False}},
-        "presentation": {"raptor": {"use_raptor": False}},
+        "paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
+        "presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
         "one": None,
-        "knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
+        "knowledge_graph": {
+            "chunk_token_num": 8192,
+            "delimiter": r"\n",
+            "entity_types": ["organization", "person", "location", "event", "time"],
+            "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
+        },
         "email": None,
         "picture": None,
     }
-    parser_config = key_mapping[chunk_method]
-    return parser_config
+    default_config = key_mapping[chunk_method]
+    # If no parser_config provided, return default
+    if not parser_config:
+        return default_config
+    # If parser_config is provided, merge with defaults to ensure required fields exist
+    if default_config is None:
+        return parser_config
+    # Ensure raptor and graphrag fields have default values if not provided
+    merged_config = deep_merge(default_config, parser_config)
+    return merged_config

 def get_data_openai(
@@ -602,17 +621,14 @@ def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, s
 TimeoutException = Union[Type[BaseException], BaseException]
 OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]

-def timeout(
-    seconds: float |int = None,
-    attempts: int = 2,
-    *,
-    exception: Optional[TimeoutException] = None,
-    on_timeout: Optional[OnTimeoutCallback] = None
-):
+def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
     def decorator(func):
         @wraps(func)
         def wrapper(*args, **kwargs):
             result_queue = queue.Queue(maxsize=1)

             def target():
                 try:
                     result = func(*args, **kwargs)
@@ -644,7 +660,7 @@ def timeout(
                     with trio.fail_after(seconds):
                         return await func(*args, **kwargs)
                 except trio.TooSlowError:
-                    if a < attempts -1:
+                    if a < attempts - 1:
                         continue
                     if on_timeout is not None:
                         if callable(on_timeout):
@@ -668,11 +684,11 @@ def timeout(
         if asyncio.iscoroutinefunction(func):
             return async_wrapper
         return wrapper

     return decorator


 async def is_strong_enough(chat_model, embedding_model):
     @timeout(30, 2)
     async def _is_strong_enough():
         nonlocal chat_model, embedding_model
@@ -681,11 +697,11 @@ async def is_strong_enough(chat_model, embedding_model):
                 _ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
         if chat_model:
             with trio.fail_after(30):
-                res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role":"user", "content": "Are you strong enough!?"}], {}))
+                res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
                 if res.find("**ERROR**") >= 0:
                     raise Exception(res)

     # Pressure test for GraphRAG task
     async with trio.open_nursery() as nursery:
         for _ in range(32):
             nursery.start_soon(_is_strong_enough)
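For context on the `@timeout(30, 2)` usage in `is_strong_enough` above, here is a small standalone sketch of the retry-on-timeout pattern for the async path. It is a simplified re-statement so the example runs on its own with only `trio`; the repo's `timeout` decorator additionally handles sync callables, a custom `exception`, and an `on_timeout` callback. The `slow_probe` coroutine and the 1-second budget are hypothetical.

```python
from functools import wraps

import trio


def timeout(seconds, attempts=2):
    """Simplified async-only variant: retry on trio.TooSlowError up to `attempts` times."""
    def decorator(func):
        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            for a in range(attempts):
                try:
                    with trio.fail_after(seconds):
                        return await func(*args, **kwargs)
                except trio.TooSlowError:
                    if a < attempts - 1:
                        continue  # this attempt timed out, try again
                    raise
        return async_wrapper
    return decorator


@timeout(1, attempts=2)
async def slow_probe():  # hypothetical workload that always exceeds the budget
    await trio.sleep(5)


async def main():
    try:
        await slow_probe()
    except trio.TooSlowError:
        print("probe timed out on both attempts")


trio.run(main)
```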
@@ -365,10 +365,10 @@ class ParserConfig(Base):
     auto_questions: int = Field(default=0, ge=0, le=10)
     chunk_token_num: int = Field(default=512, ge=1, le=2048)
     delimiter: str = Field(default=r"\n", min_length=1)
-    graphrag: GraphragConfig | None = None
+    graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))
     html4excel: bool = False
     layout_recognize: str = "DeepDOC"
-    raptor: RaptorConfig | None = None
+    raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
     tag_kb_ids: list[str] = Field(default_factory=list)
     topn_tags: int = Field(default=1, ge=1, le=10)
     filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)
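The `ParserConfig` change above replaces the nullable `raptor`/`graphrag` fields with `default_factory` defaults, so a payload that omits them still validates to concrete sub-configs. A minimal sketch of that effect, using simplified stand-ins for `RaptorConfig`/`GraphragConfig` (the repo's real models carry more fields):

```python
from pydantic import BaseModel, Field


# Simplified stand-ins for illustration only.
class RaptorConfig(BaseModel):
    use_raptor: bool = False


class GraphragConfig(BaseModel):
    use_graphrag: bool = False


class ParserConfig(BaseModel):
    chunk_token_num: int = Field(default=512, ge=1, le=2048)
    raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
    graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))


# Omitting raptor/graphrag now yields concrete defaults instead of None,
# while user-provided values still pass through validation.
cfg = ParserConfig(chunk_token_num=1024)
assert cfg.raptor.use_raptor is False
assert cfg.graphrag.use_graphrag is False

cfg = ParserConfig(chunk_token_num=1024, raptor={"use_raptor": True})
assert cfg.raptor.use_raptor is True
```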
@@ -644,6 +644,7 @@ class TestDatasetCreate:
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res

     @pytest.mark.p2
@@ -657,6 +658,7 @@
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res

     @pytest.mark.p3
@@ -670,6 +672,7 @@
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res

     @pytest.mark.p2
@@ -695,3 +698,64 @@ class TestDatasetCreate:
         res = create_dataset(HttpApiAuth, payload)
         assert res["code"] == 101, res
         assert "Extra inputs are not permitted" in res["message"], res
+
+
+@pytest.mark.usefixtures("clear_datasets")
+class TestParserConfigBugFix:
+    @pytest.mark.p1
+    def test_parser_config_missing_raptor_and_graphrag(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_missing_fields", "parser_config": {"chunk_token_num": 1024}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, "raptor field should be present"
+        assert "graphrag" in parser_config, "graphrag field should be present"
+        assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
+        assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
+        assert parser_config["chunk_token_num"] == 1024, "User-provided chunk_token_num should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_raptor(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_only_raptor", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
+        assert "graphrag" in parser_config, "graphrag field should be present"
+        assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_graphrag(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_only_graphrag", "parser_config": {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, "raptor field should be present"
+        assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
+        assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_both_fields(self, HttpApiAuth):
+        payload = {"name": "test_parser_config_both_fields", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
+        assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
+    def test_parser_config_different_chunk_methods(self, HttpApiAuth, chunk_method):
+        payload = {"name": f"test_parser_config_{chunk_method}", "chunk_method": chunk_method, "parser_config": {"chunk_token_num": 512}}
+        res = create_dataset(HttpApiAuth, payload)
+        assert res["code"] == 0, res
+
+        parser_config = res["data"]["parser_config"]
+        assert "raptor" in parser_config, f"raptor field should be present for {chunk_method}"
+        assert "graphrag" in parser_config, f"graphrag field should be present for {chunk_method}"
+        assert parser_config["raptor"]["use_raptor"] is False, f"raptor.use_raptor should default to False for {chunk_method}"
+        assert parser_config["graphrag"]["use_graphrag"] is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
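For a quick manual check of what these HTTP tests exercise, the request below creates a dataset with a partial `parser_config` and verifies the defaults in the response. The base URL, the `/api/v1/datasets` path, and the Bearer header follow RAGFlow's usual HTTP API conventions but are assumptions about your deployment; adjust as needed.

```python
import requests

BASE_URL = "http://localhost:9380"  # assumed default; adjust to your server
API_KEY = "<your-api-key>"          # placeholder

resp = requests.post(
    f"{BASE_URL}/api/v1/datasets",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={"name": "parser_config_smoke_test", "parser_config": {"chunk_token_num": 1024}},
)
body = resp.json()
parser_config = body["data"]["parser_config"]
assert parser_config["chunk_token_num"] == 1024              # user value preserved
assert parser_config["raptor"] == {"use_raptor": False}      # default filled in
assert parser_config["graphrag"] == {"use_graphrag": False}  # default filled in
```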
@@ -755,6 +755,7 @@ class TestDatasetUpdate:
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res

     @pytest.mark.p3
@@ -772,6 +773,7 @@
             "html4excel": False,
             "layout_recognize": "DeepDOC",
             "raptor": {"use_raptor": False},
+            "graphrag": {"use_graphrag": False},
         }, res

     @pytest.mark.p3
@@ -783,7 +785,7 @@
         res = list_datasets(HttpApiAuth)
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res

     @pytest.mark.p3
     def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -794,7 +796,7 @@
         res = list_datasets(HttpApiAuth)
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res

     @pytest.mark.p3
     def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
@@ -805,7 +807,7 @@
         res = list_datasets(HttpApiAuth, {"id": dataset_id})
         assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
+        assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res

     @pytest.mark.p2
     @pytest.mark.parametrize(
@@ -540,6 +540,7 @@ class TestUpdateDocumentParserConfig:
                 "html4excel": False,
                 "layout_recognize": "DeepDOC",
                 "raptor": {"use_raptor": False},
+                "graphrag": {"use_graphrag": False},
             }
         else:
             for k, v in parser_config.items():
@@ -593,6 +593,7 @@ class TestDatasetCreate:
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         parser_config_o = DataSet.ParserConfig(client, {})
@@ -610,6 +611,7 @@
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         payload = {"name": "parser_config_unset"}
@@ -626,6 +628,7 @@
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         payload = {"name": "parser_config_empty", "parser_config": None}
@@ -655,3 +658,64 @@ class TestDatasetCreate:
         with pytest.raises(Exception) as excinfo:
             client.create_dataset(**payload)
         assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
+
+
+@pytest.mark.usefixtures("clear_datasets")
+class TestParserConfigBugFix:
+    @pytest.mark.p1
+    def test_parser_config_missing_raptor_and_graphrag(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024})
+        payload = {"name": "test_parser_config_missing_fields_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), "raptor field should be present"
+        assert hasattr(config, "graphrag"), "graphrag field should be present"
+        assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
+        assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
+        assert config.chunk_token_num == 1024, "User-provided chunk_token_num should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_raptor(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}})
+        payload = {"name": "test_parser_config_only_raptor_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
+        assert hasattr(config, "graphrag"), "graphrag field should be present"
+        assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
+
+    @pytest.mark.p1
+    def test_parser_config_with_only_graphrag(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}})
+        payload = {"name": "test_parser_config_only_graphrag_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), "raptor field should be present"
+        assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
+        assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p1
+    def test_parser_config_with_both_fields(self, client):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}})
+        payload = {"name": "test_parser_config_both_fields_sdk", "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
+        assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
+
+    @pytest.mark.p2
+    @pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
+    def test_parser_config_different_chunk_methods(self, client, chunk_method):
+        parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 512})
+        payload = {"name": f"test_parser_config_{chunk_method}_sdk", "chunk_method": chunk_method, "parser_config": parser_config}
+        dataset = client.create_dataset(**payload)
+
+        config = dataset.parser_config
+        assert hasattr(config, "raptor"), f"raptor field should be present for {chunk_method}"
+        assert hasattr(config, "graphrag"), f"graphrag field should be present for {chunk_method}"
+        assert config.raptor.use_raptor is False, f"raptor.use_raptor should default to False for {chunk_method}"
+        assert config.graphrag.use_graphrag is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
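The same smoke check through the Python SDK, mirroring `test_parser_config_missing_raptor_and_graphrag` above. The `RAGFlow(api_key=..., base_url=...)` construction is an assumption about the SDK entry point; the `DataSet.ParserConfig` usage matches the tests.

```python
from ragflow_sdk import RAGFlow, DataSet

# Client construction details are assumptions; check the SDK docs for your version.
client = RAGFlow(api_key="<your-api-key>", base_url="http://localhost:9380")

dataset = client.create_dataset(
    name="parser_config_sdk_smoke_test",
    parser_config=DataSet.ParserConfig(client, {"chunk_token_num": 1024}),
)

config = dataset.parser_config
assert config.chunk_token_num == 1024          # user value preserved
assert config.raptor.use_raptor is False       # default filled in
assert config.graphrag.use_graphrag is False   # default filled in
```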
@@ -641,6 +641,7 @@ class TestDatasetUpdate:
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         dataset.update({"parser_config": {}})
@@ -660,6 +661,7 @@
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         dataset.update({"parser_config": None})
@@ -675,6 +677,7 @@
                 client,
                 {
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         dataset.update({"chunk_method": "qa", "parser_config": {}})
@@ -406,6 +406,7 @@ class TestUpdateDocumentParserConfig:
                     "html4excel": False,
                     "layout_recognize": "DeepDOC",
                     "raptor": {"use_raptor": False},
+                    "graphrag": {"use_graphrag": False},
                 },
             )
         assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)