### What problem does this PR solve?

Change the default delimiter value used when creating a dataset to r'\n'.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

tags/v0.19.0
| if not chunk_method: | if not chunk_method: | ||||
| chunk_method = "naive" | chunk_method = "naive" | ||||
| key_mapping = { | key_mapping = { | ||||
| "naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, | |||||
| "naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, | |||||
| "qa": {"raptor": {"use_raptor": False}}, | "qa": {"raptor": {"use_raptor": False}}, | ||||
| "tag": None, | "tag": None, | ||||
| "resume": None, | "resume": None, | ||||
| "laws": {"raptor": {"use_raptor": False}}, | "laws": {"raptor": {"use_raptor": False}}, | ||||
| "presentation": {"raptor": {"use_raptor": False}}, | "presentation": {"raptor": {"use_raptor": False}}, | ||||
| "one": None, | "one": None, | ||||
| "knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]}, | |||||
| "knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]}, | |||||
| "email": None, | "email": None, | ||||
| "picture": None, | "picture": None, | ||||
| } | } |
| auto_keywords: int = Field(default=0, ge=0, le=32) | auto_keywords: int = Field(default=0, ge=0, le=32) | ||||
| auto_questions: int = Field(default=0, ge=0, le=10) | auto_questions: int = Field(default=0, ge=0, le=10) | ||||
| chunk_token_num: int = Field(default=128, ge=1, le=2048) | chunk_token_num: int = Field(default=128, ge=1, le=2048) | ||||
| delimiter: str = Field(default=r"\n!?;。;!?", min_length=1) | |||||
| delimiter: str = Field(default=r"\n", min_length=1) | |||||
| graphrag: Optional[GraphragConfig] = None | graphrag: Optional[GraphragConfig] = None | ||||
| html4excel: bool = False | html4excel: bool = False | ||||
| layout_recognize: str = "DeepDOC" | layout_recognize: str = "DeepDOC" |
| if parser_config is None: | if parser_config is None: | ||||
| assert res["data"]["parser_config"] == { | assert res["data"]["parser_config"] == { | ||||
| "chunk_token_num": 128, | "chunk_token_num": 128, | ||||
| "delimiter": r"\n!?;。;!?", | |||||
| "delimiter": r"\n", | |||||
| "html4excel": False, | "html4excel": False, | ||||
| "layout_recognize": "DeepDOC", | "layout_recognize": "DeepDOC", | ||||
| "raptor": {"use_raptor": False}, | "raptor": {"use_raptor": False}, | ||||
| "auto_keywords": 0, | "auto_keywords": 0, | ||||
| "auto_questions": 0, | "auto_questions": 0, | ||||
| "chunk_token_num": 128, | "chunk_token_num": 128, | ||||
| "delimiter": r"\n!?;。;!?", | |||||
| "delimiter": r"\n", | |||||
| "filename_embd_weight": None, | "filename_embd_weight": None, | ||||
| "graphrag": None, | "graphrag": None, | ||||
| "html4excel": False, | "html4excel": False, |
| "chunk_token_num": 128, | "chunk_token_num": 128, | ||||
| "layout_recognize": "DeepDOC", | "layout_recognize": "DeepDOC", | ||||
| "html4excel": False, | "html4excel": False, | ||||
| "delimiter": "\\n!?;。;!?", | |||||
| "delimiter": r"\n", | |||||
| "task_page_size": 12, | "task_page_size": 12, | ||||
| "raptor": {"use_raptor": False}, | "raptor": {"use_raptor": False}, | ||||
| }, | }, | ||||
| else: | else: | ||||
| assert res["data"]["docs"][0]["parser_config"] == { | assert res["data"]["docs"][0]["parser_config"] == { | ||||
| "chunk_token_num": 128, | "chunk_token_num": 128, | ||||
| "delimiter": "\\n!?;。;!?", | |||||
| "delimiter": r"\n", | |||||
| "html4excel": False, | "html4excel": False, | ||||
| "layout_recognize": "DeepDOC", | "layout_recognize": "DeepDOC", | ||||
| "raptor": {"use_raptor": False}, | "raptor": {"use_raptor": False}, |