| @@ -18,6 +18,7 @@ from controllers.service_api.app.error import ( | |||
| from controllers.service_api.dataset.error import ( | |||
| ArchivedDocumentImmutableError, | |||
| DocumentIndexingError, | |||
| InvalidMetadataError, | |||
| ) | |||
| from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check | |||
| from core.errors.error import ProviderTokenNotInitError | |||
| @@ -50,6 +51,9 @@ class DocumentAddByTextApi(DatasetApiResource): | |||
| "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json" | |||
| ) | |||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | |||
| parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") | |||
| parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") | |||
| args = parser.parse_args() | |||
| dataset_id = str(dataset_id) | |||
| tenant_id = str(tenant_id) | |||
| @@ -61,6 +65,28 @@ class DocumentAddByTextApi(DatasetApiResource): | |||
| if not dataset.indexing_technique and not args["indexing_technique"]: | |||
| raise ValueError("indexing_technique is required.") | |||
| # Validate metadata if provided | |||
| if args.get("doc_type") or args.get("doc_metadata"): | |||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||
| raise InvalidMetadataError( | |||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||
| ) | |||
| if not isinstance(args["doc_metadata"], dict): | |||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||
| # Validate metadata schema based on doc_type | |||
| if args["doc_type"] != "others": | |||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||
| for key, value in args["doc_metadata"].items(): | |||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||
| # set to MetaDataConfig | |||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||
| text = args.get("text") | |||
| name = args.get("name") | |||
| if text is None or name is None: | |||
| @@ -107,6 +133,8 @@ class DocumentUpdateByTextApi(DatasetApiResource): | |||
| "doc_language", type=str, default="English", required=False, nullable=False, location="json" | |||
| ) | |||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | |||
| parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") | |||
| parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") | |||
| args = parser.parse_args() | |||
| dataset_id = str(dataset_id) | |||
| tenant_id = str(tenant_id) | |||
| @@ -115,6 +143,29 @@ class DocumentUpdateByTextApi(DatasetApiResource): | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| # Validate metadata if provided | |||
| if args.get("doc_type") or args.get("doc_metadata"): | |||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||
| raise InvalidMetadataError( | |||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||
| ) | |||
| if not isinstance(args["doc_metadata"], dict): | |||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||
| # Validate metadata schema based on doc_type | |||
| if args["doc_type"] != "others": | |||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||
| for key, value in args["doc_metadata"].items(): | |||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||
| # set to MetaDataConfig | |||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||
| if args["text"]: | |||
| text = args.get("text") | |||
| name = args.get("name") | |||
| @@ -161,6 +212,30 @@ class DocumentAddByFileApi(DatasetApiResource): | |||
| args["doc_form"] = "text_model" | |||
| if "doc_language" not in args: | |||
| args["doc_language"] = "English" | |||
| # Validate metadata if provided | |||
| if args.get("doc_type") or args.get("doc_metadata"): | |||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||
| raise InvalidMetadataError( | |||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||
| ) | |||
| if not isinstance(args["doc_metadata"], dict): | |||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||
| # Validate metadata schema based on doc_type | |||
| if args["doc_type"] != "others": | |||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||
| for key, value in args["doc_metadata"].items(): | |||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||
| # set to MetaDataConfig | |||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||
| # get dataset info | |||
| dataset_id = str(dataset_id) | |||
| tenant_id = str(tenant_id) | |||
| @@ -228,6 +303,29 @@ class DocumentUpdateByFileApi(DatasetApiResource): | |||
| if "doc_language" not in args: | |||
| args["doc_language"] = "English" | |||
| # Validate metadata if provided | |||
| if args.get("doc_type") or args.get("doc_metadata"): | |||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||
| raise InvalidMetadataError( | |||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||
| ) | |||
| if not isinstance(args["doc_metadata"], dict): | |||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||
| # Validate metadata schema based on doc_type | |||
| if args["doc_type"] != "others": | |||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||
| for key, value in args["doc_metadata"].items(): | |||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||
| # set to MetaDataConfig | |||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||
| # get dataset info | |||
| dataset_id = str(dataset_id) | |||
| tenant_id = str(tenant_id) | |||
| @@ -42,6 +42,7 @@ from models.source import DataSourceOauthBinding | |||
| from services.entities.knowledge_entities.knowledge_entities import ( | |||
| ChildChunkUpdateArgs, | |||
| KnowledgeConfig, | |||
| MetaDataConfig, | |||
| RerankingModel, | |||
| RetrievalModel, | |||
| SegmentUpdateArgs, | |||
| @@ -894,6 +895,9 @@ class DocumentService: | |||
| document.data_source_info = json.dumps(data_source_info) | |||
| document.batch = batch | |||
| document.indexing_status = "waiting" | |||
| if knowledge_config.metadata: | |||
| document.doc_type = knowledge_config.metadata.doc_type | |||
| document.metadata = knowledge_config.metadata.doc_metadata | |||
| db.session.add(document) | |||
| documents.append(document) | |||
| duplicate_document_ids.append(document.id) | |||
| @@ -910,6 +914,7 @@ class DocumentService: | |||
| account, | |||
| file_name, | |||
| batch, | |||
| knowledge_config.metadata, | |||
| ) | |||
| db.session.add(document) | |||
| db.session.flush() | |||
| @@ -965,6 +970,7 @@ class DocumentService: | |||
| account, | |||
| page.page_name, | |||
| batch, | |||
| knowledge_config.metadata, | |||
| ) | |||
| db.session.add(document) | |||
| db.session.flush() | |||
| @@ -1005,6 +1011,7 @@ class DocumentService: | |||
| account, | |||
| document_name, | |||
| batch, | |||
| knowledge_config.metadata, | |||
| ) | |||
| db.session.add(document) | |||
| db.session.flush() | |||
| @@ -1042,6 +1049,7 @@ class DocumentService: | |||
| account: Account, | |||
| name: str, | |||
| batch: str, | |||
| metadata: Optional[MetaDataConfig] = None, | |||
| ): | |||
| document = Document( | |||
| tenant_id=dataset.tenant_id, | |||
| @@ -1057,6 +1065,9 @@ class DocumentService: | |||
| doc_form=document_form, | |||
| doc_language=document_language, | |||
| ) | |||
| if metadata is not None: | |||
| document.doc_metadata = metadata.doc_metadata | |||
| document.doc_type = metadata.doc_type | |||
| return document | |||
| @staticmethod | |||
| @@ -1169,6 +1180,10 @@ class DocumentService: | |||
| # update document name | |||
| if document_data.name: | |||
| document.name = document_data.name | |||
| # update doc_type and doc_metadata if provided | |||
| if document_data.metadata is not None: | |||
| document.doc_metadata = document_data.metadata.doc_type | |||
| document.doc_type = document_data.metadata.doc_type | |||
| # update document to be waiting | |||
| document.indexing_status = "waiting" | |||
| document.completed_at = None | |||
| @@ -93,6 +93,11 @@ class RetrievalModel(BaseModel): | |||
| score_threshold: Optional[float] = None | |||
| # Pairs a document type with its metadata dictionary; used as the optional | |||
| # `metadata` field of KnowledgeConfig. | |||
| class MetaDataConfig(BaseModel): | |||
| # Document type identifier. | |||
| doc_type: str | |||
| # Metadata key/value pairs supplied for the document. | |||
| doc_metadata: dict | |||
| class KnowledgeConfig(BaseModel): | |||
| original_document_id: Optional[str] = None | |||
| duplicate: bool = True | |||
| @@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel): | |||
| embedding_model: Optional[str] = None | |||
| embedding_model_provider: Optional[str] = None | |||
| name: Optional[str] = None | |||
| metadata: Optional[MetaDataConfig] = None | |||
| class SegmentUpdateArgs(BaseModel): | |||
| @@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='text' type='string' key='text'> | |||
| Document content | |||
| </Property> | |||
| <Property name='doc_type' type='string' key='doc_type'> | |||
| Type of document (optional; must be provided together with <code>doc_metadata</code>): | |||
| - <code>book</code> Book | |||
| - <code>web_page</code> Web page | |||
| - <code>paper</code> Academic paper/article | |||
| - <code>social_media_post</code> Social media post | |||
| - <code>wikipedia_entry</code> Wikipedia entry | |||
| - <code>personal_document</code> Personal document | |||
| - <code>business_document</code> Business document | |||
| - <code>im_chat_log</code> Chat log | |||
| - <code>synced_from_notion</code> Notion document | |||
| - <code>synced_from_github</code> GitHub document | |||
| - <code>others</code> Other document types | |||
| </Property> | |||
| <Property name='doc_metadata' type='object' key='doc_metadata'> | |||
| Document metadata (required if doc_type is provided). Fields vary by doc_type: | |||
| For <code>book</code>: | |||
| - <code>title</code> Book title | |||
| - <code>language</code> Book language | |||
| - <code>author</code> Book author | |||
| - <code>publisher</code> Publisher name | |||
| - <code>publication_date</code> Publication date | |||
| - <code>isbn</code> ISBN number | |||
| - <code>category</code> Book category | |||
| For <code>web_page</code>: | |||
| - <code>title</code> Page title | |||
| - <code>url</code> Page URL | |||
| - <code>language</code> Page language | |||
| - <code>publish_date</code> Publish date | |||
| - <code>author/publisher</code> Author or publisher | |||
| - <code>topic/keywords</code> Topic or keywords | |||
| - <code>description</code> Page description | |||
| Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. | |||
| For doc_type "others", any valid JSON object is accepted | |||
| </Property> | |||
| <Property name='indexing_technique' type='string' key='indexing_technique'> | |||
| Index mode | |||
| - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index | |||
| @@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| - <code>hierarchical_model</code> Parent-child mode | |||
| - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions | |||
| - <code>doc_type</code> Type of document (optional) | |||
| - <code>book</code> Book | |||
| Document records a book or publication | |||
| - <code>web_page</code> Web page | |||
| Document records web page content | |||
| - <code>paper</code> Academic paper/article | |||
| Document records academic paper or research article | |||
| - <code>social_media_post</code> Social media post | |||
| Content from social media posts | |||
| - <code>wikipedia_entry</code> Wikipedia entry | |||
| Content from Wikipedia entries | |||
| - <code>personal_document</code> Personal document | |||
| Documents related to personal content | |||
| - <code>business_document</code> Business document | |||
| Documents related to business content | |||
| - <code>im_chat_log</code> Chat log | |||
| Records of instant messaging chats | |||
| - <code>synced_from_notion</code> Notion document | |||
| Documents synchronized from Notion | |||
| - <code>synced_from_github</code> GitHub document | |||
| Documents synchronized from GitHub | |||
| - <code>others</code> Other document types | |||
| Other document types not listed above | |||
| - <code>doc_metadata</code> Document metadata (required if doc_type is provided) | |||
| Fields vary by doc_type: | |||
| For <code>book</code>: | |||
| - <code>title</code> Book title | |||
| Title of the book | |||
| - <code>language</code> Book language | |||
| Language of the book | |||
| - <code>author</code> Book author | |||
| Author of the book | |||
| - <code>publisher</code> Publisher name | |||
| Name of the publishing house | |||
| - <code>publication_date</code> Publication date | |||
| Date when the book was published | |||
| - <code>isbn</code> ISBN number | |||
| International Standard Book Number | |||
| - <code>category</code> Book category | |||
| Category or genre of the book | |||
| For <code>web_page</code>: | |||
| - <code>title</code> Page title | |||
| Title of the web page | |||
| - <code>url</code> Page URL | |||
| URL address of the web page | |||
| - <code>language</code> Page language | |||
| Language of the web page | |||
| - <code>publish_date</code> Publish date | |||
| Date when the web page was published | |||
| - <code>author/publisher</code> Author or publisher | |||
| Author or publisher of the web page | |||
| - <code>topic/keywords</code> Topic or keywords | |||
| Topics or keywords of the web page | |||
| - <code>description</code> Page description | |||
| Description of the web page content | |||
| Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. | |||
| For doc_type "others", any valid JSON object is accepted | |||
| - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code> | |||
| - <code>process_rule</code> Processing rules | |||
| @@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='description' type='string' key='description'> | |||
| Knowledge description (optional) | |||
| </Property> | |||
| <Property name='doc_type' type='string' key='doc_type'> | |||
| Type of document (optional; must be provided together with <code>doc_metadata</code>): | |||
| - <code>book</code> Book | |||
| - <code>web_page</code> Web page | |||
| - <code>paper</code> Academic paper/article | |||
| - <code>social_media_post</code> Social media post | |||
| - <code>wikipedia_entry</code> Wikipedia entry | |||
| - <code>personal_document</code> Personal document | |||
| - <code>business_document</code> Business document | |||
| - <code>im_chat_log</code> Chat log | |||
| - <code>synced_from_notion</code> Notion document | |||
| - <code>synced_from_github</code> GitHub document | |||
| - <code>others</code> Other document types | |||
| </Property> | |||
| <Property name='doc_metadata' type='object' key='doc_metadata'> | |||
| Document metadata (required if doc_type is provided). Fields vary by doc_type: | |||
| For <code>book</code>: | |||
| - <code>title</code> Book title | |||
| - <code>language</code> Book language | |||
| - <code>author</code> Book author | |||
| - <code>publisher</code> Publisher name | |||
| - <code>publication_date</code> Publication date | |||
| - <code>isbn</code> ISBN number | |||
| - <code>category</code> Book category | |||
| For <code>web_page</code>: | |||
| - <code>title</code> Page title | |||
| - <code>url</code> Page URL | |||
| - <code>language</code> Page language | |||
| - <code>publish_date</code> Publish date | |||
| - <code>author/publisher</code> Author or publisher | |||
| - <code>topic/keywords</code> Topic or keywords | |||
| - <code>description</code> Page description | |||
| Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. | |||
| For doc_type "others", any valid JSON object is accepted | |||
| </Property> | |||
| <Property name='indexing_technique' type='string' key='indexing_technique'> | |||
| Index technique (optional) | |||
| - <code>high_quality</code> High quality | |||
| @@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code> | |||
| - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk | |||
| - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional) | |||
| - <code>doc_type</code> Type of document (optional) | |||
| - <code>book</code> Book | |||
| Document records a book or publication | |||
| - <code>web_page</code> Web page | |||
| Document records web page content | |||
| - <code>paper</code> Academic paper/article | |||
| Document records academic paper or research article | |||
| - <code>social_media_post</code> Social media post | |||
| Content from social media posts | |||
| - <code>wikipedia_entry</code> Wikipedia entry | |||
| Content from Wikipedia entries | |||
| - <code>personal_document</code> Personal document | |||
| Documents related to personal content | |||
| - <code>business_document</code> Business document | |||
| Documents related to business content | |||
| - <code>im_chat_log</code> Chat log | |||
| Records of instant messaging chats | |||
| - <code>synced_from_notion</code> Notion document | |||
| Documents synchronized from Notion | |||
| - <code>synced_from_github</code> GitHub document | |||
| Documents synchronized from GitHub | |||
| - <code>others</code> Other document types | |||
| Other document types not listed above | |||
| - <code>doc_metadata</code> Document metadata (required if doc_type is provided) | |||
| Fields vary by doc_type: | |||
| For <code>book</code>: | |||
| - <code>title</code> Book title | |||
| Title of the book | |||
| - <code>language</code> Book language | |||
| Language of the book | |||
| - <code>author</code> Book author | |||
| Author of the book | |||
| - <code>publisher</code> Publisher name | |||
| Name of the publishing house | |||
| - <code>publication_date</code> Publication date | |||
| Date when the book was published | |||
| - <code>isbn</code> ISBN number | |||
| International Standard Book Number | |||
| - <code>category</code> Book category | |||
| Category or genre of the book | |||
| For <code>web_page</code>: | |||
| - <code>title</code> Page title | |||
| Title of the web page | |||
| - <code>url</code> Page URL | |||
| URL address of the web page | |||
| - <code>language</code> Page language | |||
| Language of the web page | |||
| - <code>publish_date</code> Publish date | |||
| Date when the web page was published | |||
| - <code>author/publisher</code> Author or publisher | |||
| Author or publisher of the web page | |||
| - <code>topic/keywords</code> Topic or keywords | |||
| Topics or keywords of the web page | |||
| - <code>description</code> Page description | |||
| Description of the web page content | |||
| Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. | |||
| For doc_type "others", any valid JSON object is accepted | |||
| </Property> | |||
| </Properties> | |||
| </Col> | |||
| @@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='text' type='string' key='text'> | |||
| 文档内容 | |||
| </Property> | |||
| <Property name='doc_type' type='string' key='doc_type'> | |||
| 文档类型(选填) | |||
| - <code>book</code> 图书 Book | |||
| - <code>web_page</code> 网页 Web page | |||
| - <code>paper</code> 学术论文/文章 Academic paper/article | |||
| - <code>social_media_post</code> 社交媒体帖子 Social media post | |||
| - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry | |||
| - <code>personal_document</code> 个人文档 Personal document | |||
| - <code>business_document</code> 商业文档 Business document | |||
| - <code>im_chat_log</code> 即时通讯记录 Chat log | |||
| - <code>synced_from_notion</code> Notion同步文档 Notion document | |||
| - <code>synced_from_github</code> GitHub同步文档 GitHub document | |||
| - <code>others</code> 其他文档类型 Other document types | |||
| </Property> | |||
| <Property name='doc_metadata' type='object' key='doc_metadata'> | |||
| 文档元数据(如提供文档类型则必填)。字段因文档类型而异: | |||
| 针对图书 For <code>book</code>: | |||
| - <code>title</code> 书名 Book title | |||
| - <code>language</code> 图书语言 Book language | |||
| - <code>author</code> 作者 Book author | |||
| - <code>publisher</code> 出版社 Publisher name | |||
| - <code>publication_date</code> 出版日期 Publication date | |||
| - <code>isbn</code> ISBN号码 ISBN number | |||
| - <code>category</code> 图书分类 Book category | |||
| 针对网页 For <code>web_page</code>: | |||
| - <code>title</code> 页面标题 Page title | |||
| - <code>url</code> 页面网址 Page URL | |||
| - <code>language</code> 页面语言 Page language | |||
| - <code>publish_date</code> 发布日期 Publish date | |||
| - <code>author/publisher</code> 作者/发布者 Author or publisher | |||
| - <code>topic/keywords</code> 主题/关键词 Topic or keywords | |||
| - <code>description</code> 页面描述 Page description | |||
| 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 | |||
| 针对"其他"类型文档,接受任何有效的JSON对象 | |||
| </Property> | |||
| <Property name='indexing_technique' type='string' key='indexing_technique'> | |||
| 索引方式 | |||
| - <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 | |||
| @@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式 | |||
| - <code>hierarchical_model</code> parent-child 模式 | |||
| - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding | |||
| - <code>doc_type</code> 文档类型(选填)Type of document (optional) | |||
| - <code>book</code> 图书 | |||
| 文档记录一本书籍或出版物 | |||
| - <code>web_page</code> 网页 | |||
| 网页内容的文档记录 | |||
| - <code>paper</code> 学术论文/文章 | |||
| 学术论文或研究文章的记录 | |||
| - <code>social_media_post</code> 社交媒体帖子 | |||
| 社交媒体上的帖子内容 | |||
| - <code>wikipedia_entry</code> 维基百科条目 | |||
| 维基百科的词条内容 | |||
| - <code>personal_document</code> 个人文档 | |||
| 个人相关的文档记录 | |||
| - <code>business_document</code> 商业文档 | |||
| 商业相关的文档记录 | |||
| - <code>im_chat_log</code> 即时通讯记录 | |||
| 即时通讯的聊天记录 | |||
| - <code>synced_from_notion</code> Notion同步文档 | |||
| 从Notion同步的文档内容 | |||
| - <code>synced_from_github</code> GitHub同步文档 | |||
| 从GitHub同步的文档内容 | |||
| - <code>others</code> 其他文档类型 | |||
| 其他未列出的文档类型 | |||
| - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填) | |||
| 字段因文档类型而异 | |||
| 针对图书类型 For <code>book</code>: | |||
| - <code>title</code> 书名 | |||
| 书籍的标题 | |||
| - <code>language</code> 图书语言 | |||
| 书籍的语言 | |||
| - <code>author</code> 作者 | |||
| 书籍的作者 | |||
| - <code>publisher</code> 出版社 | |||
| 出版社的名称 | |||
| - <code>publication_date</code> 出版日期 | |||
| 书籍的出版日期 | |||
| - <code>isbn</code> ISBN号码 | |||
| 书籍的ISBN编号 | |||
| - <code>category</code> 图书分类 | |||
| 书籍的分类类别 | |||
| 针对网页类型 For <code>web_page</code>: | |||
| - <code>title</code> 页面标题 | |||
| 网页的标题 | |||
| - <code>url</code> 页面网址 | |||
| 网页的URL地址 | |||
| - <code>language</code> 页面语言 | |||
| 网页的语言 | |||
| - <code>publish_date</code> 发布日期 | |||
| 网页的发布日期 | |||
| - <code>author/publisher</code> 作者/发布者 | |||
| 网页的作者或发布者 | |||
| - <code>topic/keywords</code> 主题/关键词 | |||
| 网页的主题或关键词 | |||
| - <code>description</code> 页面描述 | |||
| 网页的描述信息 | |||
| 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 | |||
| 针对"其他"类型文档,接受任何有效的JSON对象 | |||
| - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code> | |||
| @@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='text' type='string' key='text'> | |||
| 文档内容(选填) | |||
| </Property> | |||
| <Property name='doc_type' type='string' key='doc_type'> | |||
| 文档类型(选填) | |||
| - <code>book</code> 图书 Book | |||
| - <code>web_page</code> 网页 Web page | |||
| - <code>paper</code> 学术论文/文章 Academic paper/article | |||
| - <code>social_media_post</code> 社交媒体帖子 Social media post | |||
| - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry | |||
| - <code>personal_document</code> 个人文档 Personal document | |||
| - <code>business_document</code> 商业文档 Business document | |||
| - <code>im_chat_log</code> 即时通讯记录 Chat log | |||
| - <code>synced_from_notion</code> Notion同步文档 Notion document | |||
| - <code>synced_from_github</code> GitHub同步文档 GitHub document | |||
| - <code>others</code> 其他文档类型 Other document types | |||
| </Property> | |||
| <Property name='doc_metadata' type='object' key='doc_metadata'> | |||
| 文档元数据(如提供文档类型则必填)。字段因文档类型而异: | |||
| 针对图书 For <code>book</code>: | |||
| - <code>title</code> 书名 Book title | |||
| - <code>language</code> 图书语言 Book language | |||
| - <code>author</code> 作者 Book author | |||
| - <code>publisher</code> 出版社 Publisher name | |||
| - <code>publication_date</code> 出版日期 Publication date | |||
| - <code>isbn</code> ISBN号码 ISBN number | |||
| - <code>category</code> 图书分类 Book category | |||
| 针对网页 For <code>web_page</code>: | |||
| - <code>title</code> 页面标题 Page title | |||
| - <code>url</code> 页面网址 Page URL | |||
| - <code>language</code> 页面语言 Page language | |||
| - <code>publish_date</code> 发布日期 Publish date | |||
| - <code>author/publisher</code> 作者/发布者 Author or publisher | |||
| - <code>topic/keywords</code> 主题/关键词 Topic or keywords | |||
| - <code>description</code> 页面描述 Page description | |||
| 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 | |||
| 针对"其他"类型文档,接受任何有效的JSON对象 | |||
| </Property> | |||
| <Property name='process_rule' type='object' key='process_rule'> | |||
| 处理规则(选填) | |||
| - <code>mode</code> (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 | |||
| @@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code> | |||
| - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度 | |||
| - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填) | |||
| - <code>doc_type</code> 文档类型(选填)Type of document (optional) | |||
| - <code>book</code> 图书 | |||
| 文档记录一本书籍或出版物 | |||
| - <code>web_page</code> 网页 | |||
| 网页内容的文档记录 | |||
| - <code>paper</code> 学术论文/文章 | |||
| 学术论文或研究文章的记录 | |||
| - <code>social_media_post</code> 社交媒体帖子 | |||
| 社交媒体上的帖子内容 | |||
| - <code>wikipedia_entry</code> 维基百科条目 | |||
| 维基百科的词条内容 | |||
| - <code>personal_document</code> 个人文档 | |||
| 个人相关的文档记录 | |||
| - <code>business_document</code> 商业文档 | |||
| 商业相关的文档记录 | |||
| - <code>im_chat_log</code> 即时通讯记录 | |||
| 即时通讯的聊天记录 | |||
| - <code>synced_from_notion</code> Notion同步文档 | |||
| 从Notion同步的文档内容 | |||
| - <code>synced_from_github</code> GitHub同步文档 | |||
| 从GitHub同步的文档内容 | |||
| - <code>others</code> 其他文档类型 | |||
| 其他未列出的文档类型 | |||
| - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填) | |||
| 字段因文档类型而异 | |||
| 针对图书类型 For <code>book</code>: | |||
| - <code>title</code> 书名 | |||
| 书籍的标题 | |||
| - <code>language</code> 图书语言 | |||
| 书籍的语言 | |||
| - <code>author</code> 作者 | |||
| 书籍的作者 | |||
| - <code>publisher</code> 出版社 | |||
| 出版社的名称 | |||
| - <code>publication_date</code> 出版日期 | |||
| 书籍的出版日期 | |||
| - <code>isbn</code> ISBN号码 | |||
| 书籍的ISBN编号 | |||
| - <code>category</code> 图书分类 | |||
| 书籍的分类类别 | |||
| 针对网页类型 For <code>web_page</code>: | |||
| - <code>title</code> 页面标题 | |||
| 网页的标题 | |||
| - <code>url</code> 页面网址 | |||
| 网页的URL地址 | |||
| - <code>language</code> 页面语言 | |||
| 网页的语言 | |||
| - <code>publish_date</code> 发布日期 | |||
| 网页的发布日期 | |||
| - <code>author/publisher</code> 作者/发布者 | |||
| 网页的作者或发布者 | |||
| - <code>topic/keywords</code> 主题/关键词 | |||
| 网页的主题或关键词 | |||
| - <code>description</code> 页面描述 | |||
| 网页的描述信息 | |||
| 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 | |||
| 针对"其他"类型文档,接受任何有效的JSON对象 | |||
| </Property> | |||
| </Properties> | |||
| </Col> | |||