Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
tags/1.1.1
from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair
from models import Tenant
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
from models.provider import Provider, ProviderModel

    click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))


@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
@click.command("add-qdrant-index", help="Add Qdrant index.")
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field, default is metadata.doc_id.")
def add_qdrant_doc_id_index(field: str):
    click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
    vector_type = dify_config.VECTOR_STORE
    if vector_type != "qdrant":
        click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
        return
def add_qdrant_index(field: str):
    click.echo(click.style("Starting Qdrant index creation.", fg="green"))

    create_count = 0

    try:

    click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
| @click.command("old-metadata-migration", help="Old metadata migration.") | |||||
| def old_metadata_migration(): | |||||
| """ | |||||
| Old metadata migration. | |||||
| """ | |||||
| click.echo(click.style("Starting old metadata migration.", fg="green")) | |||||
| page = 1 | |||||
| while True: | |||||
| try: | |||||
| documents = ( | |||||
                DatasetDocument.query.filter(DatasetDocument.doc_metadata.isnot(None))
                .order_by(DatasetDocument.created_at.desc())
                .paginate(page=page, per_page=50)
            )
        except NotFound:
            break
        if not documents:
            break
        for document in documents:
            if document.doc_metadata:
                doc_metadata = document.doc_metadata
                for key, value in doc_metadata.items():
                    dataset_metadata = (
                        db.session.query(DatasetMetadata)
                        .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
                        .first()
                    )
                    if not dataset_metadata:
                        dataset_metadata = DatasetMetadata(
                            tenant_id=document.tenant_id,
                            dataset_id=document.dataset_id,
                            name=key,
                            type="string",
                            created_by=document.created_by,
                        )
                        db.session.add(dataset_metadata)
                        db.session.flush()
                        dataset_metadata_binding = DatasetMetadataBinding(
                            tenant_id=document.tenant_id,
                            dataset_id=document.dataset_id,
                            metadata_id=dataset_metadata.id,
                            document_id=document.id,
                            created_by=document.created_by,
                        )
                        db.session.add(dataset_metadata_binding)
                    else:
                        dataset_metadata_binding = DatasetMetadataBinding.query.filter(
                            DatasetMetadataBinding.dataset_id == document.dataset_id,
                            DatasetMetadataBinding.document_id == document.id,
                            DatasetMetadataBinding.metadata_id == dataset_metadata.id,
                        ).first()
                        if not dataset_metadata_binding:
                            dataset_metadata_binding = DatasetMetadataBinding(
                                tenant_id=document.tenant_id,
                                dataset_id=document.dataset_id,
                                metadata_id=dataset_metadata.id,
                                document_id=document.id,
                                created_by=document.created_by,
                            )
                            db.session.add(dataset_metadata_binding)
        db.session.commit()
        page += 1
    click.echo(click.style("Old metadata migration completed.", fg="green"))
| @click.command("create-tenant", help="Create account and tenant.") | @click.command("create-tenant", help="Create account and tenant.") | ||||
| @click.option("--email", prompt=True, help="Tenant account email.") | @click.option("--email", prompt=True, help="Tenant account email.") | ||||
| @click.option("--name", prompt=True, help="Workspace name.") | @click.option("--name", prompt=True, help="Workspace name.") |
from controllers.service_api.dataset.error import (
    ArchivedDocumentImmutableError,
    DocumentIndexingError,
    InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError

    "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()

dataset_id = str(dataset_id)
if not dataset.indexing_technique and not args["indexing_technique"]:
    raise ValueError("indexing_technique is required.")

# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
    if not args.get("doc_type") or not args.get("doc_metadata"):
        raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
    if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
        raise InvalidMetadataError(
            "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
        )
    if not isinstance(args["doc_metadata"], dict):
        raise InvalidMetadataError("doc_metadata must be a dictionary")
    # Validate metadata schema based on doc_type
    if args["doc_type"] != "others":
        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
        for key, value in args["doc_metadata"].items():
            if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                raise InvalidMetadataError(f"Invalid type for metadata field {key}")
    # set to MetaDataConfig
    args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
| text = args.get("text") | text = args.get("text") | ||||
| name = args.get("name") | name = args.get("name") | ||||
| if text is None or name is None: | if text is None or name is None: | ||||
| "doc_language", type=str, default="English", required=False, nullable=False, location="json" | "doc_language", type=str, default="English", required=False, nullable=False, location="json" | ||||
| ) | ) | ||||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | ||||
| parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") | |||||
| parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") | |||||
| args = parser.parse_args() | args = parser.parse_args() | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) | ||||
| # indexing_technique is already set in dataset since this is an update | # indexing_technique is already set in dataset since this is an update | ||||
| args["indexing_technique"] = dataset.indexing_technique | args["indexing_technique"] = dataset.indexing_technique | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| if args["text"]: | if args["text"]: | ||||
| text = args.get("text") | text = args.get("text") | ||||
| name = args.get("name") | name = args.get("name") | ||||
| if "doc_language" not in args: | if "doc_language" not in args: | ||||
| args["doc_language"] = "English" | args["doc_language"] = "English" | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| # get dataset info | # get dataset info | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) | ||||
| if "doc_language" not in args: | if "doc_language" not in args: | ||||
| args["doc_language"] = "English" | args["doc_language"] = "English" | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| # get dataset info | # get dataset info | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) |
def init_app(app: DifyApp):
    from commands import (
        add_qdrant_doc_id_index,
        add_qdrant_index,
        convert_to_agent_apps,
        create_tenant,
        extract_plugins,
        fix_app_site_missing,
        install_plugins,
        migrate_data_for_plugin,
        old_metadata_migration,
        reset_email,
        reset_encrypt_key_pair,
        reset_password,

        reset_encrypt_key_pair,
        vdb_migrate,
        convert_to_agent_apps,
        add_qdrant_doc_id_index,
        add_qdrant_index,
        create_tenant,
        upgrade_db,
        fix_app_site_missing,
        extract_plugins,
        extract_unique_plugins,
        install_plugins,
        old_metadata_migration,
    ]
    for cmd in cmds_to_register:
        app.cli.add_command(cmd)
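Every command in `cmds_to_register` is attached to the application's click group, so after startup each one is available under the `flask` entry point (for example `flask add-qdrant-index` or `flask old-metadata-migration`). A minimal sketch of the same registration pattern on a plain Flask app; `DifyApp` is presumably a Flask subclass or wrapper, and the command below is invented purely for illustration:

```python
import click
from flask import Flask

app = Flask(__name__)


@click.command("say-hello", help="Example command (illustrative only).")
def say_hello():
    click.echo(click.style("hello", fg="green"))


# Same pattern as init_app above: attach the click command to the Flask CLI.
app.cli.add_command(say_hello)
# It can then be run from the shell as: flask say-hello
```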
from services.entities.knowledge_entities.knowledge_entities import (
    ChildChunkUpdateArgs,
    KnowledgeConfig,
    MetaDataConfig,
    RerankingModel,
    RetrievalModel,
    SegmentUpdateArgs,

    document.data_source_info = json.dumps(data_source_info)
    document.batch = batch
    document.indexing_status = "waiting"
    if knowledge_config.metadata:
        document.doc_type = knowledge_config.metadata.doc_type
        document.metadata = knowledge_config.metadata.doc_metadata
    db.session.add(document)
    documents.append(document)
    duplicate_document_ids.append(document.id)

        account,
        file_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

        account,
        truncated_page_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

        account,
        document_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

    account: Account,
    name: str,
    batch: str,
    metadata: Optional[MetaDataConfig] = None,
):
    document = Document(
        tenant_id=dataset.tenant_id,

        BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
        BuiltInField.source: data_source_type,
    }
    if metadata is not None:
        doc_metadata.update(metadata.doc_metadata)
        document.doc_type = metadata.doc_type
    if doc_metadata:
        document.doc_metadata = doc_metadata
    return document
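The document factory above seeds `doc_metadata` with built-in fields (`last_update_date`, `source`) and, before this change, merged the caller's `MetaDataConfig.doc_metadata` on top and copied its `doc_type` onto the document. An illustration of the resulting dict, using plain string keys and made-up user values (the real code keys the built-ins by `BuiltInField` members):

```python
import datetime

# Illustrative only: the shape of the doc_metadata the factory ends up storing.
doc_metadata = {
    "last_update_date": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
    "source": "upload_file",
}
user_metadata = {"title": "Dify Handbook", "author": "Jane Doe"}  # hypothetical MetaDataConfig.doc_metadata
doc_metadata.update(user_metadata)  # user-supplied fields are merged into the built-in ones
```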
# update document name
if document_data.name:
    document.name = document_data.name
# update doc_type and doc_metadata if provided
if document_data.metadata is not None:
    document.doc_metadata = document_data.metadata.doc_metadata
    document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None
    embedding_model: Optional[str] = None
    embedding_model_provider: Optional[str] = None
    name: Optional[str] = None
    metadata: Optional[MetaDataConfig] = None


class SegmentUpdateArgs(BaseModel):
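`MetaDataConfig`, dropped from the `dataset_service` imports above and from `KnowledgeConfig` here, is not shown in this diff. Judging from how it is used (`metadata.doc_type` and `metadata.doc_metadata`), it presumably looked roughly like the following; this is a hedged reconstruction, not the actual definition:

```python
from pydantic import BaseModel


class MetaDataConfig(BaseModel):
    # Hedged reconstruction based on usage in the hunks above; the real model may differ.
    doc_type: str
    doc_metadata: dict
```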
    <Property name='text' type='string' key='text'>
      Document content
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      Type of document (optional):
      - <code>book</code> Book
      - <code>web_page</code> Web page
      - <code>paper</code> Academic paper/article
      - <code>social_media_post</code> Social media post
      - <code>wikipedia_entry</code> Wikipedia entry
      - <code>personal_document</code> Personal document
      - <code>business_document</code> Business document
      - <code>im_chat_log</code> Chat log
      - <code>synced_from_notion</code> Notion document
      - <code>synced_from_github</code> GitHub document
      - <code>others</code> Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      Document metadata (required if doc_type is provided). Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
      - <code>language</code> Book language
      - <code>author</code> Book author
      - <code>publisher</code> Publisher name
      - <code>publication_date</code> Publication date
      - <code>isbn</code> ISBN number
      - <code>category</code> Book category
      For <code>web_page</code>:
      - <code>title</code> Page title
      - <code>url</code> Page URL
      - <code>language</code> Page language
      - <code>publish_date</code> Publish date
      - <code>author/publisher</code> Author or publisher
      - <code>topic/keywords</code> Topic or keywords
      - <code>description</code> Page description
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
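A hypothetical request body for this create-by-text call carrying the optional metadata fields described above; all values are invented and the endpoint path is omitted here:

```python
# Hypothetical payload for the create-by-text request documented above.
payload = {
    "name": "Dify Handbook",
    "text": "Full document text goes here.",
    "indexing_technique": "high_quality",
    "doc_type": "book",
    "doc_metadata": {
        "title": "Dify Handbook",
        "language": "en",
        "author": "Jane Doe",
        "publisher": "Example Press",
        "publication_date": "2024-01-01",
        "isbn": "978-0-00-000000-0",
        "category": "software",
    },
}
```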
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      Index mode
      - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
      - <code>hierarchical_model</code> Parent-child mode
      - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions

    - <code>doc_type</code> Type of document (optional)
      - <code>book</code> Book
        Document records a book or publication
      - <code>web_page</code> Web page
        Document records web page content
      - <code>paper</code> Academic paper/article
        Document records academic paper or research article
      - <code>social_media_post</code> Social media post
        Content from social media posts
      - <code>wikipedia_entry</code> Wikipedia entry
        Content from Wikipedia entries
      - <code>personal_document</code> Personal document
        Documents related to personal content
      - <code>business_document</code> Business document
        Documents related to business content
      - <code>im_chat_log</code> Chat log
        Records of instant messaging chats
      - <code>synced_from_notion</code> Notion document
        Documents synchronized from Notion
      - <code>synced_from_github</code> GitHub document
        Documents synchronized from GitHub
      - <code>others</code> Other document types
        Other document types not listed above
    - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
      Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
        Title of the book
      - <code>language</code> Book language
        Language of the book
      - <code>author</code> Book author
        Author of the book
      - <code>publisher</code> Publisher name
        Name of the publishing house
      - <code>publication_date</code> Publication date
        Date when the book was published
      - <code>isbn</code> ISBN number
        International Standard Book Number
      - <code>category</code> Book category
        Category or genre of the book
      For <code>web_page</code>:
      - <code>title</code> Page title
        Title of the web page
      - <code>url</code> Page URL
        URL address of the web page
      - <code>language</code> Page language
        Language of the web page
      - <code>publish_date</code> Publish date
        Date when the web page was published
      - <code>author/publisher</code> Author or publisher
        Author or publisher of the web page
      - <code>topic/keywords</code> Topic or keywords
        Topics or keywords of the web page
      - <code>description</code> Page description
        Description of the web page content
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
    - <code>process_rule</code> Processing rules

    <Property name='description' type='string' key='description'>
      Knowledge description (optional)
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      Type of document (optional):
      - <code>book</code> Book
      - <code>web_page</code> Web page
      - <code>paper</code> Academic paper/article
      - <code>social_media_post</code> Social media post
      - <code>wikipedia_entry</code> Wikipedia entry
      - <code>personal_document</code> Personal document
      - <code>business_document</code> Business document
      - <code>im_chat_log</code> Chat log
      - <code>synced_from_notion</code> Notion document
      - <code>synced_from_github</code> GitHub document
      - <code>others</code> Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      Document metadata (required if doc_type is provided). Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
      - <code>language</code> Book language
      - <code>author</code> Book author
      - <code>publisher</code> Publisher name
      - <code>publication_date</code> Publication date
      - <code>isbn</code> ISBN number
      - <code>category</code> Book category
      For <code>web_page</code>:
      - <code>title</code> Page title
      - <code>url</code> Page URL
      - <code>language</code> Page language
      - <code>publish_date</code> Publish date
      - <code>author/publisher</code> Author or publisher
      - <code>topic/keywords</code> Topic or keywords
      - <code>description</code> Page description
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      Index technique (optional)
      - <code>high_quality</code> High quality

    - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
    - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
    - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
    - <code>doc_type</code> Type of document (optional)
      - <code>book</code> Book
        Document records a book or publication
      - <code>web_page</code> Web page
        Document records web page content
      - <code>paper</code> Academic paper/article
        Document records academic paper or research article
      - <code>social_media_post</code> Social media post
        Content from social media posts
      - <code>wikipedia_entry</code> Wikipedia entry
        Content from Wikipedia entries
      - <code>personal_document</code> Personal document
        Documents related to personal content
      - <code>business_document</code> Business document
        Documents related to business content
      - <code>im_chat_log</code> Chat log
        Records of instant messaging chats
      - <code>synced_from_notion</code> Notion document
        Documents synchronized from Notion
      - <code>synced_from_github</code> GitHub document
        Documents synchronized from GitHub
      - <code>others</code> Other document types
        Other document types not listed above
    - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
      Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
        Title of the book
      - <code>language</code> Book language
        Language of the book
      - <code>author</code> Book author
        Author of the book
      - <code>publisher</code> Publisher name
        Name of the publishing house
      - <code>publication_date</code> Publication date
        Date when the book was published
      - <code>isbn</code> ISBN number
        International Standard Book Number
      - <code>category</code> Book category
        Category or genre of the book
      For <code>web_page</code>:
      - <code>title</code> Page title
        Title of the web page
      - <code>url</code> Page URL
        URL address of the web page
      - <code>language</code> Page language
        Language of the web page
      - <code>publish_date</code> Publish date
        Date when the web page was published
      - <code>author/publisher</code> Author or publisher
        Author or publisher of the web page
      - <code>topic/keywords</code> Topic or keywords
        Topics or keywords of the web page
      - <code>description</code> Page description
        Description of the web page content
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
  </Properties>
</Col>

        "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
        "data_source_type": "upload_file",
        "name": "readme.txt",
        "doc_type": null
      }
    },
    "score": 3.730463140527718e-05,
    <Property name='text' type='string' key='text'>
      文档内容
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      文档类型(选填)
      - <code>book</code> 图书 Book
      - <code>web_page</code> 网页 Web page
      - <code>paper</code> 学术论文/文章 Academic paper/article
      - <code>social_media_post</code> 社交媒体帖子 Social media post
      - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
      - <code>personal_document</code> 个人文档 Personal document
      - <code>business_document</code> 商业文档 Business document
      - <code>im_chat_log</code> 即时通讯记录 Chat log
      - <code>synced_from_notion</code> Notion同步文档 Notion document
      - <code>synced_from_github</code> GitHub同步文档 GitHub document
      - <code>others</code> 其他文档类型 Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
      针对图书 For <code>book</code>:
      - <code>title</code> 书名 Book title
      - <code>language</code> 图书语言 Book language
      - <code>author</code> 作者 Book author
      - <code>publisher</code> 出版社 Publisher name
      - <code>publication_date</code> 出版日期 Publication date
      - <code>isbn</code> ISBN号码 ISBN number
      - <code>category</code> 图书分类 Book category
      针对网页 For <code>web_page</code>:
      - <code>title</code> 页面标题 Page title
      - <code>url</code> 页面网址 Page URL
      - <code>language</code> 页面语言 Page language
      - <code>publish_date</code> 发布日期 Publish date
      - <code>author/publisher</code> 作者/发布者 Author or publisher
      - <code>topic/keywords</code> 主题/关键词 Topic or keywords
      - <code>description</code> 页面描述 Page description
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      索引方式
      - <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
      - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
      - <code>hierarchical_model</code> parent-child 模式
      - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding

    - <code>doc_type</code> 文档类型(选填)Type of document (optional)
      - <code>book</code> 图书
        文档记录一本书籍或出版物
      - <code>web_page</code> 网页
        网页内容的文档记录
      - <code>paper</code> 学术论文/文章
        学术论文或研究文章的记录
      - <code>social_media_post</code> 社交媒体帖子
        社交媒体上的帖子内容
      - <code>wikipedia_entry</code> 维基百科条目
        维基百科的词条内容
      - <code>personal_document</code> 个人文档
        个人相关的文档记录
      - <code>business_document</code> 商业文档
        商业相关的文档记录
      - <code>im_chat_log</code> 即时通讯记录
        即时通讯的聊天记录
      - <code>synced_from_notion</code> Notion同步文档
        从Notion同步的文档内容
      - <code>synced_from_github</code> GitHub同步文档
        从GitHub同步的文档内容
      - <code>others</code> 其他文档类型
        其他未列出的文档类型
    - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
      字段因文档类型而异
      针对图书类型 For <code>book</code>:
      - <code>title</code> 书名
        书籍的标题
      - <code>language</code> 图书语言
        书籍的语言
      - <code>author</code> 作者
        书籍的作者
      - <code>publisher</code> 出版社
        出版社的名称
      - <code>publication_date</code> 出版日期
        书籍的出版日期
      - <code>isbn</code> ISBN号码
        书籍的ISBN编号
      - <code>category</code> 图书分类
        书籍的分类类别
      针对网页类型 For <code>web_page</code>:
      - <code>title</code> 页面标题
        网页的标题
      - <code>url</code> 页面网址
        网页的URL地址
      - <code>language</code> 页面语言
        网页的语言
      - <code>publish_date</code> 发布日期
        网页的发布日期
      - <code>author/publisher</code> 作者/发布者
        网页的作者或发布者
      - <code>topic/keywords</code> 主题/关键词
        网页的主题或关键词
      - <code>description</code> 页面描述
        网页的描述信息
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>

    <Property name='text' type='string' key='text'>
      文档内容(选填)
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      文档类型(选填)
      - <code>book</code> 图书 Book
      - <code>web_page</code> 网页 Web page
      - <code>paper</code> 学术论文/文章 Academic paper/article
      - <code>social_media_post</code> 社交媒体帖子 Social media post
      - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
      - <code>personal_document</code> 个人文档 Personal document
      - <code>business_document</code> 商业文档 Business document
      - <code>im_chat_log</code> 即时通讯记录 Chat log
      - <code>synced_from_notion</code> Notion同步文档 Notion document
      - <code>synced_from_github</code> GitHub同步文档 GitHub document
      - <code>others</code> 其他文档类型 Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
      针对图书 For <code>book</code>:
      - <code>title</code> 书名 Book title
      - <code>language</code> 图书语言 Book language
      - <code>author</code> 作者 Book author
      - <code>publisher</code> 出版社 Publisher name
      - <code>publication_date</code> 出版日期 Publication date
      - <code>isbn</code> ISBN号码 ISBN number
      - <code>category</code> 图书分类 Book category
      针对网页 For <code>web_page</code>:
      - <code>title</code> 页面标题 Page title
      - <code>url</code> 页面网址 Page URL
      - <code>language</code> 页面语言 Page language
      - <code>publish_date</code> 发布日期 Publish date
      - <code>author/publisher</code> 作者/发布者 Author or publisher
      - <code>topic/keywords</code> 主题/关键词 Topic or keywords
      - <code>description</code> 页面描述 Page description
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
    <Property name='process_rule' type='object' key='process_rule'>
      处理规则(选填)
      - <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
      - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
      - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
      - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
    - <code>doc_type</code> 文档类型(选填)Type of document (optional)
      - <code>book</code> 图书
        文档记录一本书籍或出版物
      - <code>web_page</code> 网页
        网页内容的文档记录
      - <code>paper</code> 学术论文/文章
        学术论文或研究文章的记录
      - <code>social_media_post</code> 社交媒体帖子
        社交媒体上的帖子内容
      - <code>wikipedia_entry</code> 维基百科条目
        维基百科的词条内容
      - <code>personal_document</code> 个人文档
        个人相关的文档记录
      - <code>business_document</code> 商业文档
        商业相关的文档记录
      - <code>im_chat_log</code> 即时通讯记录
        即时通讯的聊天记录
      - <code>synced_from_notion</code> Notion同步文档
        从Notion同步的文档内容
      - <code>synced_from_github</code> GitHub同步文档
        从GitHub同步的文档内容
      - <code>others</code> 其他文档类型
        其他未列出的文档类型
    - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
      字段因文档类型而异
      针对图书类型 For <code>book</code>:
      - <code>title</code> 书名
        书籍的标题
      - <code>language</code> 图书语言
        书籍的语言
      - <code>author</code> 作者
        书籍的作者
      - <code>publisher</code> 出版社
        出版社的名称
      - <code>publication_date</code> 出版日期
        书籍的出版日期
      - <code>isbn</code> ISBN号码
        书籍的ISBN编号
      - <code>category</code> 图书分类
        书籍的分类类别
      针对网页类型 For <code>web_page</code>:
      - <code>title</code> 页面标题
        网页的标题
      - <code>url</code> 页面网址
        网页的URL地址
      - <code>language</code> 页面语言
        网页的语言
      - <code>publish_date</code> 发布日期
        网页的发布日期
      - <code>author/publisher</code> 作者/发布者
        网页的作者或发布者
      - <code>topic/keywords</code> 主题/关键词
        网页的主题或关键词
      - <code>description</code> 页面描述
        网页的描述信息
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
  </Properties>
</Col>

        "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
        "data_source_type": "upload_file",
        "name": "readme.txt",
        "doc_type": null
      }
    },
    "score": 3.730463140527718e-05,