Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
tags/1.1.1
from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair
from models import Tenant
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
from models.provider import Provider, ProviderModel

    click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))


@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
@click.command("add-qdrant-index", help="Add Qdrant index.")
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field, default is metadata.doc_id.")
def add_qdrant_doc_id_index(field: str):
    click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
    vector_type = dify_config.VECTOR_STORE
    if vector_type != "qdrant":
        click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
        return
def add_qdrant_index(field: str):
    click.echo(click.style("Starting Qdrant index creation.", fg="green"))

    create_count = 0

    try:

    click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
| @click.command("old-metadata-migration", help="Old metadata migration.") | |||||
| def old_metadata_migration(): | |||||
| """ | |||||
| Old metadata migration. | |||||
| """ | |||||
| click.echo(click.style("Starting old metadata migration.", fg="green")) | |||||
| page = 1 | |||||
| while True: | |||||
| try: | |||||
| documents = ( | |||||
                DatasetDocument.query.filter(DatasetDocument.doc_metadata.isnot(None))
                .order_by(DatasetDocument.created_at.desc())
                .paginate(page=page, per_page=50)
            )
        except NotFound:
            break
        if not documents:
            break
        for document in documents:
            if document.doc_metadata:
                doc_metadata = document.doc_metadata
                for key, value in doc_metadata.items():
                    dataset_metadata = (
                        db.session.query(DatasetMetadata)
                        .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
                        .first()
                    )
                    if not dataset_metadata:
                        dataset_metadata = DatasetMetadata(
                            tenant_id=document.tenant_id,
                            dataset_id=document.dataset_id,
                            name=key,
                            type="string",
                            created_by=document.created_by,
                        )
                        db.session.add(dataset_metadata)
                        db.session.flush()
                        dataset_metadata_binding = DatasetMetadataBinding(
                            tenant_id=document.tenant_id,
                            dataset_id=document.dataset_id,
                            metadata_id=dataset_metadata.id,
                            document_id=document.id,
                            created_by=document.created_by,
                        )
                        db.session.add(dataset_metadata_binding)
                    else:
                        dataset_metadata_binding = DatasetMetadataBinding.query.filter(
                            DatasetMetadataBinding.dataset_id == document.dataset_id,
                            DatasetMetadataBinding.document_id == document.id,
                            DatasetMetadataBinding.metadata_id == dataset_metadata.id,
                        ).first()
                        if not dataset_metadata_binding:
                            dataset_metadata_binding = DatasetMetadataBinding(
                                tenant_id=document.tenant_id,
                                dataset_id=document.dataset_id,
                                metadata_id=dataset_metadata.id,
                                document_id=document.id,
                                created_by=document.created_by,
                            )
                            db.session.add(dataset_metadata_binding)
        db.session.commit()
        page += 1
    click.echo(click.style("Old metadata migration completed.", fg="green"))
| @click.command("create-tenant", help="Create account and tenant.") | @click.command("create-tenant", help="Create account and tenant.") | ||||
| @click.option("--email", prompt=True, help="Tenant account email.") | @click.option("--email", prompt=True, help="Tenant account email.") | ||||
| @click.option("--name", prompt=True, help="Workspace name.") | @click.option("--name", prompt=True, help="Workspace name.") |
from controllers.service_api.dataset.error import (
    ArchivedDocumentImmutableError,
    DocumentIndexingError,
    InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError

    "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()

dataset_id = str(dataset_id)
if not dataset.indexing_technique and not args["indexing_technique"]:
    raise ValueError("indexing_technique is required.")

# Validate metadata if provided
if args.get("doc_type") or args.get("doc_metadata"):
    if not args.get("doc_type") or not args.get("doc_metadata"):
        raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
    if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
        raise InvalidMetadataError(
            "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
        )
    if not isinstance(args["doc_metadata"], dict):
        raise InvalidMetadataError("doc_metadata must be a dictionary")
    # Validate metadata schema based on doc_type
    if args["doc_type"] != "others":
        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
        for key, value in args["doc_metadata"].items():
            if key in metadata_schema and not isinstance(value, metadata_schema[key]):
                raise InvalidMetadataError(f"Invalid type for metadata field {key}")
    # set to MetaDataConfig
    args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
| text = args.get("text") | text = args.get("text") | ||||
| name = args.get("name") | name = args.get("name") | ||||
| if text is None or name is None: | if text is None or name is None: | ||||
| "doc_language", type=str, default="English", required=False, nullable=False, location="json" | "doc_language", type=str, default="English", required=False, nullable=False, location="json" | ||||
| ) | ) | ||||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | ||||
| parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") | |||||
| parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") | |||||
| args = parser.parse_args() | args = parser.parse_args() | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) | ||||
| # indexing_technique is already set in dataset since this is an update | # indexing_technique is already set in dataset since this is an update | ||||
| args["indexing_technique"] = dataset.indexing_technique | args["indexing_technique"] = dataset.indexing_technique | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| if args["text"]: | if args["text"]: | ||||
| text = args.get("text") | text = args.get("text") | ||||
| name = args.get("name") | name = args.get("name") | ||||
| if "doc_language" not in args: | if "doc_language" not in args: | ||||
| args["doc_language"] = "English" | args["doc_language"] = "English" | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| # get dataset info | # get dataset info | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) | ||||
| if "doc_language" not in args: | if "doc_language" not in args: | ||||
| args["doc_language"] = "English" | args["doc_language"] = "English" | ||||
| # Validate metadata if provided | |||||
| if args.get("doc_type") or args.get("doc_metadata"): | |||||
| if not args.get("doc_type") or not args.get("doc_metadata"): | |||||
| raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") | |||||
| if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: | |||||
| raise InvalidMetadataError( | |||||
| "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) | |||||
| ) | |||||
| if not isinstance(args["doc_metadata"], dict): | |||||
| raise InvalidMetadataError("doc_metadata must be a dictionary") | |||||
| # Validate metadata schema based on doc_type | |||||
| if args["doc_type"] != "others": | |||||
| metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] | |||||
| for key, value in args["doc_metadata"].items(): | |||||
| if key in metadata_schema and not isinstance(value, metadata_schema[key]): | |||||
| raise InvalidMetadataError(f"Invalid type for metadata field {key}") | |||||
| # set to MetaDataConfig | |||||
| args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} | |||||
| # get dataset info | # get dataset info | ||||
| dataset_id = str(dataset_id) | dataset_id = str(dataset_id) | ||||
| tenant_id = str(tenant_id) | tenant_id = str(tenant_id) |
def init_app(app: DifyApp):
    from commands import (
        add_qdrant_doc_id_index,
        add_qdrant_index,
        convert_to_agent_apps,
        create_tenant,
        extract_plugins,
        fix_app_site_missing,
        install_plugins,
        migrate_data_for_plugin,
        old_metadata_migration,
        reset_email,
        reset_encrypt_key_pair,
        reset_password,

        reset_encrypt_key_pair,
        vdb_migrate,
        convert_to_agent_apps,
        add_qdrant_doc_id_index,
        add_qdrant_index,
        create_tenant,
        upgrade_db,
        fix_app_site_missing,
        extract_plugins,
        extract_unique_plugins,
        install_plugins,
        old_metadata_migration,
    ]
    for cmd in cmds_to_register:
        app.cli.add_command(cmd)
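Every command in `cmds_to_register` is attached to the application's click group, so after startup each one is available under the `flask` entry point (for example `flask add-qdrant-index` or `flask old-metadata-migration`). A minimal sketch of the same registration pattern on a plain Flask app; `DifyApp` is presumably a Flask subclass or wrapper, and the command below is invented purely for illustration:

```python
import click
from flask import Flask

app = Flask(__name__)


@click.command("say-hello", help="Example command (illustrative only).")
def say_hello():
    click.echo(click.style("hello", fg="green"))


# Same pattern as init_app above: attach the click command to the Flask CLI.
app.cli.add_command(say_hello)
# It can then be run from the shell as: flask say-hello
```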
from services.entities.knowledge_entities.knowledge_entities import (
    ChildChunkUpdateArgs,
    KnowledgeConfig,
    MetaDataConfig,
    RerankingModel,
    RetrievalModel,
    SegmentUpdateArgs,

    document.data_source_info = json.dumps(data_source_info)
    document.batch = batch
    document.indexing_status = "waiting"
    if knowledge_config.metadata:
        document.doc_type = knowledge_config.metadata.doc_type
        document.metadata = knowledge_config.metadata.doc_metadata
    db.session.add(document)
    documents.append(document)
    duplicate_document_ids.append(document.id)

        account,
        file_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

        account,
        truncated_page_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

        account,
        document_name,
        batch,
        knowledge_config.metadata,
    )
    db.session.add(document)
    db.session.flush()

    account: Account,
    name: str,
    batch: str,
    metadata: Optional[MetaDataConfig] = None,
):
    document = Document(
        tenant_id=dataset.tenant_id,

        BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
        BuiltInField.source: data_source_type,
    }
    if metadata is not None:
        doc_metadata.update(metadata.doc_metadata)
        document.doc_type = metadata.doc_type
    if doc_metadata:
        document.doc_metadata = doc_metadata
    return document
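The document factory above seeds `doc_metadata` with built-in fields (`last_update_date`, `source`) and, before this change, merged the caller's `MetaDataConfig.doc_metadata` on top and copied its `doc_type` onto the document. An illustration of the resulting dict, using plain string keys and made-up user values (the real code keys the built-ins by `BuiltInField` members):

```python
import datetime

# Illustrative only: the shape of the doc_metadata the factory ends up storing.
doc_metadata = {
    "last_update_date": datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
    "source": "upload_file",
}
user_metadata = {"title": "Dify Handbook", "author": "Jane Doe"}  # hypothetical MetaDataConfig.doc_metadata
doc_metadata.update(user_metadata)  # user-supplied fields are merged into the built-in ones
```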
# update document name
if document_data.name:
    document.name = document_data.name
# update doc_type and doc_metadata if provided
if document_data.metadata is not None:
    document.doc_metadata = document_data.metadata.doc_metadata
    document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None
    embedding_model: Optional[str] = None
    embedding_model_provider: Optional[str] = None
    name: Optional[str] = None
    metadata: Optional[MetaDataConfig] = None


class SegmentUpdateArgs(BaseModel):
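`MetaDataConfig`, dropped from the `dataset_service` imports above and from `KnowledgeConfig` here, is not shown in this diff. Judging from how it is used (`metadata.doc_type` and `metadata.doc_metadata`), it presumably looked roughly like the following; this is a hedged reconstruction, not the actual definition:

```python
from pydantic import BaseModel


class MetaDataConfig(BaseModel):
    # Hedged reconstruction based on usage in the hunks above; the real model may differ.
    doc_type: str
    doc_metadata: dict
```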
    <Property name='text' type='string' key='text'>
      Document content
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      Type of document (optional):
      - <code>book</code> Book
      - <code>web_page</code> Web page
      - <code>paper</code> Academic paper/article
      - <code>social_media_post</code> Social media post
      - <code>wikipedia_entry</code> Wikipedia entry
      - <code>personal_document</code> Personal document
      - <code>business_document</code> Business document
      - <code>im_chat_log</code> Chat log
      - <code>synced_from_notion</code> Notion document
      - <code>synced_from_github</code> GitHub document
      - <code>others</code> Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      Document metadata (required if doc_type is provided). Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
      - <code>language</code> Book language
      - <code>author</code> Book author
      - <code>publisher</code> Publisher name
      - <code>publication_date</code> Publication date
      - <code>isbn</code> ISBN number
      - <code>category</code> Book category
      For <code>web_page</code>:
      - <code>title</code> Page title
      - <code>url</code> Page URL
      - <code>language</code> Page language
      - <code>publish_date</code> Publish date
      - <code>author/publisher</code> Author or publisher
      - <code>topic/keywords</code> Topic or keywords
      - <code>description</code> Page description
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
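A hypothetical request body for this create-by-text call carrying the optional metadata fields described above; all values are invented and the endpoint path is omitted here:

```python
# Hypothetical payload for the create-by-text request documented above.
payload = {
    "name": "Dify Handbook",
    "text": "Full document text goes here.",
    "indexing_technique": "high_quality",
    "doc_type": "book",
    "doc_metadata": {
        "title": "Dify Handbook",
        "language": "en",
        "author": "Jane Doe",
        "publisher": "Example Press",
        "publication_date": "2024-01-01",
        "isbn": "978-0-00-000000-0",
        "category": "software",
    },
}
```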
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      Index mode
      - <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
      - <code>hierarchical_model</code> Parent-child mode
      - <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions

    - <code>doc_type</code> Type of document (optional)
      - <code>book</code> Book
        Document records a book or publication
      - <code>web_page</code> Web page
        Document records web page content
      - <code>paper</code> Academic paper/article
        Document records academic paper or research article
      - <code>social_media_post</code> Social media post
        Content from social media posts
      - <code>wikipedia_entry</code> Wikipedia entry
        Content from Wikipedia entries
      - <code>personal_document</code> Personal document
        Documents related to personal content
      - <code>business_document</code> Business document
        Documents related to business content
      - <code>im_chat_log</code> Chat log
        Records of instant messaging chats
      - <code>synced_from_notion</code> Notion document
        Documents synchronized from Notion
      - <code>synced_from_github</code> GitHub document
        Documents synchronized from GitHub
      - <code>others</code> Other document types
        Other document types not listed above
    - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
      Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
        Title of the book
      - <code>language</code> Book language
        Language of the book
      - <code>author</code> Book author
        Author of the book
      - <code>publisher</code> Publisher name
        Name of the publishing house
      - <code>publication_date</code> Publication date
        Date when the book was published
      - <code>isbn</code> ISBN number
        International Standard Book Number
      - <code>category</code> Book category
        Category or genre of the book
      For <code>web_page</code>:
      - <code>title</code> Page title
        Title of the web page
      - <code>url</code> Page URL
        URL address of the web page
      - <code>language</code> Page language
        Language of the web page
      - <code>publish_date</code> Publish date
        Date when the web page was published
      - <code>author/publisher</code> Author or publisher
        Author or publisher of the web page
      - <code>topic/keywords</code> Topic or keywords
        Topics or keywords of the web page
      - <code>description</code> Page description
        Description of the web page content
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    - <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
    - <code>process_rule</code> Processing rules

    <Property name='description' type='string' key='description'>
      Knowledge description (optional)
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      Type of document (optional):
      - <code>book</code> Book
      - <code>web_page</code> Web page
      - <code>paper</code> Academic paper/article
      - <code>social_media_post</code> Social media post
      - <code>wikipedia_entry</code> Wikipedia entry
      - <code>personal_document</code> Personal document
      - <code>business_document</code> Business document
      - <code>im_chat_log</code> Chat log
      - <code>synced_from_notion</code> Notion document
      - <code>synced_from_github</code> GitHub document
      - <code>others</code> Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      Document metadata (required if doc_type is provided). Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
      - <code>language</code> Book language
      - <code>author</code> Book author
      - <code>publisher</code> Publisher name
      - <code>publication_date</code> Publication date
      - <code>isbn</code> ISBN number
      - <code>category</code> Book category
      For <code>web_page</code>:
      - <code>title</code> Page title
      - <code>url</code> Page URL
      - <code>language</code> Page language
      - <code>publish_date</code> Publish date
      - <code>author/publisher</code> Author or publisher
      - <code>topic/keywords</code> Topic or keywords
      - <code>description</code> Page description
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      Index technique (optional)
      - <code>high_quality</code> High quality

    - <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
    - <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
    - <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
    - <code>doc_type</code> Type of document (optional)
      - <code>book</code> Book
        Document records a book or publication
      - <code>web_page</code> Web page
        Document records web page content
      - <code>paper</code> Academic paper/article
        Document records academic paper or research article
      - <code>social_media_post</code> Social media post
        Content from social media posts
      - <code>wikipedia_entry</code> Wikipedia entry
        Content from Wikipedia entries
      - <code>personal_document</code> Personal document
        Documents related to personal content
      - <code>business_document</code> Business document
        Documents related to business content
      - <code>im_chat_log</code> Chat log
        Records of instant messaging chats
      - <code>synced_from_notion</code> Notion document
        Documents synchronized from Notion
      - <code>synced_from_github</code> GitHub document
        Documents synchronized from GitHub
      - <code>others</code> Other document types
        Other document types not listed above
    - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
      Fields vary by doc_type:
      For <code>book</code>:
      - <code>title</code> Book title
        Title of the book
      - <code>language</code> Book language
        Language of the book
      - <code>author</code> Book author
        Author of the book
      - <code>publisher</code> Publisher name
        Name of the publishing house
      - <code>publication_date</code> Publication date
        Date when the book was published
      - <code>isbn</code> ISBN number
        International Standard Book Number
      - <code>category</code> Book category
        Category or genre of the book
      For <code>web_page</code>:
      - <code>title</code> Page title
        Title of the web page
      - <code>url</code> Page URL
        URL address of the web page
      - <code>language</code> Page language
        Language of the web page
      - <code>publish_date</code> Publish date
        Date when the web page was published
      - <code>author/publisher</code> Author or publisher
        Author or publisher of the web page
      - <code>topic/keywords</code> Topic or keywords
        Topics or keywords of the web page
      - <code>description</code> Page description
        Description of the web page content
      Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
      For doc_type "others", any valid JSON object is accepted
    </Property>
  </Properties>
</Col>

        "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
        "data_source_type": "upload_file",
        "name": "readme.txt",
        "doc_type": null
      }
    },
    "score": 3.730463140527718e-05,
    <Property name='text' type='string' key='text'>
      文档内容
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      文档类型(选填)
      - <code>book</code> 图书 Book
      - <code>web_page</code> 网页 Web page
      - <code>paper</code> 学术论文/文章 Academic paper/article
      - <code>social_media_post</code> 社交媒体帖子 Social media post
      - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
      - <code>personal_document</code> 个人文档 Personal document
      - <code>business_document</code> 商业文档 Business document
      - <code>im_chat_log</code> 即时通讯记录 Chat log
      - <code>synced_from_notion</code> Notion同步文档 Notion document
      - <code>synced_from_github</code> GitHub同步文档 GitHub document
      - <code>others</code> 其他文档类型 Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
      针对图书 For <code>book</code>:
      - <code>title</code> 书名 Book title
      - <code>language</code> 图书语言 Book language
      - <code>author</code> 作者 Book author
      - <code>publisher</code> 出版社 Publisher name
      - <code>publication_date</code> 出版日期 Publication date
      - <code>isbn</code> ISBN号码 ISBN number
      - <code>category</code> 图书分类 Book category
      针对网页 For <code>web_page</code>:
      - <code>title</code> 页面标题 Page title
      - <code>url</code> 页面网址 Page URL
      - <code>language</code> 页面语言 Page language
      - <code>publish_date</code> 发布日期 Publish date
      - <code>author/publisher</code> 作者/发布者 Author or publisher
      - <code>topic/keywords</code> 主题/关键词 Topic or keywords
      - <code>description</code> 页面描述 Page description
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
    <Property name='indexing_technique' type='string' key='indexing_technique'>
      索引方式
      - <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
      - <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
      - <code>hierarchical_model</code> parent-child 模式
      - <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding

    - <code>doc_type</code> 文档类型(选填)Type of document (optional)
      - <code>book</code> 图书
        文档记录一本书籍或出版物
      - <code>web_page</code> 网页
        网页内容的文档记录
      - <code>paper</code> 学术论文/文章
        学术论文或研究文章的记录
      - <code>social_media_post</code> 社交媒体帖子
        社交媒体上的帖子内容
      - <code>wikipedia_entry</code> 维基百科条目
        维基百科的词条内容
      - <code>personal_document</code> 个人文档
        个人相关的文档记录
      - <code>business_document</code> 商业文档
        商业相关的文档记录
      - <code>im_chat_log</code> 即时通讯记录
        即时通讯的聊天记录
      - <code>synced_from_notion</code> Notion同步文档
        从Notion同步的文档内容
      - <code>synced_from_github</code> GitHub同步文档
        从GitHub同步的文档内容
      - <code>others</code> 其他文档类型
        其他未列出的文档类型
    - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
      字段因文档类型而异
      针对图书类型 For <code>book</code>:
      - <code>title</code> 书名
        书籍的标题
      - <code>language</code> 图书语言
        书籍的语言
      - <code>author</code> 作者
        书籍的作者
      - <code>publisher</code> 出版社
        出版社的名称
      - <code>publication_date</code> 出版日期
        书籍的出版日期
      - <code>isbn</code> ISBN号码
        书籍的ISBN编号
      - <code>category</code> 图书分类
        书籍的分类类别
      针对网页类型 For <code>web_page</code>:
      - <code>title</code> 页面标题
        网页的标题
      - <code>url</code> 页面网址
        网页的URL地址
      - <code>language</code> 页面语言
        网页的语言
      - <code>publish_date</code> 发布日期
        网页的发布日期
      - <code>author/publisher</code> 作者/发布者
        网页的作者或发布者
      - <code>topic/keywords</code> 主题/关键词
        网页的主题或关键词
      - <code>description</code> 页面描述
        网页的描述信息
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    - <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>

    <Property name='text' type='string' key='text'>
      文档内容(选填)
    </Property>
    <Property name='doc_type' type='string' key='doc_type'>
      文档类型(选填)
      - <code>book</code> 图书 Book
      - <code>web_page</code> 网页 Web page
      - <code>paper</code> 学术论文/文章 Academic paper/article
      - <code>social_media_post</code> 社交媒体帖子 Social media post
      - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
      - <code>personal_document</code> 个人文档 Personal document
      - <code>business_document</code> 商业文档 Business document
      - <code>im_chat_log</code> 即时通讯记录 Chat log
      - <code>synced_from_notion</code> Notion同步文档 Notion document
      - <code>synced_from_github</code> GitHub同步文档 GitHub document
      - <code>others</code> 其他文档类型 Other document types
    </Property>
    <Property name='doc_metadata' type='object' key='doc_metadata'>
      文档元数据(如提供文档类型则必填)。字段因文档类型而异:
      针对图书 For <code>book</code>:
      - <code>title</code> 书名 Book title
      - <code>language</code> 图书语言 Book language
      - <code>author</code> 作者 Book author
      - <code>publisher</code> 出版社 Publisher name
      - <code>publication_date</code> 出版日期 Publication date
      - <code>isbn</code> ISBN号码 ISBN number
      - <code>category</code> 图书分类 Book category
      针对网页 For <code>web_page</code>:
      - <code>title</code> 页面标题 Page title
      - <code>url</code> 页面网址 Page URL
      - <code>language</code> 页面语言 Page language
      - <code>publish_date</code> 发布日期 Publish date
      - <code>author/publisher</code> 作者/发布者 Author or publisher
      - <code>topic/keywords</code> 主题/关键词 Topic or keywords
      - <code>description</code> 页面描述 Page description
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
    <Property name='process_rule' type='object' key='process_rule'>
      处理规则(选填)
      - <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
      - <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
      - <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
      - <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
    - <code>doc_type</code> 文档类型(选填)Type of document (optional)
      - <code>book</code> 图书
        文档记录一本书籍或出版物
      - <code>web_page</code> 网页
        网页内容的文档记录
      - <code>paper</code> 学术论文/文章
        学术论文或研究文章的记录
      - <code>social_media_post</code> 社交媒体帖子
        社交媒体上的帖子内容
      - <code>wikipedia_entry</code> 维基百科条目
        维基百科的词条内容
      - <code>personal_document</code> 个人文档
        个人相关的文档记录
      - <code>business_document</code> 商业文档
        商业相关的文档记录
      - <code>im_chat_log</code> 即时通讯记录
        即时通讯的聊天记录
      - <code>synced_from_notion</code> Notion同步文档
        从Notion同步的文档内容
      - <code>synced_from_github</code> GitHub同步文档
        从GitHub同步的文档内容
      - <code>others</code> 其他文档类型
        其他未列出的文档类型
    - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
      字段因文档类型而异
      针对图书类型 For <code>book</code>:
      - <code>title</code> 书名
        书籍的标题
      - <code>language</code> 图书语言
        书籍的语言
      - <code>author</code> 作者
        书籍的作者
      - <code>publisher</code> 出版社
        出版社的名称
      - <code>publication_date</code> 出版日期
        书籍的出版日期
      - <code>isbn</code> ISBN号码
        书籍的ISBN编号
      - <code>category</code> 图书分类
        书籍的分类类别
      针对网页类型 For <code>web_page</code>:
      - <code>title</code> 页面标题
        网页的标题
      - <code>url</code> 页面网址
        网页的URL地址
      - <code>language</code> 页面语言
        网页的语言
      - <code>publish_date</code> 发布日期
        网页的发布日期
      - <code>author/publisher</code> 作者/发布者
        网页的作者或发布者
      - <code>topic/keywords</code> 主题/关键词
        网页的主题或关键词
      - <code>description</code> 页面描述
        网页的描述信息
      请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
      针对"其他"类型文档,接受任何有效的JSON对象
    </Property>
  </Properties>
</Col>

        "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
        "data_source_type": "upload_file",
        "name": "readme.txt",
        "doc_type": null
      }
    },
    "score": 3.730463140527718e-05,