Co-authored-by: lizb <lizb@sugon.com>tags/1.5.1
| from flask import request | from flask import request | ||||
| from flask_restful import marshal, reqparse | from flask_restful import marshal, reqparse | ||||
| from sqlalchemy import desc, select | from sqlalchemy import desc, select | ||||
| from werkzeug.exceptions import NotFound | |||||
| from werkzeug.exceptions import Forbidden, NotFound | |||||
| import services | import services | ||||
| from controllers.common.errors import FilenameNotExistsError | from controllers.common.errors import FilenameNotExistsError | ||||
| from controllers.service_api.dataset.error import ( | from controllers.service_api.dataset.error import ( | ||||
| ArchivedDocumentImmutableError, | ArchivedDocumentImmutableError, | ||||
| DocumentIndexingError, | DocumentIndexingError, | ||||
| InvalidMetadataError, | |||||
| ) | ) | ||||
| from controllers.service_api.wraps import ( | from controllers.service_api.wraps import ( | ||||
| DatasetApiResource, | DatasetApiResource, | ||||
| return data | return data | ||||
class DocumentDetailApi(DatasetApiResource):
    """Service-API resource returning the detail of one document in a dataset.

    The ``metadata`` query argument selects the response shape:

    * ``all`` (default) -- every detail field, including ``doc_type`` and
      ``doc_metadata``.
    * ``only`` -- just ``id``, ``doc_type`` and ``doc_metadata``.
    * ``without`` -- every detail field except ``doc_type`` and
      ``doc_metadata``.
    """

    METADATA_CHOICES = {"all", "only", "without"}

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of ``document_id`` inside ``dataset_id``.

        Args:
            tenant_id: Tenant the request is made for.
            dataset_id: Dataset identifier from the URL; converted to ``str``.
            document_id: Document identifier from the URL; converted to ``str``.

        Returns:
            A dict whose fields depend on the ``metadata`` query argument.

        Raises:
            NotFound: The document lookup returned nothing. ``get_dataset``
                is called first and may raise on its own.
            Forbidden: The document's ``tenant_id`` differs from ``tenant_id``.
            InvalidMetadataError: ``metadata`` is not in ``METADATA_CHOICES``.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = self.get_dataset(dataset_id, tenant_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")
        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            return {
                "id": document.id,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
            }
        return self._build_detail(document, dataset_id, include_metadata=(metadata == "all"))

    @staticmethod
    def _to_epoch(moment):
        """Return ``moment`` as whole epoch seconds, or ``None`` when it is unset."""
        return int(moment.timestamp()) if moment else None

    @staticmethod
    def _build_detail(document, dataset_id, include_metadata):
        """Build the detail payload shared by the ``all`` and ``without`` modes.

        ``doc_type`` and ``doc_metadata`` are inserted between ``archived`` and
        ``segment_count`` only when ``include_metadata`` is true, so the key
        order of both modes stays the same as before.
        """
        dataset_process_rules = DatasetService.get_process_rules(dataset_id)
        # NOTE(review): guards against a document without a process rule, where
        # ``.to_dict()`` would raise AttributeError -- confirm that the model
        # can actually return None here.
        process_rule = document.dataset_process_rule
        document_process_rules = process_rule.to_dict() if process_rule else {}
        data_source_info = document.data_source_detail_dict

        to_epoch = DocumentDetailApi._to_epoch
        detail = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": data_source_info,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Whole seconds, like every other timestamp in this payload.
            "created_at": int(document.created_at.timestamp()),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": to_epoch(document.completed_at),
            "updated_at": to_epoch(document.updated_at),
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": to_epoch(document.disabled_at),
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }
        if include_metadata:
            detail["doc_type"] = document.doc_type
            detail["doc_metadata"] = document.doc_metadata_details
        detail.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return detail
| api.add_resource( | api.add_resource( | ||||
| DocumentAddByTextApi, | DocumentAddByTextApi, | ||||
| "/datasets/<uuid:dataset_id>/document/create_by_text", | "/datasets/<uuid:dataset_id>/document/create_by_text", | ||||
# Register the document routes. The delete and detail resources are both
# bound to the same URL.
for _resource, _route in (
    (DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>"),
    (DocumentListApi, "/datasets/<uuid:dataset_id>/documents"),
    (DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status"),
    (DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>"),
):
    api.add_resource(_resource, _route)
| from pydantic import BaseModel | from pydantic import BaseModel | ||||
| from sqlalchemy import select, update | from sqlalchemy import select, update | ||||
| from sqlalchemy.orm import Session | from sqlalchemy.orm import Session | ||||
| from werkzeug.exceptions import Forbidden, Unauthorized | |||||
| from werkzeug.exceptions import Forbidden, NotFound, Unauthorized | |||||
| from extensions.ext_database import db | from extensions.ext_database import db | ||||
| from extensions.ext_redis import redis_client | from extensions.ext_redis import redis_client | ||||
| from libs.login import _get_user | from libs.login import _get_user | ||||
| from models.account import Account, Tenant, TenantAccountJoin, TenantStatus | from models.account import Account, Tenant, TenantAccountJoin, TenantStatus | ||||
| from models.dataset import RateLimitLog | |||||
| from models.dataset import Dataset, RateLimitLog | |||||
| from models.model import ApiToken, App, EndUser | from models.model import ApiToken, App, EndUser | ||||
| from services.feature_service import FeatureService | from services.feature_service import FeatureService | ||||
class DatasetApiResource(Resource):
    """Base resource for the dataset service-API endpoints.

    ``validate_dataset_token`` is listed in ``method_decorators``, which
    Flask-RESTful applies to the resource's handler methods.
    """

    method_decorators = [validate_dataset_token]

    def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
        """Return the dataset with ``dataset_id`` that belongs to ``tenant_id``.

        Raises:
            NotFound: No dataset matches both the id and the tenant.
        """
        tenant_scoped = db.session.query(Dataset).filter(
            Dataset.id == dataset_id,
            Dataset.tenant_id == tenant_id,
        )
        found = tenant_scoped.first()
        if found:
            return found
        raise NotFound("Dataset not found.")
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||||
| method='GET' | |||||
| title='Get Document Detail' | |||||
| name='#get-document-detail' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| Get a document's detail. | |||||
| ### Path | |||||
| - `dataset_id` (string) Dataset ID | |||||
| - `document_id` (string) Document ID | |||||
| ### Query | |||||
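| - `metadata` (string) Metadata filter: `all` returns every field including `doc_type` and `doc_metadata`, `only` returns just `id`, `doc_type` and `doc_metadata`, and `without` returns every field except `doc_type` and `doc_metadata`. Default is `all`. | |||||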
| ### Response | |||||
| Returns the document's detail. | |||||
| </Col> | |||||
| <Col sticky> | |||||
| ### Request Example | |||||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||||
| -H 'Authorization: Bearer {api_key}' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| ### Response Example | |||||
| <CodeGroup title="Response"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||||
| "position": 1, | |||||
| "data_source_type": "upload_file", | |||||
| "data_source_info": { | |||||
| "upload_file": { | |||||
| ... | |||||
| } | |||||
| }, | |||||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_process_rule": { | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "document_process_rule": { | |||||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "name": "xxxx", | |||||
| "created_from": "web", | |||||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||||
| "created_at": 1750464191, | |||||
| "tokens": null, | |||||
| "indexing_status": "waiting", | |||||
| "completed_at": null, | |||||
| "updated_at": 1750464191, | |||||
| "indexing_latency": null, | |||||
| "error": null, | |||||
| "enabled": true, | |||||
| "disabled_at": null, | |||||
| "disabled_by": null, | |||||
| "archived": false, | |||||
| "segment_count": 0, | |||||
| "average_segment_length": 0, | |||||
| "hit_count": null, | |||||
| "display_status": "queuing", | |||||
| "doc_form": "hierarchical_model", | |||||
| "doc_language": "Chinese Simplified" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| ___ | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/status/{action}' | url='/datasets/{dataset_id}/documents/status/{action}' | ||||
| method='PATCH' | method='PATCH' |
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||||
| method='GET' | |||||
| title='ドキュメントの詳細を取得' | |||||
| name='#get-document-detail' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| ドキュメントの詳細を取得します。 | |||||
| ### Path | |||||
| - `dataset_id` (string) ナレッジベースID | |||||
| - `document_id` (string) ドキュメントID | |||||
| ### Query | |||||
| - `metadata` (string) metadataのフィルター条件 `all`、`only`、または`without`。デフォルトは `all`。 | |||||
| ### Response | |||||
| ナレッジベースドキュメントの詳細を返します。 | |||||
| </Col> | |||||
| <Col sticky> | |||||
| ### Request Example | |||||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||||
| -H 'Authorization: Bearer {api_key}' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| ### Response Example | |||||
| <CodeGroup title="Response"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||||
| "position": 1, | |||||
| "data_source_type": "upload_file", | |||||
| "data_source_info": { | |||||
| "upload_file": { | |||||
| ... | |||||
| } | |||||
| }, | |||||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_process_rule": { | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "document_process_rule": { | |||||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "name": "xxxx", | |||||
| "created_from": "web", | |||||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||||
| "created_at": 1750464191, | |||||
| "tokens": null, | |||||
| "indexing_status": "waiting", | |||||
| "completed_at": null, | |||||
| "updated_at": 1750464191, | |||||
| "indexing_latency": null, | |||||
| "error": null, | |||||
| "enabled": true, | |||||
| "disabled_at": null, | |||||
| "disabled_by": null, | |||||
| "archived": false, | |||||
| "segment_count": 0, | |||||
| "average_segment_length": 0, | |||||
| "hit_count": null, | |||||
| "display_status": "queuing", | |||||
| "doc_form": "hierarchical_model", | |||||
| "doc_language": "Chinese Simplified" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| ___ | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/status/{action}' | url='/datasets/{dataset_id}/documents/status/{action}' | ||||
| method='PATCH' | method='PATCH' |
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||||
| method='GET' | |||||
| title='获取文档详情' | |||||
| name='#get-document-detail' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| 获取文档详情。 | |||||
| ### Path | |||||
| - `dataset_id` (string) 知识库 ID | |||||
| - `document_id` (string) 文档 ID | |||||
| ### Query | |||||
| - `metadata` (string) metadata 过滤条件，可选值为 `all`、`only` 或 `without`，默认为 `all`。 | |||||
| ### Response | |||||
| 返回知识库文档的详情。 | |||||
| </Col> | |||||
| <Col sticky> | |||||
| ### Request Example | |||||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||||
| -H 'Authorization: Bearer {api_key}' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| ### Response Example | |||||
| <CodeGroup title="Response"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||||
| "position": 1, | |||||
| "data_source_type": "upload_file", | |||||
| "data_source_info": { | |||||
| "upload_file": { | |||||
| ... | |||||
| } | |||||
| }, | |||||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_process_rule": { | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "document_process_rule": { | |||||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||||
| "mode": "hierarchical", | |||||
| "rules": { | |||||
| "pre_processing_rules": [ | |||||
| { | |||||
| "id": "remove_extra_spaces", | |||||
| "enabled": true | |||||
| }, | |||||
| { | |||||
| "id": "remove_urls_emails", | |||||
| "enabled": false | |||||
| } | |||||
| ], | |||||
| "segmentation": { | |||||
| "separator": "**********page_ending**********", | |||||
| "max_tokens": 1024, | |||||
| "chunk_overlap": 0 | |||||
| }, | |||||
| "parent_mode": "paragraph", | |||||
| "subchunk_segmentation": { | |||||
| "separator": "\n", | |||||
| "max_tokens": 512, | |||||
| "chunk_overlap": 0 | |||||
| } | |||||
| } | |||||
| }, | |||||
| "name": "xxxx", | |||||
| "created_from": "web", | |||||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||||
| "created_at": 1750464191, | |||||
| "tokens": null, | |||||
| "indexing_status": "waiting", | |||||
| "completed_at": null, | |||||
| "updated_at": 1750464191, | |||||
| "indexing_latency": null, | |||||
| "error": null, | |||||
| "enabled": true, | |||||
| "disabled_at": null, | |||||
| "disabled_by": null, | |||||
| "archived": false, | |||||
| "segment_count": 0, | |||||
| "average_segment_length": 0, | |||||
| "hit_count": null, | |||||
| "display_status": "queuing", | |||||
| "doc_form": "hierarchical_model", | |||||
| "doc_language": "Chinese Simplified" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| ___ | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/status/{action}' | url='/datasets/{dataset_id}/documents/status/{action}' | ||||
| method='PATCH' | method='PATCH' |