| @@ -3,7 +3,7 @@ import json | |||
| from flask import request | |||
| from flask_restful import marshal, reqparse | |||
| from sqlalchemy import desc, select | |||
| from werkzeug.exceptions import NotFound | |||
| from werkzeug.exceptions import Forbidden, NotFound | |||
| import services | |||
| from controllers.common.errors import FilenameNotExistsError | |||
| @@ -18,6 +18,7 @@ from controllers.service_api.app.error import ( | |||
| from controllers.service_api.dataset.error import ( | |||
| ArchivedDocumentImmutableError, | |||
| DocumentIndexingError, | |||
| InvalidMetadataError, | |||
| ) | |||
| from controllers.service_api.wraps import ( | |||
| DatasetApiResource, | |||
| @@ -466,6 +467,101 @@ class DocumentIndexingStatusApi(DatasetApiResource): | |||
| return data | |||
class DocumentDetailApi(DatasetApiResource):
    """Service API resource returning the detail of a single dataset document.

    The ``metadata`` query argument selects the shape of the response:

    * ``all`` (default): every detail field plus ``doc_type`` and ``doc_metadata``.
    * ``only``: just ``id``, ``doc_type`` and ``doc_metadata``.
    * ``without``: every detail field except ``doc_type`` and ``doc_metadata``.
    """

    METADATA_CHOICES = {"all", "only", "without"}

    @staticmethod
    def _to_timestamp(value):
        """Return *value* as whole epoch seconds, or ``None`` when it is unset."""
        return int(value.timestamp()) if value else None

    def _build_detail(self, dataset_id, document, include_metadata):
        """Assemble the detail payload shared by the ``all`` and ``without`` modes.

        Args:
            dataset_id: Dataset ID (string) used to look up the dataset process rules.
            document: The document whose attributes are serialized.
            include_metadata: When true, ``doc_type`` and ``doc_metadata`` are
                added between ``archived`` and ``segment_count``.

        Returns:
            A dict ready to be returned from the resource handler.
        """
        dataset_process_rules = DatasetService.get_process_rules(dataset_id)

        # A document may have no process rule attached; calling ``to_dict()`` on
        # ``None`` would turn that into a 500, so fall back to an empty mapping.
        process_rule = document.dataset_process_rule
        document_process_rules = process_rule.to_dict() if process_rule else {}

        detail = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": document.data_source_detail_dict,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Integer seconds, consistent with the other ``*_at`` fields below.
            "created_at": self._to_timestamp(document.created_at),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": self._to_timestamp(document.completed_at),
            "updated_at": self._to_timestamp(document.updated_at),
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": self._to_timestamp(document.disabled_at),
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }

        # Metadata attributes are only read when requested, so the ``without``
        # mode never touches ``doc_metadata_details``.
        if include_metadata:
            detail["doc_type"] = document.doc_type
            detail["doc_metadata"] = document.doc_metadata_details

        detail.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return detail

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of one document in a dataset.

        Args:
            tenant_id: Tenant making the request.
            dataset_id: ID of the dataset that owns the document.
            document_id: ID of the document to describe.

        Returns:
            A dict whose shape depends on the ``metadata`` query argument
            (see the class docstring).

        Raises:
            NotFound: The document does not exist (``get_dataset`` is also
                called first and may raise for the dataset).
            Forbidden: The document belongs to a different tenant.
            InvalidMetadataError: ``metadata`` is not one of ``METADATA_CHOICES``.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = self.get_dataset(dataset_id, tenant_id)

        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            return {
                "id": document.id,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
            }

        return self._build_detail(dataset_id, document, include_metadata=(metadata == "all"))
| api.add_resource( | |||
| DocumentAddByTextApi, | |||
| "/datasets/<uuid:dataset_id>/document/create_by_text", | |||
| @@ -489,3 +585,4 @@ api.add_resource( | |||
# Document-level routes of the dataset service API.
api.add_resource(
    DocumentDeleteApi,
    "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>",
)
api.add_resource(
    DocumentListApi,
    "/datasets/<uuid:dataset_id>/documents",
)
api.add_resource(
    DocumentIndexingStatusApi,
    "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status",
)
# NOTE(review): same URL rule as DocumentDeleteApi above; the two resources are
# presumably told apart by HTTP method (DocumentDetailApi defines only `get`).
# Confirm DocumentDeleteApi exposes no GET handler.
api.add_resource(
    DocumentDetailApi,
    "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>",
)
| @@ -11,13 +11,13 @@ from flask_restful import Resource | |||
| from pydantic import BaseModel | |||
| from sqlalchemy import select, update | |||
| from sqlalchemy.orm import Session | |||
| from werkzeug.exceptions import Forbidden, Unauthorized | |||
| from werkzeug.exceptions import Forbidden, NotFound, Unauthorized | |||
| from extensions.ext_database import db | |||
| from extensions.ext_redis import redis_client | |||
| from libs.login import _get_user | |||
| from models.account import Account, Tenant, TenantAccountJoin, TenantStatus | |||
| from models.dataset import RateLimitLog | |||
| from models.dataset import Dataset, RateLimitLog | |||
| from models.model import ApiToken, App, EndUser | |||
| from services.feature_service import FeatureService | |||
| @@ -317,3 +317,11 @@ def create_or_update_end_user_for_user_id(app_model: App, user_id: Optional[str] | |||
class DatasetApiResource(Resource):
    """Base resource for dataset service-API endpoints.

    ``validate_dataset_token`` is listed in ``method_decorators`` so it is
    applied to the HTTP method handlers of every subclass.
    """

    method_decorators = [validate_dataset_token]

    def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
        """Fetch the dataset that matches both ``dataset_id`` and ``tenant_id``.

        Args:
            dataset_id: ID of the dataset to load.
            tenant_id: Tenant the dataset must belong to.

        Returns:
            The first matching ``Dataset`` row.

        Raises:
            NotFound: No dataset satisfies both conditions.
        """
        candidates = db.session.query(Dataset).filter(
            Dataset.id == dataset_id,
            Dataset.tenant_id == tenant_id,
        )
        found = candidates.first()
        if found:
            return found
        raise NotFound("Dataset not found.")
| @@ -1124,6 +1124,129 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||
| method='GET' | |||
| title='Get Document Detail' | |||
| name='#get-document-detail' | |||
| /> | |||
| <Row> | |||
| <Col> | |||
| Get a document's detail. | |||
| ### Path | |||
| - `dataset_id` (string) Dataset ID | |||
| - `document_id` (string) Document ID | |||
| ### Query | |||
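| - `metadata` (string) Metadata filter; one of `all`, `only`, or `without`. `all` returns every field including `doc_type` and `doc_metadata`, `only` returns just `id`, `doc_type`, and `doc_metadata`, and `without` returns every field except `doc_type` and `doc_metadata`. Default is `all`. | |||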
| ### Response | |||
| Returns the document's detail. | |||
| </Col> | |||
| <Col sticky> | |||
| ### Request Example | |||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||
| ```bash {{ title: 'cURL' }} | |||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||
| -H 'Authorization: Bearer {api_key}' | |||
| ``` | |||
| </CodeGroup> | |||
| ### Response Example | |||
| <CodeGroup title="Response"> | |||
| ```json {{ title: 'Response' }} | |||
| { | |||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||
| "position": 1, | |||
| "data_source_type": "upload_file", | |||
| "data_source_info": { | |||
| "upload_file": { | |||
| ... | |||
| } | |||
| }, | |||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_process_rule": { | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "document_process_rule": { | |||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "name": "xxxx", | |||
| "created_from": "web", | |||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||
| "created_at": 1750464191, | |||
| "tokens": null, | |||
| "indexing_status": "waiting", | |||
| "completed_at": null, | |||
| "updated_at": 1750464191, | |||
| "indexing_latency": null, | |||
| "error": null, | |||
| "enabled": true, | |||
| "disabled_at": null, | |||
| "disabled_by": null, | |||
| "archived": false, | |||
| "segment_count": 0, | |||
| "average_segment_length": 0, | |||
| "hit_count": null, | |||
| "display_status": "queuing", | |||
| "doc_form": "hierarchical_model", | |||
| "doc_language": "Chinese Simplified" | |||
| } | |||
| ``` | |||
| </CodeGroup> | |||
| </Col> | |||
| </Row> | |||
| ___ | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||
| method='PATCH' | |||
| @@ -881,6 +881,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||
| method='GET' | |||
| title='ドキュメントの詳細を取得' | |||
| name='#get-document-detail' | |||
| /> | |||
| <Row> | |||
| <Col> | |||
| ドキュメントの詳細を取得します。 | |||
| ### Path | |||
| - `dataset_id` (string) ナレッジベースID | |||
| - `document_id` (string) ドキュメントID | |||
| ### Query | |||
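| - `metadata` (string) メタデータのフィルター条件。`all`、`only`、`without` のいずれかを指定します。`all` は `doc_type` と `doc_metadata` を含むすべてのフィールドを返し、`only` は `id`、`doc_type`、`doc_metadata` のみを返し、`without` は `doc_type` と `doc_metadata` を除いたフィールドを返します。デフォルトは `all` です。 | |||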
| ### Response | |||
| ナレッジベースドキュメントの詳細を返します。 | |||
| </Col> | |||
| <Col sticky> | |||
| ### Request Example | |||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||
| ```bash {{ title: 'cURL' }} | |||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||
| -H 'Authorization: Bearer {api_key}' | |||
| ``` | |||
| </CodeGroup> | |||
| ### Response Example | |||
| <CodeGroup title="Response"> | |||
| ```json {{ title: 'Response' }} | |||
| { | |||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||
| "position": 1, | |||
| "data_source_type": "upload_file", | |||
| "data_source_info": { | |||
| "upload_file": { | |||
| ... | |||
| } | |||
| }, | |||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_process_rule": { | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "document_process_rule": { | |||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "name": "xxxx", | |||
| "created_from": "web", | |||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||
| "created_at": 1750464191, | |||
| "tokens": null, | |||
| "indexing_status": "waiting", | |||
| "completed_at": null, | |||
| "updated_at": 1750464191, | |||
| "indexing_latency": null, | |||
| "error": null, | |||
| "enabled": true, | |||
| "disabled_at": null, | |||
| "disabled_by": null, | |||
| "archived": false, | |||
| "segment_count": 0, | |||
| "average_segment_length": 0, | |||
| "hit_count": null, | |||
| "display_status": "queuing", | |||
| "doc_form": "hierarchical_model", | |||
| "doc_language": "Chinese Simplified" | |||
| } | |||
| ``` | |||
| </CodeGroup> | |||
| </Col> | |||
| </Row> | |||
| ___ | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||
| method='PATCH' | |||
| @@ -1131,6 +1131,130 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/{document_id}' | |||
| method='GET' | |||
| title='获取文档详情' | |||
| name='#get-document-detail' | |||
| /> | |||
| <Row> | |||
| <Col> | |||
| 获取文档详情。 | |||
| ### Path | |||
| - `dataset_id` (string) 知识库 ID | |||
| - `document_id` (string) 文档 ID | |||
| ### Query | |||
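| - `metadata` (string) metadata 过滤条件，可选值为 `all`、`only` 或 `without`。`all` 返回包含 `doc_type` 和 `doc_metadata` 在内的全部字段，`only` 仅返回 `id`、`doc_type` 和 `doc_metadata`，`without` 返回除 `doc_type` 和 `doc_metadata` 之外的全部字段。默认为 `all`。 | |||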
| ### Response | |||
| 返回知识库文档的详情。 | |||
| </Col> | |||
| <Col sticky> | |||
| ### Request Example | |||
| <CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}> | |||
| ```bash {{ title: 'cURL' }} | |||
| curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \ | |||
| -H 'Authorization: Bearer {api_key}' | |||
| ``` | |||
| </CodeGroup> | |||
| ### Response Example | |||
| <CodeGroup title="Response"> | |||
| ```json {{ title: 'Response' }} | |||
| { | |||
| "id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2", | |||
| "position": 1, | |||
| "data_source_type": "upload_file", | |||
| "data_source_info": { | |||
| "upload_file": { | |||
| ... | |||
| } | |||
| }, | |||
| "dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_process_rule": { | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "document_process_rule": { | |||
| "id": "24b99906-845e-499f-9e3c-d5565dd6962c", | |||
| "dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9", | |||
| "mode": "hierarchical", | |||
| "rules": { | |||
| "pre_processing_rules": [ | |||
| { | |||
| "id": "remove_extra_spaces", | |||
| "enabled": true | |||
| }, | |||
| { | |||
| "id": "remove_urls_emails", | |||
| "enabled": false | |||
| } | |||
| ], | |||
| "segmentation": { | |||
| "separator": "**********page_ending**********", | |||
| "max_tokens": 1024, | |||
| "chunk_overlap": 0 | |||
| }, | |||
| "parent_mode": "paragraph", | |||
| "subchunk_segmentation": { | |||
| "separator": "\n", | |||
| "max_tokens": 512, | |||
| "chunk_overlap": 0 | |||
| } | |||
| } | |||
| }, | |||
| "name": "xxxx", | |||
| "created_from": "web", | |||
| "created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0", | |||
| "created_at": 1750464191, | |||
| "tokens": null, | |||
| "indexing_status": "waiting", | |||
| "completed_at": null, | |||
| "updated_at": 1750464191, | |||
| "indexing_latency": null, | |||
| "error": null, | |||
| "enabled": true, | |||
| "disabled_at": null, | |||
| "disabled_by": null, | |||
| "archived": false, | |||
| "segment_count": 0, | |||
| "average_segment_length": 0, | |||
| "hit_count": null, | |||
| "display_status": "queuing", | |||
| "doc_form": "hierarchical_model", | |||
| "doc_language": "Chinese Simplified" | |||
| } | |||
| ``` | |||
| </CodeGroup> | |||
| </Col> | |||
| </Row> | |||
| ___ | |||
| <hr className='ml-0 mr-0' /> | |||
| <Heading | |||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||
| method='PATCH' | |||