Browse Source

Add get document detail service api (#21700)

Co-authored-by: lizb <lizb@sugon.com>
tags/1.5.1
Ganondorf 4 months ago
parent
commit
55a6b330ec
No account linked to committer's email address

+ 98
- 1
api/controllers/service_api/dataset/document.py View File

from flask import request from flask import request
from flask_restful import marshal, reqparse from flask_restful import marshal, reqparse
from sqlalchemy import desc, select from sqlalchemy import desc, select
from werkzeug.exceptions import NotFound
from werkzeug.exceptions import Forbidden, NotFound


import services import services
from controllers.common.errors import FilenameNotExistsError from controllers.common.errors import FilenameNotExistsError
from controllers.service_api.dataset.error import ( from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError, ArchivedDocumentImmutableError,
DocumentIndexingError, DocumentIndexingError,
InvalidMetadataError,
) )
from controllers.service_api.wraps import ( from controllers.service_api.wraps import (
DatasetApiResource, DatasetApiResource,
return data return data




class DocumentDetailApi(DatasetApiResource):
    """Service API resource that returns the detail of one document in a dataset."""

    # Accepted values of the ``metadata`` query argument.
    METADATA_CHOICES = {"all", "only", "without"}

    def get(self, tenant_id, dataset_id, document_id):
        """Return the detail of a document as a JSON-serializable dict.

        The ``metadata`` query argument (default ``"all"``) selects the shape:

        * ``"only"``    -- just ``id``, ``doc_type`` and ``doc_metadata``.
        * ``"without"`` -- every detail field except ``doc_type`` / ``doc_metadata``.
        * ``"all"``     -- every detail field.

        Raises:
            NotFound: the dataset (via ``get_dataset``) or the document does not exist.
            Forbidden: the document belongs to a different tenant.
            InvalidMetadataError: ``metadata`` is not one of ``METADATA_CHOICES``.
        """
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = self.get_dataset(dataset_id, tenant_id)

        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound("Document not found.")

        if document.tenant_id != str(tenant_id):
            raise Forbidden("No permission.")

        metadata = request.args.get("metadata", "all")
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f"Invalid metadata value: {metadata}")

        if metadata == "only":
            return {
                "id": document.id,
                "doc_type": document.doc_type,
                "doc_metadata": document.doc_metadata_details,
            }

        return self._build_detail(document, dataset_id, include_metadata=(metadata == "all"))

    @staticmethod
    def _epoch_seconds(moment):
        """Return ``moment`` as whole seconds since the epoch, or ``None`` when it is unset."""
        return int(moment.timestamp()) if moment else None

    @classmethod
    def _build_detail(cls, document, dataset_id, *, include_metadata):
        """Build the full detail payload, optionally including the metadata fields."""
        dataset_process_rules = DatasetService.get_process_rules(dataset_id)
        # Read the rule once and guard it: calling ``.to_dict()`` on a missing rule
        # would otherwise surface as an unhandled AttributeError (HTTP 500).
        process_rule = document.dataset_process_rule
        document_process_rules = process_rule.to_dict() if process_rule else {}
        data_source_info = document.data_source_detail_dict

        detail = {
            "id": document.id,
            "position": document.position,
            "data_source_type": document.data_source_type,
            "data_source_info": data_source_info,
            "dataset_process_rule_id": document.dataset_process_rule_id,
            "dataset_process_rule": dataset_process_rules,
            "document_process_rule": document_process_rules,
            "name": document.name,
            "created_from": document.created_from,
            "created_by": document.created_by,
            # Integer seconds, consistent with the other timestamp fields below.
            "created_at": cls._epoch_seconds(document.created_at),
            "tokens": document.tokens,
            "indexing_status": document.indexing_status,
            "completed_at": cls._epoch_seconds(document.completed_at),
            "updated_at": cls._epoch_seconds(document.updated_at),
            "indexing_latency": document.indexing_latency,
            "error": document.error,
            "enabled": document.enabled,
            "disabled_at": cls._epoch_seconds(document.disabled_at),
            "disabled_by": document.disabled_by,
            "archived": document.archived,
        }
        if include_metadata:
            # Inserted here so the key order matches the original "all" response.
            detail["doc_type"] = document.doc_type
            detail["doc_metadata"] = document.doc_metadata_details
        detail.update(
            {
                "segment_count": document.segment_count,
                "average_segment_length": document.average_segment_length,
                "hit_count": document.hit_count,
                "display_status": document.display_status,
                "doc_form": document.doc_form,
                "doc_language": document.doc_language,
            }
        )
        return detail


api.add_resource( api.add_resource(
DocumentAddByTextApi, DocumentAddByTextApi,
"/datasets/<uuid:dataset_id>/document/create_by_text", "/datasets/<uuid:dataset_id>/document/create_by_text",
api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>") api.add_resource(DocumentDeleteApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")
api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents") api.add_resource(DocumentListApi, "/datasets/<uuid:dataset_id>/documents")
api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status") api.add_resource(DocumentIndexingStatusApi, "/datasets/<uuid:dataset_id>/documents/<string:batch>/indexing-status")
# Document detail shares its URL rule with DocumentDeleteApi registered above.
# NOTE(review): presumably the two resources are told apart by HTTP method — confirm.
api.add_resource(DocumentDetailApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>")

+ 10
- 2
api/controllers/service_api/wraps.py View File

from pydantic import BaseModel from pydantic import BaseModel
from sqlalchemy import select, update from sqlalchemy import select, update
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from werkzeug.exceptions import Forbidden, Unauthorized
from werkzeug.exceptions import Forbidden, NotFound, Unauthorized


from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from libs.login import _get_user from libs.login import _get_user
from models.account import Account, Tenant, TenantAccountJoin, TenantStatus from models.account import Account, Tenant, TenantAccountJoin, TenantStatus
from models.dataset import RateLimitLog
from models.dataset import Dataset, RateLimitLog
from models.model import ApiToken, App, EndUser from models.model import ApiToken, App, EndUser
from services.feature_service import FeatureService from services.feature_service import FeatureService




class DatasetApiResource(Resource): class DatasetApiResource(Resource):
method_decorators = [validate_dataset_token] method_decorators = [validate_dataset_token]

def get_dataset(self, dataset_id: str, tenant_id: str) -> Dataset:
    """Fetch the dataset with the given id that belongs to the given tenant.

    Raises:
        NotFound: no dataset matches both ``dataset_id`` and ``tenant_id``.
    """
    # Both conditions must hold, so a dataset owned by another tenant is
    # reported exactly like a missing one.
    ownership_criteria = (Dataset.id == dataset_id, Dataset.tenant_id == tenant_id)
    found = db.session.query(Dataset).filter(*ownership_criteria).first()

    if found:
        return found

    raise NotFound("Dataset not found.")

+ 123
- 0
web/app/(commonLayout)/datasets/template/template.en.mdx View File



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='Get Document Detail'
name='#get-document-detail'
/>
<Row>
<Col>
Get a document's detail.
### Path
- `dataset_id` (string) Dataset ID
- `document_id` (string) Document ID

### Query
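- `metadata` (string) Metadata filter, can be `all`, `only`, or `without`. Default is `all`. `only` returns just `id`, `doc_type`, and `doc_metadata`; `without` returns every field except `doc_type` and `doc_metadata`; `all` returns every field.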

### Response
Returns the document's detail.
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>

### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />

<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'

+ 124
- 0
web/app/(commonLayout)/datasets/template/template.ja.mdx View File



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='ドキュメントの詳細を取得'
name='#get-document-detail'
/>
<Row>
<Col>
ドキュメントの詳細を取得します。
### Path
- `dataset_id` (string) ナレッジベースID
- `document_id` (string) ドキュメントID

### Query
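- `metadata` (string) メタデータのフィルター条件。`all`、`only`、`without` のいずれかを指定します。デフォルトは `all` です。`only` は `id`、`doc_type`、`doc_metadata` のみを返し、`without` は `doc_type` と `doc_metadata` を除くすべてのフィールドを返します。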

### Response
ナレッジベースドキュメントの詳細を返します。
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>

### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />


<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'

+ 124
- 0
web/app/(commonLayout)/datasets/template/template.zh.mdx View File



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/{document_id}'
method='GET'
title='获取文档详情'
name='#get-document-detail'
/>
<Row>
<Col>
获取文档详情。
### Path
- `dataset_id` (string) 知识库 ID
- `document_id` (string) 文档 ID

### Query
- `metadata` (string) metadata 过滤条件,可选值为 `all`、`only` 或 `without`,默认为 `all`。`only` 仅返回 `id`、`doc_type` 和 `doc_metadata`;`without` 返回除 `doc_type` 和 `doc_metadata` 以外的所有字段。

### Response
返回知识库文档的详情。
</Col>
<Col sticky>
### Request Example
<CodeGroup title="Request" tag="GET" label="/datasets/{dataset_id}/documents/{document_id}" targetCode={`curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \\\n-H 'Authorization: Bearer {api_key}'`}>
```bash {{ title: 'cURL' }}
curl -X GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-H 'Authorization: Bearer {api_key}'
```
</CodeGroup>

### Response Example
<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"id": "f46ae30c-5c11-471b-96d0-464f5f32a7b2",
"position": 1,
"data_source_type": "upload_file",
"data_source_info": {
"upload_file": {
...
}
},
"dataset_process_rule_id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_process_rule": {
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"document_process_rule": {
"id": "24b99906-845e-499f-9e3c-d5565dd6962c",
"dataset_id": "48a0db76-d1a9-46c1-ae35-2baaa919a8a9",
"mode": "hierarchical",
"rules": {
"pre_processing_rules": [
{
"id": "remove_extra_spaces",
"enabled": true
},
{
"id": "remove_urls_emails",
"enabled": false
}
],
"segmentation": {
"separator": "**********page_ending**********",
"max_tokens": 1024,
"chunk_overlap": 0
},
"parent_mode": "paragraph",
"subchunk_segmentation": {
"separator": "\n",
"max_tokens": 512,
"chunk_overlap": 0
}
}
},
"name": "xxxx",
"created_from": "web",
"created_by": "17f71940-a7b5-4c77-b60f-2bd645c1ffa0",
"created_at": 1750464191,
"tokens": null,
"indexing_status": "waiting",
"completed_at": null,
"updated_at": 1750464191,
"indexing_latency": null,
"error": null,
"enabled": true,
"disabled_at": null,
"disabled_by": null,
"archived": false,
"segment_count": 0,
"average_segment_length": 0,
"hit_count": null,
"display_status": "queuing",
"doc_form": "hierarchical_model",
"doc_language": "Chinese Simplified"
}
```
</CodeGroup>
</Col>
</Row>
___
<hr className='ml-0 mr-0' />


<Heading <Heading
url='/datasets/{dataset_id}/documents/status/{action}' url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH' method='PATCH'

Loading…
Cancel
Save