Kaynağa Gözat

Knowledge base API supports status updates #18147 (#18235)

tags/1.5.0
GuanMu 4 ay önce
ebeveyn
işleme
870e73c03b
No account linked to committer's email address

+ 9
- 73
api/controllers/console/datasets/datasets_document.py Dosyayı Görüntüle

from core.plugin.impl.exc import PluginDaemonClientSideError from core.plugin.impl.exc import PluginDaemonClientSideError
from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import ( from fields.document_fields import (
dataset_and_document_fields, dataset_and_document_fields,
document_fields, document_fields,
from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile
from services.dataset_service import DatasetService, DocumentService from services.dataset_service import DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task




class DocumentResource(Resource): class DocumentResource(Resource):
DatasetService.check_dataset_permission(dataset, current_user) DatasetService.check_dataset_permission(dataset, current_user)


document_ids = request.args.getlist("document_id") document_ids = request.args.getlist("document_id")
for document_id in document_ids:
document = self.get_document(dataset_id, document_id)

indexing_cache_key = "document_{}_indexing".format(document.id)
cache_result = redis_client.get(indexing_cache_key)
if cache_result is not None:
raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later")

if action == "enable":
if document.enabled:
continue
document.enabled = True
document.disabled_at = None
document.disabled_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

add_document_to_index_task.delay(document_id)

elif action == "disable":
if not document.completed_at or document.indexing_status != "completed":
raise InvalidActionError(f"Document: {document.name} is not completed.")
if not document.enabled:
continue

document.enabled = False
document.disabled_at = datetime.now(UTC).replace(tzinfo=None)
document.disabled_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

remove_document_from_index_task.delay(document_id)

elif action == "archive":
if document.archived:
continue

document.archived = True
document.archived_at = datetime.now(UTC).replace(tzinfo=None)
document.archived_by = current_user.id
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

if document.enabled:
# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

remove_document_from_index_task.delay(document_id)

elif action == "un_archive":
if not document.archived:
continue
document.archived = False
document.archived_at = None
document.archived_by = None
document.updated_at = datetime.now(UTC).replace(tzinfo=None)
db.session.commit()

# Set cache to prevent indexing the same document multiple times
redis_client.setex(indexing_cache_key, 600, 1)

add_document_to_index_task.delay(document_id)


else:
raise InvalidActionError()
try:
DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
except services.errors.document.DocumentIndexingError as e:
raise InvalidActionError(str(e))
except ValueError as e:
raise InvalidActionError(str(e))
except NotFound as e:
raise NotFound(str(e))

return {"result": "success"}, 200 return {"result": "success"}, 200





+ 53
- 2
api/controllers/service_api/dataset/dataset.py Dosyayı Görüntüle



import services.dataset_service import services.dataset_service
from controllers.service_api import api from controllers.service_api import api
from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError
from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError, InvalidActionError
from controllers.service_api.wraps import ( from controllers.service_api.wraps import (
DatasetApiResource, DatasetApiResource,
cloud_edition_billing_rate_limit_check, cloud_edition_billing_rate_limit_check,
from fields.tag_fields import tag_fields from fields.tag_fields import tag_fields
from libs.login import current_user from libs.login import current_user
from models.dataset import Dataset, DatasetPermissionEnum from models.dataset import Dataset, DatasetPermissionEnum
from services.dataset_service import DatasetPermissionService, DatasetService
from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService
from services.entities.knowledge_entities.knowledge_entities import RetrievalModel from services.entities.knowledge_entities.knowledge_entities import RetrievalModel
from services.tag_service import TagService from services.tag_service import TagService


raise DatasetInUseError() raise DatasetInUseError()




class DocumentStatusApi(DatasetApiResource):
    """Resource for batch document status operations."""

    def patch(self, tenant_id, dataset_id, action):
        """
        Batch update the status of documents in a dataset.

        The document IDs are read from the ``document_ids`` key of the JSON
        request body; the actual state change is delegated to
        ``DocumentService.batch_update_document_status``.

        Args:
            tenant_id: tenant id
            dataset_id: dataset id
            action: action to perform (enable, disable, archive, un_archive)

        Returns:
            dict: A dictionary with a key 'result' and a value 'success'
            int: HTTP status code 200 indicating that the operation was successful.

        Raises:
            NotFound: If the dataset with the given ID does not exist.
            Forbidden: If the user does not have permission.
            InvalidActionError: If the request body is not a JSON object, if
                ``document_ids`` is not a list, or if the action is invalid or
                cannot be performed.
        """
        dataset = DatasetService.get_dataset(str(dataset_id))
        if dataset is None:
            raise NotFound("Dataset not found.")

        # Check user's permission
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        # Check dataset model setting
        DatasetService.check_dataset_model_setting(dataset)

        # Get document IDs from request body. get_json() may yield None or a
        # non-object value (e.g. a JSON `null` or list); reject those explicitly
        # instead of failing with an AttributeError on `.get`.
        payload = request.get_json()
        if not isinstance(payload, dict):
            raise InvalidActionError("Request body must be a JSON object.")

        # A missing or null `document_ids` is treated as an empty batch.
        document_ids = payload.get("document_ids") or []
        if not isinstance(document_ids, list):
            # A bare string would otherwise be iterated character by character.
            raise InvalidActionError("document_ids must be a list of document IDs.")

        try:
            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
        except (services.errors.document.DocumentIndexingError, ValueError) as e:
            raise InvalidActionError(str(e))

        return {"result": "success"}, 200


class DatasetTagsApi(DatasetApiResource): class DatasetTagsApi(DatasetApiResource):
@validate_dataset_token @validate_dataset_token
@marshal_with(tag_fields) @marshal_with(tag_fields)


api.add_resource(DatasetListApi, "/datasets") api.add_resource(DatasetListApi, "/datasets")
api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>") api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>")
api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>")
api.add_resource(DatasetTagsApi, "/datasets/tags") api.add_resource(DatasetTagsApi, "/datasets/tags")
api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding") api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding")
api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding") api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding")

+ 96
- 1
api/services/dataset_service.py Dosyayı Görüntüle

from services.feature_service import FeatureModel, FeatureService from services.feature_service import FeatureModel, FeatureService
from services.tag_service import TagService from services.tag_service import TagService
from services.vector_service import VectorService from services.vector_service import VectorService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.batch_clean_document_task import batch_clean_document_task from tasks.batch_clean_document_task import batch_clean_document_task
from tasks.clean_notion_document_task import clean_notion_document_task from tasks.clean_notion_document_task import clean_notion_document_task
from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task
from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task
from tasks.enable_segments_to_index_task import enable_segments_to_index_task from tasks.enable_segments_to_index_task import enable_segments_to_index_task
from tasks.recover_document_indexing_task import recover_document_indexing_task from tasks.recover_document_indexing_task import recover_document_indexing_task
from tasks.remove_document_from_index_task import remove_document_from_index_task
from tasks.retry_document_indexing_task import retry_document_indexing_task from tasks.retry_document_indexing_task import retry_document_indexing_task
from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task


raise ValueError(ex.description) raise ValueError(ex.description)


filtered_data["updated_by"] = user.id filtered_data["updated_by"] = user.id
filtered_data["updated_at"] = datetime.datetime.now()
filtered_data["updated_at"] = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)


# update Retrieval model # update Retrieval model
filtered_data["retrieval_model"] = data["retrieval_model"] filtered_data["retrieval_model"] = data["retrieval_model"]
if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int): if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int):
raise ValueError("Process rule segmentation max_tokens is invalid") raise ValueError("Process rule segmentation max_tokens is invalid")


@staticmethod
def batch_update_document_status(dataset: Dataset, document_ids: list[str], action: str, user):
    """
    Batch update document status.

    Each document is looked up in ``dataset``, updated and committed
    individually, and the matching index task is queued. IDs that do not
    resolve to a document are skipped, as are documents that are already in
    the requested state. An empty ``document_ids`` is a no-op.

    Args:
        dataset (Dataset): The dataset object
        document_ids (list[str]): List of document IDs to update
        action (str): Action to perform (enable, disable, archive, un_archive)
        user: Current user performing the action

    Raises:
        ValueError: If ``action`` is not one of the supported actions.
        DocumentIndexingError: If document is being indexed or not in correct state
    """
    if not document_ids:
        return

    # Validate the action once, up front, so an invalid action is reported
    # even when none of the given IDs resolves to a document.
    if action not in ("enable", "disable", "archive", "un_archive"):
        raise ValueError(f"Invalid action: {action}")

    for document_id in document_ids:
        document = DocumentService.get_document(dataset.id, document_id)

        if not document:
            continue

        # The cache key marks a document with a pending index task.
        indexing_cache_key = f"document_{document.id}_indexing"
        if redis_client.get(indexing_cache_key) is not None:
            raise DocumentIndexingError(f"Document:{document.name} is being indexed, please try again later")

        # Single naive-UTC timestamp per document so that the paired
        # *_at / updated_at columns agree exactly.
        now = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)

        if action == "enable":
            if document.enabled:
                continue
            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = now
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

        elif action == "disable":
            if not document.completed_at or document.indexing_status != "completed":
                raise DocumentIndexingError(f"Document: {document.name} is not completed.")
            if not document.enabled:
                continue

            document.enabled = False
            document.disabled_at = now
            document.disabled_by = user.id
            document.updated_at = now
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

        elif action == "archive":
            if document.archived:
                continue

            document.archived = True
            document.archived_at = now
            document.archived_by = user.id
            document.updated_at = now
            db.session.commit()

            # Only enabled documents get the index-removal task queued.
            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

        elif action == "un_archive":
            if not document.archived:
                continue
            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = now
            db.session.commit()

            # Only re-index if the document is currently enabled
            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)
                add_document_to_index_task.delay(document_id)



class SegmentService: class SegmentService:
@classmethod @classmethod

+ 57
- 0
web/app/(commonLayout)/datasets/template/template.en.mdx Dosyayı Görüntüle



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'
title='Update Document Status'
name='#batch_document_status'
/>
<Row>
<Col>
### Path
<Properties>
<Property name='dataset_id' type='string' key='dataset_id'>
Knowledge ID
</Property>
<Property name='action' type='string' key='action'>
- `enable` - Enable document
- `disable` - Disable document
- `archive` - Archive document
- `un_archive` - Unarchive document
</Property>
</Properties>

### Request Body
<Properties>
<Property name='document_ids' type='array[string]' key='document_ids'>
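List of IDs of the documents whose status should be updated. IDs that do not match a document in the knowledge base are skipped.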
</Property>
</Properties>
</Col>
<Col sticky>
<CodeGroup
title="Request"
tag="PATCH"
label="/datasets/{dataset_id}/documents/status/{action}"
targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
>
```bash {{ title: 'cURL' }}
curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
--header 'Authorization: Bearer {api_key}' \
--header 'Content-Type: application/json' \
--data-raw '{
"document_ids": ["doc-id-1", "doc-id-2"]
}'
```
</CodeGroup>

<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"result": "success"
}
```
</CodeGroup>
</Col>
</Row>

<hr className='ml-0 mr-0' />

<Heading <Heading
url='/datasets/{dataset_id}/documents/{document_id}/segments' url='/datasets/{dataset_id}/documents/{document_id}/segments'
method='POST' method='POST'

+ 58
- 0
web/app/(commonLayout)/datasets/template/template.ja.mdx Dosyayı Görüntüle



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'
title='ドキュメントステータスの更新'
name='#batch_document_status'
/>
<Row>
<Col>
### パス
<Properties>
<Property name='dataset_id' type='string' key='dataset_id'>
ナレッジ ID
</Property>
<Property name='action' type='string' key='action'>
- `enable` - ドキュメントを有効化
- `disable` - ドキュメントを無効化
- `archive` - ドキュメントをアーカイブ
- `un_archive` - ドキュメントのアーカイブを解除
</Property>
</Properties>

### リクエストボディ
<Properties>
<Property name='document_ids' type='array[string]' key='document_ids'>
ドキュメントIDのリスト
</Property>
</Properties>
</Col>
<Col sticky>
<CodeGroup
title="リクエスト"
tag="PATCH"
label="/datasets/{dataset_id}/documents/status/{action}"
targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
>
```bash {{ title: 'cURL' }}
curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
--header 'Authorization: Bearer {api_key}' \
--header 'Content-Type: application/json' \
--data-raw '{
"document_ids": ["doc-id-1", "doc-id-2"]
}'
```
</CodeGroup>

<CodeGroup title="レスポンス">
```json {{ title: 'Response' }}
{
"result": "success"
}
```
</CodeGroup>
</Col>
</Row>

<hr className='ml-0 mr-0' />

<Heading <Heading
url='/datasets/{dataset_id}/documents/{document_id}/segments' url='/datasets/{dataset_id}/documents/{document_id}/segments'
method='POST' method='POST'
</tbody> </tbody>
</table> </table>
<div className="pb-4" /> <div className="pb-4" />


+ 57
- 0
web/app/(commonLayout)/datasets/template/template.zh.mdx Dosyayı Görüntüle



<hr className='ml-0 mr-0' /> <hr className='ml-0 mr-0' />


<Heading
url='/datasets/{dataset_id}/documents/status/{action}'
method='PATCH'
title='更新文档状态'
name='#batch_document_status'
/>
<Row>
<Col>
### Path
<Properties>
<Property name='dataset_id' type='string' key='dataset_id'>
知识库 ID
</Property>
<Property name='action' type='string' key='action'>
- `enable` - 启用文档
- `disable` - 禁用文档
- `archive` - 归档文档
- `un_archive` - 取消归档文档
</Property>
</Properties>

### Request Body
<Properties>
<Property name='document_ids' type='array[string]' key='document_ids'>
文档ID列表
</Property>
</Properties>
</Col>
<Col sticky>
<CodeGroup
title="Request"
tag="PATCH"
label="/datasets/{dataset_id}/documents/status/{action}"
targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`}
>
```bash {{ title: 'cURL' }}
curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \
--header 'Authorization: Bearer {api_key}' \
--header 'Content-Type: application/json' \
--data-raw '{
"document_ids": ["doc-id-1", "doc-id-2"]
}'
```
</CodeGroup>

<CodeGroup title="Response">
```json {{ title: 'Response' }}
{
"result": "success"
}
```
</CodeGroup>
</Col>
</Row>

<hr className='ml-0 mr-0' />

<Heading <Heading
url='/datasets/{dataset_id}/documents/{document_id}/segments' url='/datasets/{dataset_id}/documents/{document_id}/segments'
method='POST' method='POST'

Loading…
İptal
Kaydet