| from core.plugin.impl.exc import PluginDaemonClientSideError | from core.plugin.impl.exc import PluginDaemonClientSideError | ||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from extensions.ext_database import db | from extensions.ext_database import db | ||||
| from extensions.ext_redis import redis_client | |||||
| from fields.document_fields import ( | from fields.document_fields import ( | ||||
| dataset_and_document_fields, | dataset_and_document_fields, | ||||
| document_fields, | document_fields, | ||||
| from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile | from models import Dataset, DatasetProcessRule, Document, DocumentSegment, UploadFile | ||||
| from services.dataset_service import DatasetService, DocumentService | from services.dataset_service import DatasetService, DocumentService | ||||
| from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig | from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig | ||||
| from tasks.add_document_to_index_task import add_document_to_index_task | |||||
| from tasks.remove_document_from_index_task import remove_document_from_index_task | |||||
| class DocumentResource(Resource): | class DocumentResource(Resource): | ||||
| DatasetService.check_dataset_permission(dataset, current_user) | DatasetService.check_dataset_permission(dataset, current_user) | ||||
| document_ids = request.args.getlist("document_id") | document_ids = request.args.getlist("document_id") | ||||
| for document_id in document_ids: | |||||
| document = self.get_document(dataset_id, document_id) | |||||
| indexing_cache_key = "document_{}_indexing".format(document.id) | |||||
| cache_result = redis_client.get(indexing_cache_key) | |||||
| if cache_result is not None: | |||||
| raise InvalidActionError(f"Document:{document.name} is being indexed, please try again later") | |||||
| if action == "enable": | |||||
| if document.enabled: | |||||
| continue | |||||
| document.enabled = True | |||||
| document.disabled_at = None | |||||
| document.disabled_by = None | |||||
| document.updated_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| add_document_to_index_task.delay(document_id) | |||||
| elif action == "disable": | |||||
| if not document.completed_at or document.indexing_status != "completed": | |||||
| raise InvalidActionError(f"Document: {document.name} is not completed.") | |||||
| if not document.enabled: | |||||
| continue | |||||
| document.enabled = False | |||||
| document.disabled_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| document.disabled_by = current_user.id | |||||
| document.updated_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| remove_document_from_index_task.delay(document_id) | |||||
| elif action == "archive": | |||||
| if document.archived: | |||||
| continue | |||||
| document.archived = True | |||||
| document.archived_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| document.archived_by = current_user.id | |||||
| document.updated_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| if document.enabled: | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| remove_document_from_index_task.delay(document_id) | |||||
| elif action == "un_archive": | |||||
| if not document.archived: | |||||
| continue | |||||
| document.archived = False | |||||
| document.archived_at = None | |||||
| document.archived_by = None | |||||
| document.updated_at = datetime.now(UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| add_document_to_index_task.delay(document_id) | |||||
| else: | |||||
| raise InvalidActionError() | |||||
| try: | |||||
| DocumentService.batch_update_document_status(dataset, document_ids, action, current_user) | |||||
| except services.errors.document.DocumentIndexingError as e: | |||||
| raise InvalidActionError(str(e)) | |||||
| except ValueError as e: | |||||
| raise InvalidActionError(str(e)) | |||||
| except NotFound as e: | |||||
| raise NotFound(str(e)) | |||||
| return {"result": "success"}, 200 | return {"result": "success"}, 200 | ||||
| import services.dataset_service | import services.dataset_service | ||||
| from controllers.service_api import api | from controllers.service_api import api | ||||
| from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError | |||||
| from controllers.service_api.dataset.error import DatasetInUseError, DatasetNameDuplicateError, InvalidActionError | |||||
| from controllers.service_api.wraps import ( | from controllers.service_api.wraps import ( | ||||
| DatasetApiResource, | DatasetApiResource, | ||||
| cloud_edition_billing_rate_limit_check, | cloud_edition_billing_rate_limit_check, | ||||
| from fields.tag_fields import tag_fields | from fields.tag_fields import tag_fields | ||||
| from libs.login import current_user | from libs.login import current_user | ||||
| from models.dataset import Dataset, DatasetPermissionEnum | from models.dataset import Dataset, DatasetPermissionEnum | ||||
| from services.dataset_service import DatasetPermissionService, DatasetService | |||||
| from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService | |||||
| from services.entities.knowledge_entities.knowledge_entities import RetrievalModel | from services.entities.knowledge_entities.knowledge_entities import RetrievalModel | ||||
| from services.tag_service import TagService | from services.tag_service import TagService | ||||
| raise DatasetInUseError() | raise DatasetInUseError() | ||||
class DocumentStatusApi(DatasetApiResource):
    """Resource for batch document status operations."""

    def patch(self, tenant_id, dataset_id, action):
        """
        Batch update document status.

        Args:
            tenant_id: tenant id
            dataset_id: dataset id
            action: action to perform (enable, disable, archive, un_archive)

        Returns:
            dict: A dictionary with a key 'result' and a value 'success'
            int: HTTP status code 200 indicating that the operation was successful.

        Raises:
            NotFound: If the dataset with the given ID does not exist.
            Forbidden: If the user does not have permission.
            InvalidActionError: If the action is invalid or cannot be performed.
        """
        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")

        # Check user's permission
        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        # Check dataset model setting
        DatasetService.check_dataset_model_setting(dataset)

        # Get document IDs from request body. Guard against a missing or
        # JSON-`null` body: request.get_json() may return None, and calling
        # .get(...) on it would raise AttributeError and surface as a 500
        # instead of the intended no-op / validation path.
        data = request.get_json() or {}
        document_ids = data.get("document_ids", [])

        try:
            DocumentService.batch_update_document_status(dataset, document_ids, action, current_user)
        except services.errors.document.DocumentIndexingError as e:
            raise InvalidActionError(str(e))
        except ValueError as e:
            raise InvalidActionError(str(e))

        return {"result": "success"}, 200
| class DatasetTagsApi(DatasetApiResource): | class DatasetTagsApi(DatasetApiResource): | ||||
| @validate_dataset_token | @validate_dataset_token | ||||
| @marshal_with(tag_fields) | @marshal_with(tag_fields) | ||||
| api.add_resource(DatasetListApi, "/datasets") | api.add_resource(DatasetListApi, "/datasets") | ||||
| api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>") | api.add_resource(DatasetApi, "/datasets/<uuid:dataset_id>") | ||||
| api.add_resource(DocumentStatusApi, "/datasets/<uuid:dataset_id>/documents/status/<string:action>") | |||||
| api.add_resource(DatasetTagsApi, "/datasets/tags") | api.add_resource(DatasetTagsApi, "/datasets/tags") | ||||
| api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding") | api.add_resource(DatasetTagBindingApi, "/datasets/tags/binding") | ||||
| api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding") | api.add_resource(DatasetTagUnbindingApi, "/datasets/tags/unbinding") | 
| from services.feature_service import FeatureModel, FeatureService | from services.feature_service import FeatureModel, FeatureService | ||||
| from services.tag_service import TagService | from services.tag_service import TagService | ||||
| from services.vector_service import VectorService | from services.vector_service import VectorService | ||||
| from tasks.add_document_to_index_task import add_document_to_index_task | |||||
| from tasks.batch_clean_document_task import batch_clean_document_task | from tasks.batch_clean_document_task import batch_clean_document_task | ||||
| from tasks.clean_notion_document_task import clean_notion_document_task | from tasks.clean_notion_document_task import clean_notion_document_task | ||||
| from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task | from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task | ||||
| from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task | from tasks.duplicate_document_indexing_task import duplicate_document_indexing_task | ||||
| from tasks.enable_segments_to_index_task import enable_segments_to_index_task | from tasks.enable_segments_to_index_task import enable_segments_to_index_task | ||||
| from tasks.recover_document_indexing_task import recover_document_indexing_task | from tasks.recover_document_indexing_task import recover_document_indexing_task | ||||
| from tasks.remove_document_from_index_task import remove_document_from_index_task | |||||
| from tasks.retry_document_indexing_task import retry_document_indexing_task | from tasks.retry_document_indexing_task import retry_document_indexing_task | ||||
| from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task | from tasks.sync_website_document_indexing_task import sync_website_document_indexing_task | ||||
| raise ValueError(ex.description) | raise ValueError(ex.description) | ||||
| filtered_data["updated_by"] = user.id | filtered_data["updated_by"] = user.id | ||||
| filtered_data["updated_at"] = datetime.datetime.now() | |||||
| filtered_data["updated_at"] = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| # update Retrieval model | # update Retrieval model | ||||
| filtered_data["retrieval_model"] = data["retrieval_model"] | filtered_data["retrieval_model"] = data["retrieval_model"] | ||||
| if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int): | if not isinstance(args["process_rule"]["rules"]["segmentation"]["max_tokens"], int): | ||||
| raise ValueError("Process rule segmentation max_tokens is invalid") | raise ValueError("Process rule segmentation max_tokens is invalid") | ||||
| @staticmethod | |||||
| def batch_update_document_status(dataset: Dataset, document_ids: list[str], action: str, user): | |||||
| """ | |||||
| Batch update document status. | |||||
| Args: | |||||
| dataset (Dataset): The dataset object | |||||
| document_ids (list[str]): List of document IDs to update | |||||
| action (str): Action to perform (enable, disable, archive, un_archive) | |||||
| user: Current user performing the action | |||||
| Raises: | |||||
| DocumentIndexingError: If document is being indexed or not in correct state | |||||
| """ | |||||
| if not document_ids: | |||||
| return | |||||
| for document_id in document_ids: | |||||
| document = DocumentService.get_document(dataset.id, document_id) | |||||
| if not document: | |||||
| continue | |||||
| indexing_cache_key = f"document_{document.id}_indexing" | |||||
| cache_result = redis_client.get(indexing_cache_key) | |||||
| if cache_result is not None: | |||||
| raise DocumentIndexingError(f"Document:{document.name} is being indexed, please try again later") | |||||
| if action == "enable": | |||||
| if document.enabled: | |||||
| continue | |||||
| document.enabled = True | |||||
| document.disabled_at = None | |||||
| document.disabled_by = None | |||||
| document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| add_document_to_index_task.delay(document_id) | |||||
| elif action == "disable": | |||||
| if not document.completed_at or document.indexing_status != "completed": | |||||
| raise DocumentIndexingError(f"Document: {document.name} is not completed.") | |||||
| if not document.enabled: | |||||
| continue | |||||
| document.enabled = False | |||||
| document.disabled_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| document.disabled_by = user.id | |||||
| document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| remove_document_from_index_task.delay(document_id) | |||||
| elif action == "archive": | |||||
| if document.archived: | |||||
| continue | |||||
| document.archived = True | |||||
| document.archived_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| document.archived_by = user.id | |||||
| document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| if document.enabled: | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| remove_document_from_index_task.delay(document_id) | |||||
| elif action == "un_archive": | |||||
| if not document.archived: | |||||
| continue | |||||
| document.archived = False | |||||
| document.archived_at = None | |||||
| document.archived_by = None | |||||
| document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None) | |||||
| db.session.commit() | |||||
| # Only re-index if the document is currently enabled | |||||
| if document.enabled: | |||||
| # Set cache to prevent indexing the same document multiple times | |||||
| redis_client.setex(indexing_cache_key, 600, 1) | |||||
| add_document_to_index_task.delay(document_id) | |||||
| else: | |||||
| raise ValueError(f"Invalid action: {action}") | |||||
| class SegmentService: | class SegmentService: | ||||
| @classmethod | @classmethod | 
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||||
| method='PATCH' | |||||
| title='Update Document Status' | |||||
| name='#batch_document_status' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| ### Path | |||||
| <Properties> | |||||
| <Property name='dataset_id' type='string' key='dataset_id'> | |||||
| Knowledge ID | |||||
| </Property> | |||||
| <Property name='action' type='string' key='action'> | |||||
| - `enable` - Enable document | |||||
| - `disable` - Disable document | |||||
| - `archive` - Archive document | |||||
| - `un_archive` - Unarchive document | |||||
| </Property> | |||||
| </Properties> | |||||
| ### Request Body | |||||
| <Properties> | |||||
| <Property name='document_ids' type='array[string]' key='document_ids'> | |||||
| List of document IDs | |||||
| </Property> | |||||
| </Properties> | |||||
| </Col> | |||||
| <Col sticky> | |||||
| <CodeGroup | |||||
| title="Request" | |||||
| tag="PATCH" | |||||
| label="/datasets/{dataset_id}/documents/status/{action}" | |||||
| targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`} | |||||
| > | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ | |||||
| --header 'Authorization: Bearer {api_key}' \ | |||||
| --header 'Content-Type: application/json' \ | |||||
| --data-raw '{ | |||||
| "document_ids": ["doc-id-1", "doc-id-2"] | |||||
| }' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| <CodeGroup title="Response"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "result": "success" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/{document_id}/segments' | url='/datasets/{dataset_id}/documents/{document_id}/segments' | ||||
| method='POST' | method='POST' | 
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||||
| method='PATCH' | |||||
| title='ドキュメントステータスの更新' | |||||
| name='#batch_document_status' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| ### パス | |||||
| <Properties> | |||||
| <Property name='dataset_id' type='string' key='dataset_id'> | |||||
| ナレッジ ID | |||||
| </Property> | |||||
| <Property name='action' type='string' key='action'> | |||||
| - `enable` - ドキュメントを有効化 | |||||
| - `disable` - ドキュメントを無効化 | |||||
| - `archive` - ドキュメントをアーカイブ | |||||
| - `un_archive` - ドキュメントのアーカイブを解除 | |||||
| </Property> | |||||
| </Properties> | |||||
| ### リクエストボディ | |||||
| <Properties> | |||||
| <Property name='document_ids' type='array[string]' key='document_ids'> | |||||
| ドキュメントIDのリスト | |||||
| </Property> | |||||
| </Properties> | |||||
| </Col> | |||||
| <Col sticky> | |||||
| <CodeGroup | |||||
| title="リクエスト" | |||||
| tag="PATCH" | |||||
| label="/datasets/{dataset_id}/documents/status/{action}" | |||||
| targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`} | |||||
| > | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ | |||||
| --header 'Authorization: Bearer {api_key}' \ | |||||
| --header 'Content-Type: application/json' \ | |||||
| --data-raw '{ | |||||
| "document_ids": ["doc-id-1", "doc-id-2"] | |||||
| }' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| <CodeGroup title="レスポンス"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "result": "success" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/{document_id}/segments' | url='/datasets/{dataset_id}/documents/{document_id}/segments' | ||||
| method='POST' | method='POST' | ||||
| </tbody> | </tbody> | ||||
| </table> | </table> | ||||
| <div className="pb-4" /> | <div className="pb-4" /> | ||||
| <hr className='ml-0 mr-0' /> | <hr className='ml-0 mr-0' /> | ||||
| <Heading | |||||
| url='/datasets/{dataset_id}/documents/status/{action}' | |||||
| method='PATCH' | |||||
| title='更新文档状态' | |||||
| name='#batch_document_status' | |||||
| /> | |||||
| <Row> | |||||
| <Col> | |||||
| ### Path | |||||
| <Properties> | |||||
| <Property name='dataset_id' type='string' key='dataset_id'> | |||||
| 知识库 ID | |||||
| </Property> | |||||
| <Property name='action' type='string' key='action'> | |||||
| - `enable` - 启用文档 | |||||
| - `disable` - 禁用文档 | |||||
| - `archive` - 归档文档 | |||||
| - `un_archive` - 取消归档文档 | |||||
| </Property> | |||||
| </Properties> | |||||
| ### Request Body | |||||
| <Properties> | |||||
| <Property name='document_ids' type='array[string]' key='document_ids'> | |||||
| 文档ID列表 | |||||
| </Property> | |||||
| </Properties> | |||||
| </Col> | |||||
| <Col sticky> | |||||
| <CodeGroup | |||||
| title="Request" | |||||
| tag="PATCH" | |||||
| label="/datasets/{dataset_id}/documents/status/{action}" | |||||
| targetCode={`curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json' \\\n--data-raw '{\n "document_ids": ["doc-id-1", "doc-id-2"]\n}'`} | |||||
| > | |||||
| ```bash {{ title: 'cURL' }} | |||||
| curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/status/{action}' \ | |||||
| --header 'Authorization: Bearer {api_key}' \ | |||||
| --header 'Content-Type: application/json' \ | |||||
| --data-raw '{ | |||||
| "document_ids": ["doc-id-1", "doc-id-2"] | |||||
| }' | |||||
| ``` | |||||
| </CodeGroup> | |||||
| <CodeGroup title="Response"> | |||||
| ```json {{ title: 'Response' }} | |||||
| { | |||||
| "result": "success" | |||||
| } | |||||
| ``` | |||||
| </CodeGroup> | |||||
| </Col> | |||||
| </Row> | |||||
| <hr className='ml-0 mr-0' /> | |||||
| <Heading | <Heading | ||||
| url='/datasets/{dataset_id}/documents/{document_id}/segments' | url='/datasets/{dataset_id}/documents/{document_id}/segments' | ||||
| method='POST' | method='POST' |