def get(self, provider):
    user = current_user
    user_id = user.id
    tenant_id = user.current_tenant_id
    return jsonable_encoder(BuiltinToolManageService.get_builtin_tool_provider_info(tenant_id, provider))
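
# A minimal, self-contained sketch (names hypothetical, not Dify's actual
# service) of the tenant-scoping pattern above: the controller resolves
# tenant_id from the authenticated user and passes it to the service
# explicitly, so the service layer never reads global request state.
from dataclasses import dataclass

@dataclass
class ProviderInfo:
    tenant_id: str
    provider: str

def get_provider_info(tenant_id: str, provider: str) -> ProviderInfo:
    # Every lookup is keyed by tenant_id to keep tenant data isolated.
    return ProviderInfo(tenant_id=tenant_id, provider=provider)

print(get_provider_info("tenant-123", "time"))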
            documents.append(document)

        # build index
        # get the process rule
        processing_rule = (
            db.session.query(DatasetProcessRule)
            .where(DatasetProcessRule.id == dataset_document.dataset_process_rule_id)
            .first()
        )
        index_type = dataset_document.doc_form
        index_processor = IndexProcessorFactory(index_type).init_index_processor()
        self._load(
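
# A sketch of the factory dispatch above, simplified from the pattern (the
# real IndexProcessorFactory lives in Dify's rag package; the processor
# classes here are illustrative): doc_form selects which processor builds
# the index for the document's chunks.
class ParagraphIndexProcessor:
    def load(self, docs: list[str]) -> None:
        print(f"indexing {len(docs)} paragraph chunks")

class QAIndexProcessor:
    def load(self, docs: list[str]) -> None:
        print(f"indexing {len(docs)} question/answer pairs")

_PROCESSORS = {"text_model": ParagraphIndexProcessor, "qa_model": QAIndexProcessor}

def init_index_processor(index_type: str):
    if index_type not in _PROCESSORS:
        raise ValueError(f"unsupported index type: {index_type}")
    return _PROCESSORS[index_type]()

init_index_processor("text_model").load(["chunk-1", "chunk-2"])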
def instruction_modify_legacy(
    tenant_id: str, flow_id: str, current: str, instruction: str, model_config: dict, ideal_output: str | None
) -> dict:
    app: App | None = db.session.query(App).where(App.id == flow_id).first()
    last_run: Message | None = (
        db.session.query(Message).where(Message.app_id == flow_id).order_by(Message.created_at.desc()).first()
    )
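
# A self-contained sketch of the "most recent message" lookup above, run
# against an in-memory SQLite database (model fields reduced to what the
# query needs; requires SQLAlchemy 2.x):
from datetime import datetime
from sqlalchemy import DateTime, String, create_engine
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

class Base(DeclarativeBase):
    pass

class Message(Base):
    __tablename__ = "messages"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    app_id: Mapped[str] = mapped_column(String)
    created_at: Mapped[datetime] = mapped_column(DateTime)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
with Session(engine) as session:
    session.add_all([
        Message(id="m1", app_id="app-1", created_at=datetime(2024, 1, 1)),
        Message(id="m2", app_id="app-1", created_at=datetime(2024, 2, 1)),
    ])
    session.commit()
    # order_by(desc) + first() returns the newest matching row, or None.
    last_run = (
        session.query(Message)
        .where(Message.app_id == "app-1")
        .order_by(Message.created_at.desc())
        .first()
    )
    print(last_run.id)  # -> m2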
        if not isinstance(top_k, int) or top_k <= 0 or top_k > 10000:
            top_k = 5  # Use default if invalid
        # Fetching by score_threshold is not implemented yet; it may be added later.
        score_threshold = float(kwargs.get("score_threshold") or 0.0)
        if len(query) > 0:
            # Check which language the query is in
            zh_pattern = re.compile("[\u4e00-\u9fa5]+")
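
# A sketch of the language check above: the character class covers the
# common CJK Unified Ideographs range, so any match means the query
# contains Chinese characters.
import re

zh_pattern = re.compile("[\u4e00-\u9fa5]+")

def contains_chinese(query: str) -> bool:
    return zh_pattern.search(query) is not None

print(contains_chinese("hello"))       # False
print(contains_chinese("你好 world"))   # True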
        # Get the actual volume path (may include the dify_km prefix)
        volume_path = self._get_volume_path(filename, dataset_id)
        actual_filename = volume_path.split("/")[-1] if "/" in volume_path else volume_path

        # For User Volume, use the full path with the dify_km prefix
        if volume_prefix == "USER VOLUME":
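
# A sketch of the filename extraction above; posixpath.basename performs
# the same last-component split without the conditional (path illustrative):
import posixpath

volume_path = "dify_km/datasets/abc/report.pdf"
actual_filename = volume_path.split("/")[-1] if "/" in volume_path else volume_path
assert actual_filename == posixpath.basename(volume_path) == "report.pdf"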
import json
import logging
from dataclasses import asdict, dataclass
from datetime import datetime, timedelta
from enum import Enum
from typing import Any, Optional
            versions.append(current_metadata)

        # Get historical versions
        version_pattern = f"{self._version_prefix}{filename}.v*"
        try:
            version_files = self._storage.scan(self._dataset_id or "", files=True)
            for file_path in version_files:
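
# A sketch of matching stored files against the "{prefix}{filename}.v*"
# pattern built above, using fnmatch for the glob-style wildcard (paths
# are illustrative):
import fnmatch

version_pattern = "versions/report.pdf.v*"
files = ["versions/report.pdf.v1", "versions/report.pdf.v2", "versions/notes.txt.v1"]
print(fnmatch.filter(files, version_pattern))  # -> the two report.pdf versions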
| """ | """ | ||||
| try: | try: | ||||
| cleaned_count = 0 | cleaned_count = 0 | ||||
| cutoff_date = datetime.now() - timedelta(days=max_age_days) | |||||
| # 获取所有版本文件 | # 获取所有版本文件 | ||||
| try: | try: |
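
# A minimal sketch (an in-memory dict stands in for storage) of the
# age-based cleanup performed above: versions created before cutoff_date
# are removed and counted.
from datetime import datetime, timedelta

def clean_old_versions(versions: dict[str, datetime], max_age_days: int) -> int:
    cutoff_date = datetime.now() - timedelta(days=max_age_days)
    cleaned_count = 0
    for name, created_at in list(versions.items()):
        if created_at < cutoff_date:
            del versions[name]  # the real code deletes the version file from storage
            cleaned_count += 1
    return cleaned_count

store = {
    "doc.txt.v1": datetime.now() - timedelta(days=90),
    "doc.txt.v2": datetime.now() - timedelta(days=1),
}
print(clean_old_versions(store, max_age_days=30))  # -> 1: only v1 is old enough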
        with redis_client.lock(lock_name, timeout=20):
            index_node_id = str(uuid.uuid4())
            index_node_hash = helper.generate_text_hash(content)
            child_chunk_count = (
                db.session.query(ChildChunk)
                .where(
                    ChildChunk.tenant_id == current_user.current_tenant_id,
                    ChildChunk.dataset_id == dataset.id,
                    ChildChunk.document_id == document.id,
                    ChildChunk.segment_id == segment.id,
                )
                .count()
            )
            max_position = (
                db.session.query(func.max(ChildChunk.position))
                .where(
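
# A sketch of the locking pattern above, using redis-py's Lock as a context
# manager (assumes a local Redis; lock_name is illustrative): holding the
# lock serializes the count/max(position) reads with the insert that
# follows, so two concurrent requests cannot claim the same position.
import redis

redis_client = redis.Redis()
lock_name = "add_child_chunk_segment-123_lock"
with redis_client.lock(lock_name, timeout=20):
    # timeout=20 auto-releases the lock if the holder crashes mid-write.
    next_position = 1  # stand-in for the max(position) + 1 computation
    print(f"inserting child chunk at position {next_position}")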
    Usage: retry_document_indexing_task.delay(dataset_id, document_ids)
    """
    documents: list[Document] = []
    start_at = time.perf_counter()
    try:
        dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
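
# A sketch of the Celery pattern referenced in the docstring above (broker
# URL illustrative; the task body is a stand-in, not Dify's implementation):
# .delay() enqueues the task, and a worker picks it up asynchronously.
import time
from celery import Celery

app = Celery("tasks", broker="redis://localhost:6379/0")

@app.task
def retry_document_indexing_task(dataset_id: str, document_ids: list[str]) -> None:
    start_at = time.perf_counter()
    # ... re-run indexing for each document ...
    print(f"retried {len(document_ids)} docs in {time.perf_counter() - start_at:.3f}s")

# Producer side (requires a running broker and worker):
# retry_document_indexing_task.delay("dataset-1", ["doc-1", "doc-2"])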