| @@ -89,7 +89,7 @@ class AnnotationReplyActionStatusApi(Resource): | |||
| app_annotation_job_key = "{}_app_annotation_job_{}".format(action, str(job_id)) | |||
| cache_result = redis_client.get(app_annotation_job_key) | |||
| if cache_result is None: | |||
| raise ValueError("The job is not exist.") | |||
| raise ValueError("The job does not exist.") | |||
| job_status = cache_result.decode() | |||
| error_msg = "" | |||
| @@ -226,7 +226,7 @@ class AnnotationBatchImportStatusApi(Resource): | |||
| indexing_cache_key = "app_annotation_batch_import_{}".format(str(job_id)) | |||
| cache_result = redis_client.get(indexing_cache_key) | |||
| if cache_result is None: | |||
| raise ValueError("The job is not exist.") | |||
| raise ValueError("The job does not exist.") | |||
| job_status = cache_result.decode() | |||
| error_msg = "" | |||
| if job_status == "error": | |||
| @@ -398,7 +398,7 @@ class DatasetDocumentSegmentBatchImportApi(Resource): | |||
| indexing_cache_key = "segment_batch_import_{}".format(job_id) | |||
| cache_result = redis_client.get(indexing_cache_key) | |||
| if cache_result is None: | |||
| raise ValueError("The job is not exist.") | |||
| raise ValueError("The job does not exist.") | |||
| return {"job_id": job_id, "job_status": cache_result.decode()}, 200 | |||
| @@ -13,6 +13,7 @@ from fields.dataset_fields import dataset_detail_fields | |||
| from libs.login import current_user | |||
| from models.dataset import Dataset, DatasetPermissionEnum | |||
| from services.dataset_service import DatasetPermissionService, DatasetService | |||
| from services.entities.knowledge_entities.knowledge_entities import RetrievalModel | |||
| def _validate_name(name): | |||
| @@ -120,8 +121,11 @@ class DatasetListApi(DatasetApiResource): | |||
| nullable=True, | |||
| required=False, | |||
| ) | |||
| args = parser.parse_args() | |||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=True, location="json") | |||
| parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json") | |||
| parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json") | |||
| args = parser.parse_args() | |||
| try: | |||
| dataset = DatasetService.create_empty_dataset( | |||
| tenant_id=tenant_id, | |||
| @@ -133,6 +137,9 @@ class DatasetListApi(DatasetApiResource): | |||
| provider=args["provider"], | |||
| external_knowledge_api_id=args["external_knowledge_api_id"], | |||
| external_knowledge_id=args["external_knowledge_id"], | |||
| embedding_model_provider=args["embedding_model_provider"], | |||
| embedding_model_name=args["embedding_model"], | |||
| retrieval_model=RetrievalModel(**args["retrieval_model"]), | |||
| ) | |||
| except services.errors.dataset.DatasetNameDuplicateError: | |||
| raise DatasetNameDuplicateError() | |||
| @@ -49,7 +49,9 @@ class DocumentAddByTextApi(DatasetApiResource): | |||
| parser.add_argument( | |||
| "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json" | |||
| ) | |||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") | |||
| parser.add_argument("retrieval_model", type=dict, required=False, nullable=True, location="json") | |||
| parser.add_argument("embedding_model", type=str, required=False, nullable=True, location="json") | |||
| parser.add_argument("embedding_model_provider", type=str, required=False, nullable=True, location="json") | |||
| args = parser.parse_args() | |||
| dataset_id = str(dataset_id) | |||
| @@ -57,7 +59,7 @@ class DocumentAddByTextApi(DatasetApiResource): | |||
| dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| raise ValueError("Dataset does not exist.") | |||
| if not dataset.indexing_technique and not args["indexing_technique"]: | |||
| raise ValueError("indexing_technique is required.") | |||
| @@ -114,7 +116,7 @@ class DocumentUpdateByTextApi(DatasetApiResource): | |||
| dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| raise ValueError("Dataset does not exist.") | |||
| # indexing_technique is already set in dataset since this is an update | |||
| args["indexing_technique"] = dataset.indexing_technique | |||
| @@ -172,7 +174,7 @@ class DocumentAddByFileApi(DatasetApiResource): | |||
| dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| raise ValueError("Dataset does not exist.") | |||
| if not dataset.indexing_technique and not args.get("indexing_technique"): | |||
| raise ValueError("indexing_technique is required.") | |||
| @@ -239,7 +241,7 @@ class DocumentUpdateByFileApi(DatasetApiResource): | |||
| dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| raise ValueError("Dataset does not exist.") | |||
| # indexing_technique is already set in dataset since this is an update | |||
| args["indexing_technique"] = dataset.indexing_technique | |||
| @@ -303,7 +305,7 @@ class DocumentDeleteApi(DatasetApiResource): | |||
| dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() | |||
| if not dataset: | |||
| raise ValueError("Dataset is not exist.") | |||
| raise ValueError("Dataset does not exist.") | |||
| document = DocumentService.get_document(dataset.id, document_id) | |||
| @@ -444,7 +444,7 @@ class QdrantVectorFactory(AbstractVectorFactory): | |||
| if dataset_collection_binding: | |||
| collection_name = dataset_collection_binding.collection_name | |||
| else: | |||
| raise ValueError("Dataset Collection Bindings is not exist!") | |||
| raise ValueError("Dataset Collection Bindings does not exist!") | |||
| else: | |||
| if dataset.index_struct_dict: | |||
| class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] | |||
| @@ -169,6 +169,9 @@ class DatasetService: | |||
| provider: str = "vendor", | |||
| external_knowledge_api_id: Optional[str] = None, | |||
| external_knowledge_id: Optional[str] = None, | |||
| embedding_model_provider: Optional[str] = None, | |||
| embedding_model_name: Optional[str] = None, | |||
| retrieval_model: Optional[RetrievalModel] = None, | |||
| ): | |||
| # check if dataset name already exists | |||
| if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first(): | |||
| @@ -176,9 +179,30 @@ class DatasetService: | |||
| embedding_model = None | |||
| if indexing_technique == "high_quality": | |||
| model_manager = ModelManager() | |||
| embedding_model = model_manager.get_default_model_instance( | |||
| tenant_id=tenant_id, model_type=ModelType.TEXT_EMBEDDING | |||
| ) | |||
| if embedding_model_provider and embedding_model_name: | |||
| # check if embedding model setting is valid | |||
| DatasetService.check_embedding_model_setting(tenant_id, embedding_model_provider, embedding_model_name) | |||
| embedding_model = model_manager.get_model_instance( | |||
| tenant_id=tenant_id, | |||
| provider=embedding_model_provider, | |||
| model_type=ModelType.TEXT_EMBEDDING, | |||
| model=embedding_model_name, | |||
| ) | |||
| else: | |||
| embedding_model = model_manager.get_default_model_instance( | |||
| tenant_id=tenant_id, model_type=ModelType.TEXT_EMBEDDING | |||
| ) | |||
| if retrieval_model and retrieval_model.reranking_model: | |||
| if ( | |||
| retrieval_model.reranking_model.reranking_provider_name | |||
| and retrieval_model.reranking_model.reranking_model_name | |||
| ): | |||
| # check if reranking model setting is valid | |||
| DatasetService.check_embedding_model_setting( | |||
| tenant_id, | |||
| retrieval_model.reranking_model.reranking_provider_name, | |||
| retrieval_model.reranking_model.reranking_model_name, | |||
| ) | |||
| dataset = Dataset(name=name, indexing_technique=indexing_technique) | |||
| # dataset = Dataset(name=name, provider=provider, config=config) | |||
| dataset.description = description | |||
| @@ -187,6 +211,7 @@ class DatasetService: | |||
| dataset.tenant_id = tenant_id | |||
| dataset.embedding_model_provider = embedding_model.provider if embedding_model else None | |||
| dataset.embedding_model = embedding_model.model if embedding_model else None | |||
| dataset.retrieval_model = retrieval_model.model_dump() if retrieval_model else None | |||
| dataset.permission = permission or DatasetPermissionEnum.ONLY_ME | |||
| dataset.provider = provider | |||
| db.session.add(dataset) | |||
| @@ -923,11 +948,11 @@ class DocumentService: | |||
| "score_threshold_enabled": False, | |||
| } | |||
| dataset.retrieval_model = ( | |||
| knowledge_config.retrieval_model.model_dump() | |||
| if knowledge_config.retrieval_model | |||
| else default_retrieval_model | |||
| ) # type: ignore | |||
| dataset.retrieval_model = ( | |||
| knowledge_config.retrieval_model.model_dump() | |||
| if knowledge_config.retrieval_model | |||
| else default_retrieval_model | |||
| ) # type: ignore | |||
| documents = [] | |||
| if knowledge_config.original_document_id: | |||
| @@ -314,6 +314,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| </Property> | |||
| <Property name='indexing_technique' type='string' key='indexing_technique'> | |||
| Index technique (optional) | |||
| If this is not set, embedding_model, embedding_model_provider and retrieval_model will be set to null | |||
| - <code>high_quality</code> High quality | |||
| - <code>economy</code> Economy | |||
| </Property> | |||
| @@ -334,6 +335,26 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='external_knowledge_id' type='str' key='external_knowledge_id'> | |||
| External knowledge ID (optional) | |||
| </Property> | |||
| <Property name='embedding_model' type='str' key='embedding_model'> | |||
| Embedding model name (optional) | |||
| </Property> | |||
| <Property name='embedding_model_provider' type='str' key='embedding_model_provider'> | |||
| Embedding model provider name (optional) | |||
| </Property> | |||
| <Property name='retrieval_model' type='object' key='retrieval_model'> | |||
| Retrieval model (optional) | |||
| - <code>search_method</code> (string) Search method | |||
| - <code>hybrid_search</code> Hybrid search | |||
| - <code>semantic_search</code> Semantic search | |||
| - <code>full_text_search</code> Full-text search | |||
| - <code>reranking_enable</code> (bool) Whether to enable reranking | |||
| - <code>reranking_model</code> (object) Rerank model configuration | |||
| - <code>reranking_provider_name</code> (string) Rerank model provider | |||
| - <code>reranking_model_name</code> (string) Rerank model name | |||
| - <code>top_k</code> (int) Number of results to return | |||
| - <code>score_threshold_enabled</code> (bool) Whether to enable score threshold | |||
| - <code>score_threshold</code> (float) Score threshold | |||
| </Property> | |||
| </Properties> | |||
| </Col> | |||
| <Col sticky> | |||
| @@ -2281,4 +2302,4 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| </tr> | |||
| </tbody> | |||
| </table> | |||
| <div className="pb-4" /> | |||
| <div className="pb-4" /> | |||
| @@ -334,6 +334,26 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='external_knowledge_id' type='str' key='external_knowledge_id'> | |||
| 外部ナレッジ ID (オプション) | |||
| </Property> | |||
| <Property name='embedding_model' type='str' key='embedding_model'> | |||
| 埋め込みモデル名(任意) | |||
| </Property> | |||
| <Property name='embedding_model_provider' type='str' key='embedding_model_provider'> | |||
| 埋め込みモデルのプロバイダ名(任意) | |||
| </Property> | |||
| <Property name='retrieval_model' type='object' key='retrieval_model'> | |||
| 検索モデル(任意) | |||
| - <code>search_method</code> (文字列) 検索方法 | |||
| - <code>hybrid_search</code> ハイブリッド検索 | |||
| - <code>semantic_search</code> セマンティック検索 | |||
| - <code>full_text_search</code> 全文検索 | |||
| - <code>reranking_enable</code> (ブール値) リランキングを有効にするかどうか | |||
| - <code>reranking_model</code> (オブジェクト) リランクモデルの設定 | |||
| - <code>reranking_provider_name</code> (文字列) リランクモデルのプロバイダ | |||
| - <code>reranking_model_name</code> (文字列) リランクモデル名 | |||
| - <code>top_k</code> (整数) 返される結果の数 | |||
| - <code>score_threshold_enabled</code> (ブール値) スコア閾値を有効にするかどうか | |||
| - <code>score_threshold</code> (浮動小数点数) スコア閾値 | |||
| </Property> | |||
| </Properties> | |||
| </Col> | |||
| <Col sticky> | |||
| @@ -335,6 +335,26 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi | |||
| <Property name='external_knowledge_id' type='str' key='external_knowledge_id'> | |||
| 外部知识库 ID(选填) | |||
| </Property> | |||
| <Property name='embedding_model' type='str' key='embedding_model'> | |||
| Embedding 模型名称(选填) | |||
| </Property> | |||
| <Property name='embedding_model_provider' type='str' key='embedding_model_provider'> | |||
| Embedding 模型供应商(选填) | |||
| </Property> | |||
| <Property name='retrieval_model' type='object' key='retrieval_model'> | |||
| 检索模式 | |||
| - <code>search_method</code> (string) 检索方法 | |||
| - <code>hybrid_search</code> 混合检索 | |||
| - <code>semantic_search</code> 语义检索 | |||
| - <code>full_text_search</code> 全文检索 | |||
| - <code>reranking_enable</code> (bool) 是否开启 rerank | |||
| - <code>reranking_model</code> (object) Rerank 模型配置 | |||
| - <code>reranking_provider_name</code> (string) Rerank 模型的提供商 | |||
| - <code>reranking_model_name</code> (string) Rerank 模型的名称 | |||
| - <code>top_k</code> (int) 召回条数 | |||
| - <code>score_threshold_enabled</code> (bool) 是否开启召回分数限制 | |||
| - <code>score_threshold</code> (float) 召回分数限制 | |||
| </Property> | |||
| </Properties> | |||
| </Col> | |||
| <Col sticky> | |||