| @@ -640,6 +640,7 @@ class DatasetRetrievalSettingApi(Resource): | |||
| | VectorType.MYSCALE | |||
| | VectorType.ORACLE | |||
| | VectorType.ELASTICSEARCH | |||
| | VectorType.ELASTICSEARCH_JA | |||
| | VectorType.PGVECTOR | |||
| | VectorType.TIDB_ON_QDRANT | |||
| | VectorType.LINDORM | |||
| @@ -683,6 +684,7 @@ class DatasetRetrievalSettingMockApi(Resource): | |||
| | VectorType.MYSCALE | |||
| | VectorType.ORACLE | |||
| | VectorType.ELASTICSEARCH | |||
| | VectorType.ELASTICSEARCH_JA | |||
| | VectorType.COUCHBASE | |||
| | VectorType.PGVECTOR | |||
| | VectorType.LINDORM | |||
| @@ -0,0 +1,104 @@ | |||
| import json | |||
| import logging | |||
| from typing import Any, Optional | |||
| from flask import current_app | |||
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ( | |||
| ElasticSearchConfig, | |||
| ElasticSearchVector, | |||
| ElasticSearchVectorFactory, | |||
| ) | |||
| from core.rag.datasource.vdb.field import Field | |||
| from core.rag.datasource.vdb.vector_type import VectorType | |||
| from core.rag.embedding.embedding_base import Embeddings | |||
| from extensions.ext_redis import redis_client | |||
| from models.dataset import Dataset | |||
| logger = logging.getLogger(__name__) | |||
| class ElasticSearchJaVector(ElasticSearchVector): | |||
| def create_collection( | |||
| self, | |||
| embeddings: list[list[float]], | |||
| metadatas: Optional[list[dict[Any, Any]]] = None, | |||
| index_params: Optional[dict] = None, | |||
| ): | |||
| lock_name = f"vector_indexing_lock_{self._collection_name}" | |||
| with redis_client.lock(lock_name, timeout=20): | |||
| collection_exist_cache_key = f"vector_indexing_{self._collection_name}" | |||
| if redis_client.get(collection_exist_cache_key): | |||
| logger.info(f"Collection {self._collection_name} already exists.") | |||
| return | |||
| if not self._client.indices.exists(index=self._collection_name): | |||
| dim = len(embeddings[0]) | |||
| settings = { | |||
| "analysis": { | |||
| "analyzer": { | |||
| "ja_analyzer": { | |||
| "type": "custom", | |||
| "char_filter": [ | |||
| "icu_normalizer", | |||
| "kuromoji_iteration_mark", | |||
| ], | |||
| "tokenizer": "kuromoji_tokenizer", | |||
| "filter": [ | |||
| "kuromoji_baseform", | |||
| "kuromoji_part_of_speech", | |||
| "ja_stop", | |||
| "kuromoji_number", | |||
| "kuromoji_stemmer", | |||
| ], | |||
| } | |||
| } | |||
| } | |||
| } | |||
| mappings = { | |||
| "properties": { | |||
| Field.CONTENT_KEY.value: { | |||
| "type": "text", | |||
| "analyzer": "ja_analyzer", | |||
| "search_analyzer": "ja_analyzer", | |||
| }, | |||
| Field.VECTOR.value: { # Make sure the dimension is correct here | |||
| "type": "dense_vector", | |||
| "dims": dim, | |||
| "index": True, | |||
| "similarity": "cosine", | |||
| }, | |||
| Field.METADATA_KEY.value: { | |||
| "type": "object", | |||
| "properties": { | |||
| "doc_id": {"type": "keyword"} # Map doc_id to keyword type | |||
| }, | |||
| }, | |||
| } | |||
| } | |||
| self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings) | |||
| redis_client.set(collection_exist_cache_key, 1, ex=3600) | |||
| class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory): | |||
| def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector: | |||
| if dataset.index_struct_dict: | |||
| class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"] | |||
| collection_name = class_prefix | |||
| else: | |||
| dataset_id = dataset.id | |||
| collection_name = Dataset.gen_collection_name_by_id(dataset_id) | |||
| dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name)) | |||
| config = current_app.config | |||
| return ElasticSearchJaVector( | |||
| index_name=collection_name, | |||
| config=ElasticSearchConfig( | |||
| host=config.get("ELASTICSEARCH_HOST", "localhost"), | |||
| port=config.get("ELASTICSEARCH_PORT", 9200), | |||
| username=config.get("ELASTICSEARCH_USERNAME", ""), | |||
| password=config.get("ELASTICSEARCH_PASSWORD", ""), | |||
| ), | |||
| attributes=[], | |||
| ) | |||
| @@ -90,6 +90,12 @@ class Vector: | |||
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory | |||
| return ElasticSearchVectorFactory | |||
| case VectorType.ELASTICSEARCH_JA: | |||
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import ( | |||
| ElasticSearchJaVectorFactory, | |||
| ) | |||
| return ElasticSearchJaVectorFactory | |||
| case VectorType.TIDB_VECTOR: | |||
| from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory | |||
| @@ -16,6 +16,7 @@ class VectorType(StrEnum): | |||
| TENCENT = "tencent" | |||
| ORACLE = "oracle" | |||
| ELASTICSEARCH = "elasticsearch" | |||
| ELASTICSEARCH_JA = "elasticsearch-ja" | |||
| LINDORM = "lindorm" | |||
| COUCHBASE = "couchbase" | |||
| BAIDU = "baidu" | |||
| @@ -383,7 +383,7 @@ SUPABASE_URL=your-server-url | |||
| # ------------------------------ | |||
| # The type of vector store to use. | |||
| # Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. | |||
| # Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. | |||
| VECTOR_STORE=weaviate | |||
| # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. | |||
| @@ -512,7 +512,7 @@ TENCENT_VECTOR_DB_SHARD=1 | |||
| TENCENT_VECTOR_DB_REPLICAS=2 | |||
| # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` | |||
| ELASTICSEARCH_HOST=0.0.0.0 | |||
| ELASTICSEARCH_HOST=elasticsearch | |||
| ELASTICSEARCH_PORT=9200 | |||
| ELASTICSEARCH_USERNAME=elastic | |||
| ELASTICSEARCH_PASSWORD=elastic | |||
| @@ -883,20 +883,28 @@ services: | |||
| container_name: elasticsearch | |||
| profiles: | |||
| - elasticsearch | |||
| - elasticsearch-ja | |||
| restart: always | |||
| volumes: | |||
| - ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh | |||
| - dify_es01_data:/usr/share/elasticsearch/data | |||
| environment: | |||
| ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic} | |||
| VECTOR_STORE: ${VECTOR_STORE:-} | |||
| cluster.name: dify-es-cluster | |||
| node.name: dify-es0 | |||
| discovery.type: single-node | |||
| xpack.license.self_generated.type: trial | |||
| xpack.license.self_generated.type: basic | |||
| xpack.security.enabled: 'true' | |||
| xpack.security.enrollment.enabled: 'false' | |||
| xpack.security.http.ssl.enabled: 'false' | |||
| ports: | |||
| - ${ELASTICSEARCH_PORT:-9200}:9200 | |||
| deploy: | |||
| resources: | |||
| limits: | |||
| memory: 2g | |||
| entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ] | |||
| healthcheck: | |||
| test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ] | |||
| interval: 30s | |||
| @@ -0,0 +1,25 @@ | |||
| #!/bin/bash | |||
| set -e | |||
| if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then | |||
| # Check if the ICU tokenizer plugin is installed | |||
| if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-icu; then | |||
| printf '%s\n' "Installing the ICU tokenizer plugin" | |||
| if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu; then | |||
| printf '%s\n' "Failed to install the ICU tokenizer plugin" | |||
| exit 1 | |||
| fi | |||
| fi | |||
| # Check if the Japanese language analyzer plugin is installed | |||
| if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-kuromoji; then | |||
| printf '%s\n' "Installing the Japanese language analyzer plugin" | |||
| if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji; then | |||
| printf '%s\n' "Failed to install the Japanese language analyzer plugin" | |||
| exit 1 | |||
| fi | |||
| fi | |||
| fi | |||
| # Run the original entrypoint script | |||
| exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh | |||