| | VectorType.MYSCALE | | VectorType.MYSCALE | ||||
| | VectorType.ORACLE | | VectorType.ORACLE | ||||
| | VectorType.ELASTICSEARCH | | VectorType.ELASTICSEARCH | ||||
| | VectorType.ELASTICSEARCH_JA | |||||
| | VectorType.PGVECTOR | | VectorType.PGVECTOR | ||||
| | VectorType.TIDB_ON_QDRANT | | VectorType.TIDB_ON_QDRANT | ||||
| | VectorType.LINDORM | | VectorType.LINDORM | ||||
| | VectorType.MYSCALE | | VectorType.MYSCALE | ||||
| | VectorType.ORACLE | | VectorType.ORACLE | ||||
| | VectorType.ELASTICSEARCH | | VectorType.ELASTICSEARCH | ||||
| | VectorType.ELASTICSEARCH_JA | |||||
| | VectorType.COUCHBASE | | VectorType.COUCHBASE | ||||
| | VectorType.PGVECTOR | | VectorType.PGVECTOR | ||||
| | VectorType.LINDORM | | VectorType.LINDORM |
| import json | |||||
| import logging | |||||
| from typing import Any, Optional | |||||
| from flask import current_app | |||||
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ( | |||||
| ElasticSearchConfig, | |||||
| ElasticSearchVector, | |||||
| ElasticSearchVectorFactory, | |||||
| ) | |||||
| from core.rag.datasource.vdb.field import Field | |||||
| from core.rag.datasource.vdb.vector_type import VectorType | |||||
| from core.rag.embedding.embedding_base import Embeddings | |||||
| from extensions.ext_redis import redis_client | |||||
| from models.dataset import Dataset | |||||
| logger = logging.getLogger(__name__) | |||||
class ElasticSearchJaVector(ElasticSearchVector):
    """Elasticsearch vector store whose index analyzes text content with a custom Japanese analyzer."""

    def create_collection(
        self,
        embeddings: list[list[float]],
        metadatas: Optional[list[dict[Any, Any]]] = None,
        index_params: Optional[dict] = None,
    ):
        """Create the index for this collection unless it is already known to exist.

        Args:
            embeddings: Vectors about to be indexed; only the length of the
                first one is read, to size the ``dense_vector`` field.
            metadatas: Not read by this method.
            index_params: Not read by this method.
        """
        existence_marker_key = f"vector_indexing_{self._collection_name}"

        # The whole check-and-create sequence runs under a per-collection Redis lock.
        with redis_client.lock(f"vector_indexing_lock_{self._collection_name}", timeout=20):
            # A marker in Redis short-circuits the round trip to Elasticsearch.
            if redis_client.get(existence_marker_key):
                logger.info(f"Collection {self._collection_name} already exists.")
                return

            index_missing = not self._client.indices.exists(index=self._collection_name)
            if index_missing:
                vector_dims = len(embeddings[0])

                # Custom analyzer assembled from ICU and kuromoji components.
                # NOTE(review): presumably requires the analysis-icu and
                # analysis-kuromoji plugins on the Elasticsearch node - confirm.
                ja_analyzer = {
                    "type": "custom",
                    "char_filter": [
                        "icu_normalizer",
                        "kuromoji_iteration_mark",
                    ],
                    "tokenizer": "kuromoji_tokenizer",
                    "filter": [
                        "kuromoji_baseform",
                        "kuromoji_part_of_speech",
                        "ja_stop",
                        "kuromoji_number",
                        "kuromoji_stemmer",
                    ],
                }
                index_settings = {"analysis": {"analyzer": {"ja_analyzer": ja_analyzer}}}

                # The same analyzer is applied at index time and at search time.
                content_field = {
                    "type": "text",
                    "analyzer": "ja_analyzer",
                    "search_analyzer": "ja_analyzer",
                }
                # The dimension must match the embeddings that will be stored.
                vector_field = {
                    "type": "dense_vector",
                    "dims": vector_dims,
                    "index": True,
                    "similarity": "cosine",
                }
                # doc_id is mapped as a keyword.
                metadata_field = {
                    "type": "object",
                    "properties": {
                        "doc_id": {"type": "keyword"},
                    },
                }
                index_mappings = {
                    "properties": {
                        Field.CONTENT_KEY.value: content_field,
                        Field.VECTOR.value: vector_field,
                        Field.METADATA_KEY.value: metadata_field,
                    }
                }

                self._client.indices.create(
                    index=self._collection_name,
                    settings=index_settings,
                    mappings=index_mappings,
                )

            # NOTE(review): the marker is written whether the index was just
            # created or already existed - confirm this placement is intended.
            redis_client.set(existence_marker_key, 1, ex=3600)
class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
    """Factory that builds an ElasticSearchJaVector for a dataset."""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
        """Resolve the dataset's collection name and return a vector store bound to it.

        Args:
            dataset: Dataset whose ``index_struct_dict`` supplies the collection
                name; when it is empty, a name is generated from ``dataset.id``
                and a new index struct is written to ``dataset.index_struct``.
            attributes: Not read by this method; an empty list is passed on instead.
            embeddings: Not read by this method.

        Returns:
            An ElasticSearchJaVector configured from the Flask app config.
        """
        if not dataset.index_struct_dict:
            collection_name = Dataset.gen_collection_name_by_id(dataset.id)
            # NOTE(review): the stored type is VectorType.ELASTICSEARCH, not
            # VectorType.ELASTICSEARCH_JA - confirm this is intentional.
            generated_struct = self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name)
            dataset.index_struct = json.dumps(generated_struct)
        else:
            # Reuse the collection name recorded for this dataset.
            collection_name = dataset.index_struct_dict["vector_store"]["class_prefix"]

        app_config = current_app.config
        es_config = ElasticSearchConfig(
            host=app_config.get("ELASTICSEARCH_HOST", "localhost"),
            port=app_config.get("ELASTICSEARCH_PORT", 9200),
            username=app_config.get("ELASTICSEARCH_USERNAME", ""),
            password=app_config.get("ELASTICSEARCH_PASSWORD", ""),
        )
        return ElasticSearchJaVector(
            index_name=collection_name,
            config=es_config,
            attributes=[],
        )
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory | from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory | ||||
| return ElasticSearchVectorFactory | return ElasticSearchVectorFactory | ||||
| case VectorType.ELASTICSEARCH_JA: | |||||
| from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import ( | |||||
| ElasticSearchJaVectorFactory, | |||||
| ) | |||||
| return ElasticSearchJaVectorFactory | |||||
| case VectorType.TIDB_VECTOR: | case VectorType.TIDB_VECTOR: | ||||
| from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory | from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory | ||||
| TENCENT = "tencent" | TENCENT = "tencent" | ||||
| ORACLE = "oracle" | ORACLE = "oracle" | ||||
| ELASTICSEARCH = "elasticsearch" | ELASTICSEARCH = "elasticsearch" | ||||
| ELASTICSEARCH_JA = "elasticsearch-ja" | |||||
| LINDORM = "lindorm" | LINDORM = "lindorm" | ||||
| COUCHBASE = "couchbase" | COUCHBASE = "couchbase" | ||||
| BAIDU = "baidu" | BAIDU = "baidu" |
| # ------------------------------ | # ------------------------------ | ||||
| # The type of vector store to use. | # The type of vector store to use. | ||||
| # Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. | |||||
| # Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`. | |||||
| VECTOR_STORE=weaviate | VECTOR_STORE=weaviate | ||||
| # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. | # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. | ||||
| TENCENT_VECTOR_DB_REPLICAS=2 | TENCENT_VECTOR_DB_REPLICAS=2 | ||||
| # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` | # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` | ||||
| ELASTICSEARCH_HOST=0.0.0.0 | |||||
| ELASTICSEARCH_HOST=elasticsearch | |||||
| ELASTICSEARCH_PORT=9200 | ELASTICSEARCH_PORT=9200 | ||||
| ELASTICSEARCH_USERNAME=elastic | ELASTICSEARCH_USERNAME=elastic | ||||
| ELASTICSEARCH_PASSWORD=elastic | ELASTICSEARCH_PASSWORD=elastic |
| container_name: elasticsearch | container_name: elasticsearch | ||||
| profiles: | profiles: | ||||
| - elasticsearch | - elasticsearch | ||||
| - elasticsearch-ja | |||||
| restart: always | restart: always | ||||
| volumes: | volumes: | ||||
| - ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh | |||||
| - dify_es01_data:/usr/share/elasticsearch/data | - dify_es01_data:/usr/share/elasticsearch/data | ||||
| environment: | environment: | ||||
| ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic} | ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic} | ||||
| VECTOR_STORE: ${VECTOR_STORE:-} | |||||
| cluster.name: dify-es-cluster | cluster.name: dify-es-cluster | ||||
| node.name: dify-es0 | node.name: dify-es0 | ||||
| discovery.type: single-node | discovery.type: single-node | ||||
| xpack.license.self_generated.type: trial | |||||
| xpack.license.self_generated.type: basic | |||||
| xpack.security.enabled: 'true' | xpack.security.enabled: 'true' | ||||
| xpack.security.enrollment.enabled: 'false' | xpack.security.enrollment.enabled: 'false' | ||||
| xpack.security.http.ssl.enabled: 'false' | xpack.security.http.ssl.enabled: 'false' | ||||
| ports: | ports: | ||||
| - ${ELASTICSEARCH_PORT:-9200}:9200 | - ${ELASTICSEARCH_PORT:-9200}:9200 | ||||
| deploy: | |||||
| resources: | |||||
| limits: | |||||
| memory: 2g | |||||
| entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ] | |||||
| healthcheck: | healthcheck: | ||||
| test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ] | test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ] | ||||
| interval: 30s | interval: 30s |
#!/bin/bash
# Entrypoint wrapper for the Elasticsearch container.
#
# When VECTOR_STORE is "elasticsearch-ja", make sure the analysis-icu and
# analysis-kuromoji plugins are installed, then hand control to the original
# entrypoint. Any failed installation aborts startup with exit status 1.
#
# NOTE(review): kept free of bash-only syntax in case the script is launched
# through `sh` rather than via the shebang - confirm against the caller.

set -e

PLUGIN_CLI=/usr/share/elasticsearch/bin/elasticsearch-plugin

# ensure_plugin NAME LABEL
#   Install plugin NAME unless `elasticsearch-plugin list` already reports it.
#   LABEL is the human-readable name used in the progress and error messages.
ensure_plugin() {
    if "${PLUGIN_CLI}" list | grep -q "$1"; then
        return 0
    fi

    printf '%s\n' "Installing the $2 plugin"
    if ! "${PLUGIN_CLI}" install "$1"; then
        printf '%s\n' "Failed to install the $2 plugin"
        exit 1
    fi
}

if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
    ensure_plugin analysis-icu "ICU tokenizer"
    ensure_plugin analysis-kuromoji "Japanese language analyzer"
fi

# Run the original entrypoint script
exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh