Browse Source

Feat elasticsearch japanese (#12194)

tags/1.0.0-beta.1
Hiroshi Fujita 9 months ago
parent
commit
d2586278d6
No account linked to committer's email address

+ 2
- 0
api/controllers/console/datasets/datasets.py View File

| VectorType.MYSCALE | VectorType.MYSCALE
| VectorType.ORACLE | VectorType.ORACLE
| VectorType.ELASTICSEARCH | VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.PGVECTOR | VectorType.PGVECTOR
| VectorType.TIDB_ON_QDRANT | VectorType.TIDB_ON_QDRANT
| VectorType.LINDORM | VectorType.LINDORM
| VectorType.MYSCALE | VectorType.MYSCALE
| VectorType.ORACLE | VectorType.ORACLE
| VectorType.ELASTICSEARCH | VectorType.ELASTICSEARCH
| VectorType.ELASTICSEARCH_JA
| VectorType.COUCHBASE | VectorType.COUCHBASE
| VectorType.PGVECTOR | VectorType.PGVECTOR
| VectorType.LINDORM | VectorType.LINDORM

+ 104
- 0
api/core/rag/datasource/vdb/elasticsearch/elasticsearch_ja_vector.py View File

import json
import logging
from typing import Any, Optional

from flask import current_app

from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import (
ElasticSearchConfig,
ElasticSearchVector,
ElasticSearchVectorFactory,
)
from core.rag.datasource.vdb.field import Field
from core.rag.datasource.vdb.vector_type import VectorType
from core.rag.embedding.embedding_base import Embeddings
from extensions.ext_redis import redis_client
from models.dataset import Dataset

logger = logging.getLogger(__name__)


class ElasticSearchJaVector(ElasticSearchVector):
def create_collection(
self,
embeddings: list[list[float]],
metadatas: Optional[list[dict[Any, Any]]] = None,
index_params: Optional[dict] = None,
):
lock_name = f"vector_indexing_lock_{self._collection_name}"
with redis_client.lock(lock_name, timeout=20):
collection_exist_cache_key = f"vector_indexing_{self._collection_name}"
if redis_client.get(collection_exist_cache_key):
logger.info(f"Collection {self._collection_name} already exists.")
return

if not self._client.indices.exists(index=self._collection_name):
dim = len(embeddings[0])
settings = {
"analysis": {
"analyzer": {
"ja_analyzer": {
"type": "custom",
"char_filter": [
"icu_normalizer",
"kuromoji_iteration_mark",
],
"tokenizer": "kuromoji_tokenizer",
"filter": [
"kuromoji_baseform",
"kuromoji_part_of_speech",
"ja_stop",
"kuromoji_number",
"kuromoji_stemmer",
],
}
}
}
}
mappings = {
"properties": {
Field.CONTENT_KEY.value: {
"type": "text",
"analyzer": "ja_analyzer",
"search_analyzer": "ja_analyzer",
},
Field.VECTOR.value: { # Make sure the dimension is correct here
"type": "dense_vector",
"dims": dim,
"index": True,
"similarity": "cosine",
},
Field.METADATA_KEY.value: {
"type": "object",
"properties": {
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
},
},
}
}
self._client.indices.create(index=self._collection_name, settings=settings, mappings=mappings)

redis_client.set(collection_exist_cache_key, 1, ex=3600)


class ElasticSearchJaVectorFactory(ElasticSearchVectorFactory):
def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> ElasticSearchJaVector:
if dataset.index_struct_dict:
class_prefix: str = dataset.index_struct_dict["vector_store"]["class_prefix"]
collection_name = class_prefix
else:
dataset_id = dataset.id
collection_name = Dataset.gen_collection_name_by_id(dataset_id)
dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.ELASTICSEARCH, collection_name))

config = current_app.config
return ElasticSearchJaVector(
index_name=collection_name,
config=ElasticSearchConfig(
host=config.get("ELASTICSEARCH_HOST", "localhost"),
port=config.get("ELASTICSEARCH_PORT", 9200),
username=config.get("ELASTICSEARCH_USERNAME", ""),
password=config.get("ELASTICSEARCH_PASSWORD", ""),
),
attributes=[],
)

+ 6
- 0
api/core/rag/datasource/vdb/vector_factory.py View File

from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory from core.rag.datasource.vdb.elasticsearch.elasticsearch_vector import ElasticSearchVectorFactory


return ElasticSearchVectorFactory return ElasticSearchVectorFactory
case VectorType.ELASTICSEARCH_JA:
from core.rag.datasource.vdb.elasticsearch.elasticsearch_ja_vector import (
ElasticSearchJaVectorFactory,
)

return ElasticSearchJaVectorFactory
case VectorType.TIDB_VECTOR: case VectorType.TIDB_VECTOR:
from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory from core.rag.datasource.vdb.tidb_vector.tidb_vector import TiDBVectorFactory



+ 1
- 0
api/core/rag/datasource/vdb/vector_type.py View File

TENCENT = "tencent" TENCENT = "tencent"
ORACLE = "oracle" ORACLE = "oracle"
ELASTICSEARCH = "elasticsearch" ELASTICSEARCH = "elasticsearch"
ELASTICSEARCH_JA = "elasticsearch-ja"
LINDORM = "lindorm" LINDORM = "lindorm"
COUCHBASE = "couchbase" COUCHBASE = "couchbase"
BAIDU = "baidu" BAIDU = "baidu"

+ 2
- 2
docker/.env.example View File

# ------------------------------ # ------------------------------


# The type of vector store to use. # The type of vector store to use.
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
# Supported values are `weaviate`, `qdrant`, `milvus`, `myscale`, `relyt`, `pgvector`, `pgvecto-rs`, `chroma`, `opensearch`, `tidb_vector`, `oracle`, `tencent`, `elasticsearch`, `elasticsearch-ja`, `analyticdb`, `couchbase`, `vikingdb`, `oceanbase`.
VECTOR_STORE=weaviate VECTOR_STORE=weaviate


# The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`.
TENCENT_VECTOR_DB_REPLICAS=2 TENCENT_VECTOR_DB_REPLICAS=2


# ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch` # ElasticSearch configuration, only available when VECTOR_STORE is `elasticsearch`
ELASTICSEARCH_HOST=0.0.0.0
ELASTICSEARCH_HOST=elasticsearch
ELASTICSEARCH_PORT=9200 ELASTICSEARCH_PORT=9200
ELASTICSEARCH_USERNAME=elastic ELASTICSEARCH_USERNAME=elastic
ELASTICSEARCH_PASSWORD=elastic ELASTICSEARCH_PASSWORD=elastic

+ 9
- 1
docker/docker-compose.yaml View File

container_name: elasticsearch container_name: elasticsearch
profiles: profiles:
- elasticsearch - elasticsearch
- elasticsearch-ja
restart: always restart: always
volumes: volumes:
- ./elasticsearch/docker-entrypoint.sh:/docker-entrypoint-mount.sh
- dify_es01_data:/usr/share/elasticsearch/data - dify_es01_data:/usr/share/elasticsearch/data
environment: environment:
ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic} ELASTIC_PASSWORD: ${ELASTICSEARCH_PASSWORD:-elastic}
VECTOR_STORE: ${VECTOR_STORE:-}
cluster.name: dify-es-cluster cluster.name: dify-es-cluster
node.name: dify-es0 node.name: dify-es0
discovery.type: single-node discovery.type: single-node
xpack.license.self_generated.type: trial
xpack.license.self_generated.type: basic
xpack.security.enabled: 'true' xpack.security.enabled: 'true'
xpack.security.enrollment.enabled: 'false' xpack.security.enrollment.enabled: 'false'
xpack.security.http.ssl.enabled: 'false' xpack.security.http.ssl.enabled: 'false'
ports: ports:
- ${ELASTICSEARCH_PORT:-9200}:9200 - ${ELASTICSEARCH_PORT:-9200}:9200
deploy:
resources:
limits:
memory: 2g
entrypoint: [ 'sh', '-c', "sh /docker-entrypoint-mount.sh" ]
healthcheck: healthcheck:
test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ] test: [ 'CMD', 'curl', '-s', 'http://localhost:9200/_cluster/health?pretty' ]
interval: 30s interval: 30s

+ 25
- 0
docker/elasticsearch/docker-entrypoint.sh View File

#!/bin/bash

set -e

if [ "${VECTOR_STORE}" = "elasticsearch-ja" ]; then
# Check if the ICU tokenizer plugin is installed
if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-icu; then
printf '%s\n' "Installing the ICU tokenizer plugin"
if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu; then
printf '%s\n' "Failed to install the ICU tokenizer plugin"
exit 1
fi
fi
# Check if the Japanese language analyzer plugin is installed
if ! /usr/share/elasticsearch/bin/elasticsearch-plugin list | grep -q analysis-kuromoji; then
printf '%s\n' "Installing the Japanese language analyzer plugin"
if ! /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji; then
printf '%s\n' "Failed to install the Japanese language analyzer plugin"
exit 1
fi
fi
fi

# Run the original entrypoint script
exec /bin/tini -- /usr/local/bin/docker-entrypoint.sh

Loading…
Cancel
Save