Signed-off-by: Yuichiro Utsumi <utsumi.yuichiro@fujitsu.com>
tags/1.1.0
| @@ -43,3 +43,8 @@ class PGVectorConfig(BaseSettings): | |||
| description="Max connection of the PostgreSQL database", | |||
| default=5, | |||
| ) | |||
| PGVECTOR_PG_BIGM: bool = Field( | |||
| description="Whether to use pg_bigm module for full text search", | |||
| default=False, | |||
| ) | |||
| @@ -25,6 +25,7 @@ class PGVectorConfig(BaseModel): | |||
| database: str | |||
| min_connection: int | |||
| max_connection: int | |||
| pg_bigm: bool = False | |||
| @model_validator(mode="before") | |||
| @classmethod | |||
| @@ -62,12 +63,18 @@ CREATE INDEX IF NOT EXISTS embedding_cosine_v1_idx ON {table_name} | |||
| USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); | |||
| """ | |||
# GIN index used by pg_bigm (bigram) full-text search on the `text` column.
#
# The index name is derived from the table name: PostgreSQL index names are
# unique per schema, so a fixed name combined with IF NOT EXISTS would create
# the index for the first collection table only and silently skip all others.
# The table name is placed last so that, if PostgreSQL truncates the
# identifier to its maximum length, the fixed prefix is always preserved.
# NOTE(review): very long table names may still be truncated; confirm that the
# retained portion stays unique for the collection naming scheme in use.
SQL_CREATE_INDEX_PG_BIGM = """
CREATE INDEX IF NOT EXISTS bigm_idx_{table_name} ON {table_name}
USING gin (text gin_bigm_ops);
"""
| class PGVector(BaseVector): | |||
| def __init__(self, collection_name: str, config: PGVectorConfig): | |||
| super().__init__(collection_name) | |||
| self.pool = self._create_connection_pool(config) | |||
| self.table_name = f"embedding_{collection_name}" | |||
| self.pg_bigm = config.pg_bigm | |||
| def get_type(self) -> str: | |||
| return VectorType.PGVECTOR | |||
| @@ -176,15 +183,27 @@ class PGVector(BaseVector): | |||
| top_k = kwargs.get("top_k", 5) | |||
| with self._get_cursor() as cur: | |||
| cur.execute( | |||
| f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score | |||
| FROM {self.table_name} | |||
| WHERE to_tsvector(text) @@ plainto_tsquery(%s) | |||
| ORDER BY score DESC | |||
| LIMIT {top_k}""", | |||
| # f"'{query}'" is required in order to account for whitespace in query | |||
| (f"'{query}'", f"'{query}'"), | |||
| ) | |||
| if self.pg_bigm: | |||
| cur.execute("SET pg_bigm.similarity_limit TO 0.000001") | |||
| cur.execute( | |||
| f"""SELECT meta, text, bigm_similarity(unistr(%s), coalesce(text, '')) AS score | |||
| FROM {self.table_name} | |||
| WHERE text =%% unistr(%s) | |||
| ORDER BY score DESC | |||
| LIMIT {top_k}""", | |||
| # f"'{query}'" is required in order to account for whitespace in query | |||
| (f"'{query}'", f"'{query}'"), | |||
| ) | |||
| else: | |||
| cur.execute( | |||
| f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score | |||
| FROM {self.table_name} | |||
| WHERE to_tsvector(text) @@ plainto_tsquery(%s) | |||
| ORDER BY score DESC | |||
| LIMIT {top_k}""", | |||
| # f"'{query}'" is required in order to account for whitespace in query | |||
| (f"'{query}'", f"'{query}'"), | |||
| ) | |||
| docs = [] | |||
| @@ -214,6 +233,9 @@ class PGVector(BaseVector): | |||
| # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing | |||
| if dimension <= 2000: | |||
| cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name)) | |||
| if self.pg_bigm: | |||
| cur.execute("CREATE EXTENSION IF NOT EXISTS pg_bigm") | |||
| cur.execute(SQL_CREATE_INDEX_PG_BIGM.format(table_name=self.table_name)) | |||
| redis_client.set(collection_exist_cache_key, 1, ex=3600) | |||
| @@ -237,5 +259,6 @@ class PGVectorFactory(AbstractVectorFactory): | |||
| database=dify_config.PGVECTOR_DATABASE or "postgres", | |||
| min_connection=dify_config.PGVECTOR_MIN_CONNECTION, | |||
| max_connection=dify_config.PGVECTOR_MAX_CONNECTION, | |||
| pg_bigm=dify_config.PGVECTOR_PG_BIGM, | |||
| ), | |||
| ) | |||
| @@ -431,6 +431,8 @@ PGVECTOR_PASSWORD=difyai123456 | |||
| PGVECTOR_DATABASE=dify | |||
| PGVECTOR_MIN_CONNECTION=1 | |||
| PGVECTOR_MAX_CONNECTION=5 | |||
| PGVECTOR_PG_BIGM=false | |||
| PGVECTOR_PG_BIGM_VERSION=1.2-20240606 | |||
| # pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs` | |||
| PGVECTO_RS_HOST=pgvecto-rs | |||
| @@ -322,8 +322,13 @@ services: | |||
| POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | |||
| # postgres data directory | |||
| PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | |||
| # pg_bigm module for full text search | |||
| PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||
| PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||
| volumes: | |||
| - ./volumes/pgvector/data:/var/lib/postgresql/data | |||
| - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh | |||
| entrypoint: [ '/docker-entrypoint.sh' ] | |||
| healthcheck: | |||
| test: [ 'CMD', 'pg_isready' ] | |||
| interval: 1s | |||
| @@ -157,6 +157,8 @@ x-shared-env: &shared-api-worker-env | |||
| PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify} | |||
| PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1} | |||
| PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5} | |||
| PGVECTOR_PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||
| PGVECTOR_PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||
| PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs} | |||
| PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432} | |||
| PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres} | |||
| @@ -741,8 +743,13 @@ services: | |||
| POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | |||
| # postgres data directory | |||
| PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | |||
| # pg_bigm module for full text search | |||
| PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||
| PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||
| volumes: | |||
| - ./volumes/pgvector/data:/var/lib/postgresql/data | |||
| - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh | |||
| entrypoint: [ '/docker-entrypoint.sh' ] | |||
| healthcheck: | |||
| test: [ 'CMD', 'pg_isready' ] | |||
| interval: 1s | |||
| @@ -0,0 +1,24 @@ | |||
#!/bin/bash
# Wrapper entrypoint for the pgvector container.
#
# When PG_BIGM is "true", builds and installs the pg_bigm extension from
# source, enables it in shared_preload_libraries, and then hands control to
# the image's original entrypoint.
#
# Environment:
#   PG_BIGM          "true" to install/enable pg_bigm; any other value skips it.
#   PG_BIGM_VERSION  pg_bigm release tag to download (without the leading "v").
#   PG_MAJOR         PostgreSQL major version for the server-dev package
#                    (defaults to 16 when the image does not provide it).
#   PGDATA           data directory that contains postgresql.conf.

# Stop at the first failing step: starting the server after a broken build
# would only postpone the error to the first CREATE EXTENSION.
set -euo pipefail

PG_MAJOR="${PG_MAJOR:-16}"
PGDATA="${PGDATA:-/var/lib/postgresql/data/pgdata}"

# Download, build and install pg_bigm in a throw-away directory.
install_pg_bigm() {
  local build_dir
  build_dir="$(mktemp -d)"

  apt-get update
  apt-get install -y curl make gcc "postgresql-server-dev-${PG_MAJOR}"

  # -f makes curl fail on HTTP errors instead of saving an error page.
  curl -fSL -o "${build_dir}/pg_bigm.tar.gz" \
    "https://github.com/pgbigm/pg_bigm/archive/refs/tags/v${PG_BIGM_VERSION}.tar.gz"
  tar xf "${build_dir}/pg_bigm.tar.gz" -C "${build_dir}"

  make -C "${build_dir}/pg_bigm-${PG_BIGM_VERSION}" USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config
  make -C "${build_dir}/pg_bigm-${PG_BIGM_VERSION}" USE_PGXS=1 PG_CONFIG=/usr/bin/pg_config install

  rm -rf "${build_dir}"
}

if [ "${PG_BIGM:-false}" = "true" ]; then
  : "${PG_BIGM_VERSION:?PG_BIGM_VERSION must be set when PG_BIGM=true}"

  # install pg_bigm
  # A per-version marker avoids rebuilding (and needing network access) on
  # every restart of the same container; changing the version rebuilds.
  marker="/usr/local/share/pg_bigm-${PG_BIGM_VERSION}.installed"
  if [ ! -f "${marker}" ]; then
    install_pg_bigm
    touch "${marker}"
  fi

  # enable pg_bigm
  # Only a commented-out shared_preload_libraries line is rewritten, so an
  # explicit user setting is left alone.
  conf_file="${PGDATA}/postgresql.conf"
  if [ -f "${conf_file}" ]; then
    sed -i -e 's/^#\s*shared_preload_libraries.*/shared_preload_libraries = '\''pg_bigm'\''/' "${conf_file}"
  else
    # No config file yet (e.g. the data directory has not been initialized);
    # the setting is applied on a later start once the file exists.
    echo "NOTE: ${conf_file} not found; shared_preload_libraries left unchanged for this start." >&2
  fi
fi

# Run the original entrypoint script, defaulting to `postgres` when no
# command was passed to this wrapper.
if [ "$#" -eq 0 ]; then
  set -- postgres
fi
exec /usr/local/bin/docker-entrypoint.sh "$@"