Signed-off-by: Yuichiro Utsumi <utsumi.yuichiro@fujitsu.com>
tags/1.1.0
| description="Max connection of the PostgreSQL database", | description="Max connection of the PostgreSQL database", | ||||
| default=5, | default=5, | ||||
| ) | ) | ||||
| PGVECTOR_PG_BIGM: bool = Field( | |||||
| description="Whether to use pg_bigm module for full text search", | |||||
| default=False, | |||||
| ) | 
| database: str | database: str | ||||
| min_connection: int | min_connection: int | ||||
| max_connection: int | max_connection: int | ||||
| pg_bigm: bool = False | |||||
| @model_validator(mode="before") | @model_validator(mode="before") | ||||
| @classmethod | @classmethod | ||||
| USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); | USING hnsw (embedding vector_cosine_ops) WITH (m = 16, ef_construction = 64); | ||||
| """ | """ | ||||
| SQL_CREATE_INDEX_PG_BIGM = """ | |||||
| CREATE INDEX IF NOT EXISTS bigm_idx ON {table_name} | |||||
| USING gin (text gin_bigm_ops); | |||||
| """ | |||||
| class PGVector(BaseVector): | class PGVector(BaseVector): | ||||
| def __init__(self, collection_name: str, config: PGVectorConfig): | def __init__(self, collection_name: str, config: PGVectorConfig): | ||||
| super().__init__(collection_name) | super().__init__(collection_name) | ||||
| self.pool = self._create_connection_pool(config) | self.pool = self._create_connection_pool(config) | ||||
| self.table_name = f"embedding_{collection_name}" | self.table_name = f"embedding_{collection_name}" | ||||
| self.pg_bigm = config.pg_bigm | |||||
| def get_type(self) -> str: | def get_type(self) -> str: | ||||
| return VectorType.PGVECTOR | return VectorType.PGVECTOR | ||||
| top_k = kwargs.get("top_k", 5) | top_k = kwargs.get("top_k", 5) | ||||
| with self._get_cursor() as cur: | with self._get_cursor() as cur: | ||||
| cur.execute( | |||||
| f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score | |||||
| FROM {self.table_name} | |||||
| WHERE to_tsvector(text) @@ plainto_tsquery(%s) | |||||
| ORDER BY score DESC | |||||
| LIMIT {top_k}""", | |||||
| # f"'{query}'" is required in order to account for whitespace in query | |||||
| (f"'{query}'", f"'{query}'"), | |||||
| ) | |||||
| if self.pg_bigm: | |||||
| cur.execute("SET pg_bigm.similarity_limit TO 0.000001") | |||||
| cur.execute( | |||||
| f"""SELECT meta, text, bigm_similarity(unistr(%s), coalesce(text, '')) AS score | |||||
| FROM {self.table_name} | |||||
| WHERE text =%% unistr(%s) | |||||
| ORDER BY score DESC | |||||
| LIMIT {top_k}""", | |||||
| # f"'{query}'" is required in order to account for whitespace in query | |||||
| (f"'{query}'", f"'{query}'"), | |||||
| ) | |||||
| else: | |||||
| cur.execute( | |||||
| f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score | |||||
| FROM {self.table_name} | |||||
| WHERE to_tsvector(text) @@ plainto_tsquery(%s) | |||||
| ORDER BY score DESC | |||||
| LIMIT {top_k}""", | |||||
| # f"'{query}'" is required in order to account for whitespace in query | |||||
| (f"'{query}'", f"'{query}'"), | |||||
| ) | |||||
| docs = [] | docs = [] | ||||
| # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing | # ref: https://github.com/pgvector/pgvector?tab=readme-ov-file#indexing | ||||
| if dimension <= 2000: | if dimension <= 2000: | ||||
| cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name)) | cur.execute(SQL_CREATE_INDEX.format(table_name=self.table_name)) | ||||
| if self.pg_bigm: | |||||
| cur.execute("CREATE EXTENSION IF NOT EXISTS pg_bigm") | |||||
| cur.execute(SQL_CREATE_INDEX_PG_BIGM.format(table_name=self.table_name)) | |||||
| redis_client.set(collection_exist_cache_key, 1, ex=3600) | redis_client.set(collection_exist_cache_key, 1, ex=3600) | ||||
| database=dify_config.PGVECTOR_DATABASE or "postgres", | database=dify_config.PGVECTOR_DATABASE or "postgres", | ||||
| min_connection=dify_config.PGVECTOR_MIN_CONNECTION, | min_connection=dify_config.PGVECTOR_MIN_CONNECTION, | ||||
| max_connection=dify_config.PGVECTOR_MAX_CONNECTION, | max_connection=dify_config.PGVECTOR_MAX_CONNECTION, | ||||
| pg_bigm=dify_config.PGVECTOR_PG_BIGM, | |||||
| ), | ), | ||||
| ) | ) | 
| PGVECTOR_DATABASE=dify | PGVECTOR_DATABASE=dify | ||||
| PGVECTOR_MIN_CONNECTION=1 | PGVECTOR_MIN_CONNECTION=1 | ||||
| PGVECTOR_MAX_CONNECTION=5 | PGVECTOR_MAX_CONNECTION=5 | ||||
| PGVECTOR_PG_BIGM=false | |||||
| PGVECTOR_PG_BIGM_VERSION=1.2-20240606 | |||||
| # pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs` | # pgvecto-rs configurations, only available when VECTOR_STORE is `pgvecto-rs` | ||||
| PGVECTO_RS_HOST=pgvecto-rs | PGVECTO_RS_HOST=pgvecto-rs | 
| POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | ||||
| # postgres data directory | # postgres data directory | ||||
| PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | ||||
| # pg_bigm module for full text search | |||||
| PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||||
| PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||||
| volumes: | volumes: | ||||
| - ./volumes/pgvector/data:/var/lib/postgresql/data | - ./volumes/pgvector/data:/var/lib/postgresql/data | ||||
| - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh | |||||
| entrypoint: [ '/docker-entrypoint.sh' ] | |||||
| healthcheck: | healthcheck: | ||||
| test: [ 'CMD', 'pg_isready' ] | test: [ 'CMD', 'pg_isready' ] | ||||
| interval: 1s | interval: 1s | 
| PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify} | PGVECTOR_DATABASE: ${PGVECTOR_DATABASE:-dify} | ||||
| PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1} | PGVECTOR_MIN_CONNECTION: ${PGVECTOR_MIN_CONNECTION:-1} | ||||
| PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5} | PGVECTOR_MAX_CONNECTION: ${PGVECTOR_MAX_CONNECTION:-5} | ||||
| PGVECTOR_PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||||
| PGVECTOR_PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||||
| PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs} | PGVECTO_RS_HOST: ${PGVECTO_RS_HOST:-pgvecto-rs} | ||||
| PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432} | PGVECTO_RS_PORT: ${PGVECTO_RS_PORT:-5432} | ||||
| PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres} | PGVECTO_RS_USER: ${PGVECTO_RS_USER:-postgres} | ||||
| POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | POSTGRES_DB: ${PGVECTOR_POSTGRES_DB:-dify} | ||||
| # postgres data directory | # postgres data directory | ||||
| PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | PGDATA: ${PGVECTOR_PGDATA:-/var/lib/postgresql/data/pgdata} | ||||
| # pg_bigm module for full text search | |||||
| PG_BIGM: ${PGVECTOR_PG_BIGM:-false} | |||||
| PG_BIGM_VERSION: ${PGVECTOR_PG_BIGM_VERSION:-1.2-20240606} | |||||
| volumes: | volumes: | ||||
| - ./volumes/pgvector/data:/var/lib/postgresql/data | - ./volumes/pgvector/data:/var/lib/postgresql/data | ||||
| - ./pgvector/docker-entrypoint.sh:/docker-entrypoint.sh | |||||
| entrypoint: [ '/docker-entrypoint.sh' ] | |||||
| healthcheck: | healthcheck: | ||||
| test: [ 'CMD', 'pg_isready' ] | test: [ 'CMD', 'pg_isready' ] | ||||
| interval: 1s | interval: 1s | 
#!/bin/bash
# Entrypoint wrapper for the pgvector container.
#
# When PG_BIGM=true, builds and installs the pg_bigm extension (release
# PG_BIGM_VERSION) from source, then hands control to the image's stock
# entrypoint with pg_bigm preloaded. When PG_BIGM is anything else, it only
# delegates to the stock entrypoint.
#
# Environment:
#   PG_BIGM          "true" to install and preload pg_bigm (default: false)
#   PG_BIGM_VERSION  pg_bigm release tag without the leading "v"
#   PG_MAJOR         PostgreSQL major version for the server-dev package
#                    (default: 16)

# Abort on the first failing step so a broken download or build is not
# silently followed by a server start without the extension.
set -euo pipefail

# Honor a PG_MAJOR already present in the environment; fall back to 16.
PG_MAJOR="${PG_MAJOR:-16}"
PG_CONFIG=/usr/bin/pg_config

# Start the server when no explicit command was handed to the wrapper.
if [ "$#" -eq 0 ]; then
  set -- postgres
fi

if [ "${PG_BIGM:-false}" = "true" ]; then
  : "${PG_BIGM_VERSION:?PG_BIGM_VERSION must be set when PG_BIGM=true}"

  # install pg_bigm
  # Skip the build when the library is already in the package lib dir, so a
  # plain container restart does not recompile it.
  if [ ! -x "${PG_CONFIG}" ] || [ ! -f "$("${PG_CONFIG}" --pkglibdir)/pg_bigm.so" ]; then
    apt-get update
    apt-get install -y curl make gcc "postgresql-server-dev-${PG_MAJOR}"

    # Build in a throwaway directory instead of the current working directory.
    build_dir="$(mktemp -d)"
    # -f makes curl fail on HTTP errors instead of saving an error page.
    curl -fsSL -o "${build_dir}/pg_bigm.tar.gz" \
      "https://github.com/pgbigm/pg_bigm/archive/refs/tags/v${PG_BIGM_VERSION}.tar.gz"
    tar xf "${build_dir}/pg_bigm.tar.gz" -C "${build_dir}"
    make -C "${build_dir}/pg_bigm-${PG_BIGM_VERSION}" USE_PGXS=1 PG_CONFIG="${PG_CONFIG}"
    make -C "${build_dir}/pg_bigm-${PG_BIGM_VERSION}" USE_PGXS=1 PG_CONFIG="${PG_CONFIG}" install
    rm -rf "${build_dir}"
  fi

  # enable pg_bigm
  # Pass the setting on the server command line rather than editing
  # postgresql.conf: the file does not exist before the first initdb, and its
  # location depends on PGDATA.
  # NOTE: a command-line setting takes precedence over postgresql.conf, so any
  # shared_preload_libraries value configured there is overridden.
  if [ "$1" = "postgres" ]; then
    set -- "$@" -c shared_preload_libraries=pg_bigm
  fi
fi

# Run the original entrypoint script
exec /usr/local/bin/docker-entrypoint.sh "$@"