Преглед на файлове

add vdb document id index (#16244)

Co-authored-by: crazywoola <427733928@qq.com>
tags/1.1.1
Jyong преди 7 месеца
родител
ревизия
d135677c25
No account linked to committer's email address

+ 2
- 1
api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py Целия файл

@@ -196,7 +196,8 @@ class ElasticSearchVector(BaseVector):
Field.METADATA_KEY.value: {
"type": "object",
"properties": {
"doc_id": {"type": "keyword"} # Map doc_id to keyword type
"doc_id": {"type": "keyword"}, # Map doc_id to keyword type
"document_id": {"type": "keyword"}, # Map document_id to keyword type
},
},
}

+ 1
- 0
api/core/rag/datasource/vdb/field.py Целия файл

@@ -11,3 +11,4 @@ class Field(Enum):
TEXT_KEY = "text"
PRIMARY_KEY = "id"
DOC_ID = "metadata.doc_id"
DOCUMENT_ID = "metadata.document_id"

+ 4
- 0
api/core/rag/datasource/vdb/qdrant/qdrant_vector.py Целия файл

@@ -134,6 +134,10 @@ class QdrantVector(BaseVector):
self._client.create_payload_index(
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
)
# create document_id payload index
self._client.create_payload_index(
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
)
# create full text index
text_index_params = TextIndexParams(
type=TextIndexType.TEXT,

+ 14
- 22
api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py Целия файл

@@ -144,6 +144,10 @@ class TidbOnQdrantVector(BaseVector):
self._client.create_payload_index(
collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD
)
# create document_id payload index
self._client.create_payload_index(
collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD
)
# create full text index
text_index_params = TextIndexParams(
type=TextIndexType.TEXT,
@@ -318,23 +322,17 @@ class TidbOnQdrantVector(BaseVector):
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
from qdrant_client.http import models

filter = models.Filter(
must=[
models.FieldCondition(
key="group_id",
match=models.MatchValue(value=self._group_id),
),
],
)
filter = None
document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter:
if filter.must:
filter.must.append(
filter = models.Filter(
must=[
models.FieldCondition(
key="metadata.document_id",
match=models.MatchAny(any=document_ids_filter),
)
)
],
)
results = self._client.search(
collection_name=self._collection_name,
query_vector=query_vector,
@@ -369,23 +367,17 @@ class TidbOnQdrantVector(BaseVector):
"""
from qdrant_client.http import models

scroll_filter = models.Filter(
must=[
models.FieldCondition(
key="page_content",
match=models.MatchText(text=query),
)
]
)
scroll_filter = None
document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter:
if scroll_filter.must:
scroll_filter.must.append(
scroll_filter = models.Filter(
must=[
models.FieldCondition(
key="metadata.document_id",
match=models.MatchAny(any=document_ids_filter),
)
)
]
)
response = self._client.scroll(
collection_name=self._collection_name,
scroll_filter=scroll_filter,

+ 2
- 0
api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py Целия файл

@@ -105,10 +105,12 @@ class TiDBVector(BaseVector):
text TEXT NOT NULL,
meta JSON NOT NULL,
doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED,
document_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.document_id'))) STORED,
vector VECTOR<FLOAT>({dimension}) NOT NULL,
create_time DATETIME DEFAULT CURRENT_TIMESTAMP,
update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY (doc_id),
KEY (document_id),
VECTOR INDEX idx_vector (({tidb_dist_func}(vector))) USING HNSW
);
""")

Loading…
Отказ
Запис