Browse Source

Lindorm VDB bugfix (#17357)

Co-authored-by: jiangzhijie <jiangzhijie.jzj@alibaba-inc.com>
tags/1.2.0
Jiang 7 months ago
parent
commit
fd1e40d22e
No account linked to committer's email address
1 changed file with 49 additions and 14 deletions
  1. 49
    14
      api/core/rag/datasource/vdb/lindorm/lindorm_vector.py

+ 49
- 14
api/core/rag/datasource/vdb/lindorm/lindorm_vector.py View File

import time import time
from typing import Any, Optional from typing import Any, Optional


from opensearchpy import OpenSearch
from opensearchpy import OpenSearch, helpers
from opensearchpy.helpers import BulkIndexError
from pydantic import BaseModel, model_validator from pydantic import BaseModel, model_validator
from tenacity import retry, stop_after_attempt, wait_exponential from tenacity import retry, stop_after_attempt, wait_exponential


actions.append(action_header) actions.append(action_header)
actions.append(action_values) actions.append(action_values)


logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")
# logger.info(f"Processing batch {batch_num + 1}/{num_batches} (documents {start_idx + 1} to {end_idx})")


try: try:
_bulk_with_retry(actions) _bulk_with_retry(actions)
logger.info(f"Successfully processed batch {batch_num + 1}")
# logger.info(f"Successfully processed batch {batch_num + 1}")
# simple latency to avoid too many requests in a short time # simple latency to avoid too many requests in a short time
if batch_num < num_batches - 1: if batch_num < num_batches - 1:
time.sleep(1)
time.sleep(0.5)


except Exception: except Exception:
logger.exception(f"Failed to process batch {batch_num + 1}") logger.exception(f"Failed to process batch {batch_num + 1}")
self.delete_by_ids(ids) self.delete_by_ids(ids)


def delete_by_ids(self, ids: list[str]) -> None:
    """Delete documents by their IDs in a single bulk request.

    Existence of each ID is checked first; missing IDs are logged and
    skipped rather than raising. Bulk failures are inspected per-item so
    a 404 (already gone) is only a warning, not an error.

    Args:
        ids: List of document IDs to delete. An empty list is a no-op.
    """
    if not ids:
        return

    # Routing is only required when the index uses UGC (multi-tenant) mode.
    params = {"routing": self._routing} if self._using_ugc else {}

    # 1. Nothing to do if the collection itself is gone.
    if not self._client.indices.exists(index=self._collection_name):
        logger.warning(f"Collection {self._collection_name} does not exist")
        return

    # 2. Collect delete actions for every ID that actually exists, instead
    #    of issuing one DELETE request per document.
    actions = []
    for doc_id in ids:
        if self._client.exists(index=self._collection_name, id=doc_id, params=params):
            actions.append(
                {
                    "_op_type": "delete",
                    "_index": self._collection_name,
                    "_id": doc_id,
                    **params,  # Include routing if using UGC
                }
            )
        else:
            logger.warning(f"DELETE BY ID: ID {doc_id} does not exist in the index.")

    # 3. Perform bulk deletion if there are valid documents to delete.
    if actions:
        try:
            helpers.bulk(self._client, actions)
        except BulkIndexError as e:
            # Triage each failed item: 404 means the doc vanished between
            # the exists() check and the bulk call — benign; anything else
            # is a real error.
            for error in e.errors:
                delete_error = error.get("delete", {})
                status = delete_error.get("status")
                failed_id = delete_error.get("_id")

                if status == 404:
                    logger.warning(f"Document not found for deletion: {failed_id}")
                else:
                    logger.exception(f"Error deleting document: {error}")

def delete(self) -> None: def delete(self) -> None:
if self._using_ugc: if self._using_ugc:
routing_filter_query = { routing_filter_query = {
document_ids_filter = kwargs.get("document_ids_filter") document_ids_filter = kwargs.get("document_ids_filter")
filters = [] filters = []
if document_ids_filter: if document_ids_filter:
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs) query = default_vector_search_query(query_vector=query_vector, k=top_k, filters=filters, **kwargs)


try: try:
filters = kwargs.get("filter", []) filters = kwargs.get("filter", [])
document_ids_filter = kwargs.get("document_ids_filter") document_ids_filter = kwargs.get("document_ids_filter")
if document_ids_filter: if document_ids_filter:
filters.append({"terms": {"metadata.document_id": document_ids_filter}})
filters.append({"terms": {"metadata.document_id.keyword": document_ids_filter}})
routing = self._routing routing = self._routing
full_text_query = default_text_search_query( full_text_query = default_text_search_query(
query_text=query, query_text=query,
routing=routing, routing=routing,
routing_field=self._routing_field, routing_field=self._routing_field,
) )

response = self._client.search(index=self._collection_name, body=full_text_query) response = self._client.search(index=self._collection_name, body=full_text_query)
docs = [] docs = []
for hit in response["hits"]["hits"]: for hit in response["hits"]["hits"]:
**kwargs, **kwargs,
) -> dict: ) -> dict:
if filters is not None: if filters is not None:
filter_type = "post_filter" if filter_type is None else filter_type
filter_type = "pre_filter" if filter_type is None else filter_type
if not isinstance(filters, list): if not isinstance(filters, list):
raise RuntimeError(f"unexpected filter with {type(filters)}") raise RuntimeError(f"unexpected filter with {type(filters)}")
final_ext: dict[str, Any] = {"lvector": {}} final_ext: dict[str, Any] = {"lvector": {}}

Loading…
Cancel
Save