import os

import pytest

from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
from core.rag.models.document import Document
from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis


class TestClickzettaVector(AbstractVectorTest):
    """
    Test cases for Clickzetta vector database integration.
    """

    @pytest.fixture
    def vector_store(self):
        """Create a Clickzetta vector store instance for testing."""
        # Skip the whole test when any required Clickzetta credential is absent.
        for env_var in ("CLICKZETTA_USERNAME", "CLICKZETTA_PASSWORD", "CLICKZETTA_INSTANCE"):
            if not os.getenv(env_var):
                pytest.skip(f"{env_var} is not configured")

        config = ClickzettaConfig(
            username=os.getenv("CLICKZETTA_USERNAME", ""),
            password=os.getenv("CLICKZETTA_PASSWORD", ""),
            instance=os.getenv("CLICKZETTA_INSTANCE", ""),
            service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
            workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
            vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
            schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
            batch_size=10,  # Small batch size for testing
            enable_inverted_index=True,
            analyzer_type="chinese",
            analyzer_mode="smart",
            vector_distance_function="cosine_distance",
        )

        # The pid suffix keeps concurrent test runs from colliding on one collection.
        with setup_mock_redis():
            store = ClickzettaVector(
                collection_name=f"test_collection_{os.getpid()}",
                config=config,
            )

        yield store

        # Teardown: drop the test collection; ignore failures so cleanup
        # problems never mask the test result.
        try:
            store.delete()
        except Exception:
            pass

    def test_clickzetta_vector_basic_operations(self, vector_store):
        """Test basic CRUD operations on Clickzetta vector store."""
        contents = [
            "这是第一个测试文档,包含一些中文内容。",
            "This is the second test document with English content.",
            "第三个文档混合了English和中文内容。",
        ]
        vectors = [
            [0.1, 0.2, 0.3, 0.4],
            [0.5, 0.6, 0.7, 0.8],
            [0.9, 1.0, 1.1, 1.2],
        ]
        docs = [
            Document(page_content=content, metadata={"doc_id": f"doc_{idx}", "source": "test"})
            for idx, content in enumerate(contents)
        ]

        # Initial insert.
        vector_store.create(texts=docs, embeddings=vectors)

        # Existence checks for a present and an absent id.
        assert vector_store.text_exists("doc_0")
        assert not vector_store.text_exists("doc_999")

        # Vector similarity search: querying with the first embedding
        # should rank the first document on top.
        matches = vector_store.search_by_vector([0.1, 0.2, 0.3, 0.4], top_k=2)
        assert len(matches) > 0
        assert matches[0].page_content == contents[0]

        # Full-text search with a Chinese term.
        matches = vector_store.search_by_full_text("中文", top_k=3)
        assert len(matches) >= 2

        # Full-text search with an English term.
        matches = vector_store.search_by_full_text("English", top_k=3)
        assert len(matches) >= 2

        # Delete one document by id and confirm only it disappeared.
        vector_store.delete_by_ids(["doc_0"])
        assert not vector_store.text_exists("doc_0")
        assert vector_store.text_exists("doc_1")

        # Delete the remainder via a metadata-field match.
        vector_store.delete_by_metadata_field("source", "test")
        assert not vector_store.text_exists("doc_1")
        assert not vector_store.text_exists("doc_2")

    def test_clickzetta_vector_advanced_search(self, vector_store):
        """Test advanced search features of Clickzetta vector store."""
        # Ten documents with richer metadata; every third shares a document_id group.
        docs = [
            Document(
                page_content=f"Document {idx}: " + get_example_text(),
                metadata={
                    "doc_id": f"adv_doc_{idx}",
                    "category": "technical" if idx % 2 == 0 else "general",
                    "document_id": f"doc_{idx // 3}",
                    "importance": idx,
                },
            )
            for idx in range(10)
        ]
        vectors = [[0.1 * idx, 0.2 * idx, 0.3 * idx, 0.4 * idx] for idx in range(10)]

        vector_store.create(texts=docs, embeddings=vectors)

        query = [0.5, 1.0, 1.5, 2.0]

        # Restrict the vector search to two document_id groups and verify
        # the filter was honored.
        filtered = vector_store.search_by_vector(
            query,
            top_k=5,
            document_ids_filter=["doc_0", "doc_1"],
        )
        assert len(filtered) > 0
        for hit in filtered:
            assert hit.metadata["document_id"] in ["doc_0", "doc_1"]

        # Every hit returned under a score threshold must meet that threshold.
        thresholded = vector_store.search_by_vector(
            query,
            top_k=10,
            score_threshold=0.5,
        )
        for hit in thresholded:
            assert hit.metadata.get("score", 0) >= 0.5

    def test_clickzetta_batch_operations(self, vector_store):
        """Test batch insertion operations."""
        # 25 documents exceeds the configured batch_size of 10, forcing
        # the store to split the insert into multiple batches.
        total = 25
        docs = [
            Document(
                page_content=f"Batch document {idx}: This is a test document for batch processing.",
                metadata={"doc_id": f"batch_doc_{idx}", "batch": "test_batch"},
            )
            for idx in range(total)
        ]
        vectors = [
            [0.1 * (idx % 10), 0.2 * (idx % 10), 0.3 * (idx % 10), 0.4 * (idx % 10)]
            for idx in range(total)
        ]

        vector_store.add_texts(documents=docs, embeddings=vectors)

        # Every document must have survived the batched insert.
        for idx in range(total):
            assert vector_store.text_exists(f"batch_doc_{idx}")

        vector_store.delete_by_metadata_field("batch", "test_batch")

    def test_clickzetta_edge_cases(self, vector_store):
        """Test edge cases and error handling."""
        # Empty operations should be no-ops, not errors.
        vector_store.create(texts=[], embeddings=[])
        vector_store.add_texts(documents=[], embeddings=[])
        vector_store.delete_by_ids([])

        # Content containing characters that commonly need escaping.
        tricky = Document(
            page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
            metadata={"doc_id": "special_doc", "test": "edge_case"},
        )
        vector_store.add_texts(documents=[tricky], embeddings=[[0.1, 0.2, 0.3, 0.4]])
        assert vector_store.text_exists("special_doc")

        # Full-text search over the special content; the capability may be
        # unavailable, so only assert when results came back.
        hits = vector_store.search_by_full_text("quotes", top_k=1)
        if hits:
            assert len(hits) > 0

        vector_store.delete_by_ids(["special_doc"])

    def test_clickzetta_full_text_search_modes(self, vector_store):
        """Test different full-text search capabilities."""
        # Mixed Chinese/English corpus; (doc_id, lang, text) triples.
        corpus = [
            ("cn_doc_1", "chinese", "云器科技提供强大的Lakehouse解决方案"),
            ("en_doc_1", "english", "Clickzetta provides powerful Lakehouse solutions"),
            ("cn_doc_2", "chinese", "Lakehouse是现代数据架构的重要组成部分"),
            ("en_doc_2", "english", "Modern data architecture includes Lakehouse technology"),
        ]
        docs = [
            Document(page_content=text, metadata={"doc_id": doc_id, "lang": lang})
            for doc_id, lang, text in corpus
        ]
        vectors = [[0.1, 0.2, 0.3, 0.4] for _ in docs]

        vector_store.create(texts=docs, embeddings=vectors)

        # "Lakehouse" occurs in both Chinese and English documents.
        assert len(vector_store.search_by_full_text("Lakehouse", top_k=4)) >= 2

        # English-only term.
        assert len(vector_store.search_by_full_text("solutions", top_k=2)) >= 1

        # Chinese phrase.
        assert len(vector_store.search_by_full_text("数据架构", top_k=2)) >= 1

        # Clean up both language groups.
        vector_store.delete_by_metadata_field("lang", "chinese")
        vector_store.delete_by_metadata_field("lang", "english")
|