| 
                        123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 | 
                        - import os
 - 
 - import pytest
 - 
 - from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
 - from core.rag.models.document import Document
 - from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
 - 
 - 
 - class TestClickzettaVector(AbstractVectorTest):
 -     """
 -     Test cases for Clickzetta vector database integration.
 -     """
 - 
 -     @pytest.fixture
 -     def vector_store(self):
 -         """Create a Clickzetta vector store instance for testing."""
 -         # Skip test if Clickzetta credentials are not configured
 -         if not os.getenv("CLICKZETTA_USERNAME"):
 -             pytest.skip("CLICKZETTA_USERNAME is not configured")
 -         if not os.getenv("CLICKZETTA_PASSWORD"):
 -             pytest.skip("CLICKZETTA_PASSWORD is not configured")
 -         if not os.getenv("CLICKZETTA_INSTANCE"):
 -             pytest.skip("CLICKZETTA_INSTANCE is not configured")
 - 
 -         config = ClickzettaConfig(
 -             username=os.getenv("CLICKZETTA_USERNAME", ""),
 -             password=os.getenv("CLICKZETTA_PASSWORD", ""),
 -             instance=os.getenv("CLICKZETTA_INSTANCE", ""),
 -             service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
 -             workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
 -             vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
 -             schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
 -             batch_size=10,  # Small batch size for testing
 -             enable_inverted_index=True,
 -             analyzer_type="chinese",
 -             analyzer_mode="smart",
 -             vector_distance_function="cosine_distance",
 -         )
 - 
 -         with setup_mock_redis():
 -             vector = ClickzettaVector(
 -                 collection_name="test_collection_" + str(os.getpid()),
 -                 config=config
 -             )
 - 
 -             yield vector
 - 
 -             # Cleanup: delete the test collection
 -             try:
 -                 vector.delete()
 -             except Exception:
 -                 pass
 - 
 -     def test_clickzetta_vector_basic_operations(self, vector_store):
 -         """Test basic CRUD operations on Clickzetta vector store."""
 -         # Prepare test data
 -         texts = [
 -             "这是第一个测试文档,包含一些中文内容。",
 -             "This is the second test document with English content.",
 -             "第三个文档混合了English和中文内容。",
 -         ]
 -         embeddings = [
 -             [0.1, 0.2, 0.3, 0.4],
 -             [0.5, 0.6, 0.7, 0.8],
 -             [0.9, 1.0, 1.1, 1.2],
 -         ]
 -         documents = [
 -             Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
 -             for i, text in enumerate(texts)
 -         ]
 - 
 -         # Test create (initial insert)
 -         vector_store.create(texts=documents, embeddings=embeddings)
 - 
 -         # Test text_exists
 -         assert vector_store.text_exists("doc_0")
 -         assert not vector_store.text_exists("doc_999")
 - 
 -         # Test search_by_vector
 -         query_vector = [0.1, 0.2, 0.3, 0.4]
 -         results = vector_store.search_by_vector(query_vector, top_k=2)
 -         assert len(results) > 0
 -         assert results[0].page_content == texts[0]  # Should match the first document
 - 
 -         # Test search_by_full_text (Chinese)
 -         results = vector_store.search_by_full_text("中文", top_k=3)
 -         assert len(results) >= 2  # Should find documents with Chinese content
 - 
 -         # Test search_by_full_text (English)
 -         results = vector_store.search_by_full_text("English", top_k=3)
 -         assert len(results) >= 2  # Should find documents with English content
 - 
 -         # Test delete_by_ids
 -         vector_store.delete_by_ids(["doc_0"])
 -         assert not vector_store.text_exists("doc_0")
 -         assert vector_store.text_exists("doc_1")
 - 
 -         # Test delete_by_metadata_field
 -         vector_store.delete_by_metadata_field("source", "test")
 -         assert not vector_store.text_exists("doc_1")
 -         assert not vector_store.text_exists("doc_2")
 - 
 -     def test_clickzetta_vector_advanced_search(self, vector_store):
 -         """Test advanced search features of Clickzetta vector store."""
 -         # Prepare test data with more complex metadata
 -         documents = []
 -         embeddings = []
 -         for i in range(10):
 -             doc = Document(
 -                 page_content=f"Document {i}: " + get_example_text(),
 -                 metadata={
 -                     "doc_id": f"adv_doc_{i}",
 -                     "category": "technical" if i % 2 == 0 else "general",
 -                     "document_id": f"doc_{i // 3}",  # Group documents
 -                     "importance": i,
 -                 }
 -             )
 -             documents.append(doc)
 -             # Create varied embeddings
 -             embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])
 - 
 -         vector_store.create(texts=documents, embeddings=embeddings)
 - 
 -         # Test vector search with document filter
 -         query_vector = [0.5, 1.0, 1.5, 2.0]
 -         results = vector_store.search_by_vector(
 -             query_vector,
 -             top_k=5,
 -             document_ids_filter=["doc_0", "doc_1"]
 -         )
 -         assert len(results) > 0
 -         # All results should belong to doc_0 or doc_1 groups
 -         for result in results:
 -             assert result.metadata["document_id"] in ["doc_0", "doc_1"]
 - 
 -         # Test score threshold
 -         results = vector_store.search_by_vector(
 -             query_vector,
 -             top_k=10,
 -             score_threshold=0.5
 -         )
 -         # Check that all results have a score above threshold
 -         for result in results:
 -             assert result.metadata.get("score", 0) >= 0.5
 - 
 -     def test_clickzetta_batch_operations(self, vector_store):
 -         """Test batch insertion operations."""
 -         # Prepare large batch of documents
 -         batch_size = 25
 -         documents = []
 -         embeddings = []
 - 
 -         for i in range(batch_size):
 -             doc = Document(
 -                 page_content=f"Batch document {i}: This is a test document for batch processing.",
 -                 metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"}
 -             )
 -             documents.append(doc)
 -             embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])
 - 
 -         # Test batch insert
 -         vector_store.add_texts(documents=documents, embeddings=embeddings)
 - 
 -         # Verify all documents were inserted
 -         for i in range(batch_size):
 -             assert vector_store.text_exists(f"batch_doc_{i}")
 - 
 -         # Clean up
 -         vector_store.delete_by_metadata_field("batch", "test_batch")
 - 
 -     def test_clickzetta_edge_cases(self, vector_store):
 -         """Test edge cases and error handling."""
 -         # Test empty operations
 -         vector_store.create(texts=[], embeddings=[])
 -         vector_store.add_texts(documents=[], embeddings=[])
 -         vector_store.delete_by_ids([])
 - 
 -         # Test special characters in content
 -         special_doc = Document(
 -             page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
 -             metadata={"doc_id": "special_doc", "test": "edge_case"}
 -         )
 -         embeddings = [[0.1, 0.2, 0.3, 0.4]]
 - 
 -         vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
 -         assert vector_store.text_exists("special_doc")
 - 
 -         # Test search with special characters
 -         results = vector_store.search_by_full_text("quotes", top_k=1)
 -         if results:  # Full-text search might not be available
 -             assert len(results) > 0
 - 
 -         # Clean up
 -         vector_store.delete_by_ids(["special_doc"])
 - 
 -     def test_clickzetta_full_text_search_modes(self, vector_store):
 -         """Test different full-text search capabilities."""
 -         # Prepare documents with various language content
 -         documents = [
 -             Document(
 -                 page_content="云器科技提供强大的Lakehouse解决方案",
 -                 metadata={"doc_id": "cn_doc_1", "lang": "chinese"}
 -             ),
 -             Document(
 -                 page_content="Clickzetta provides powerful Lakehouse solutions",
 -                 metadata={"doc_id": "en_doc_1", "lang": "english"}
 -             ),
 -             Document(
 -                 page_content="Lakehouse是现代数据架构的重要组成部分",
 -                 metadata={"doc_id": "cn_doc_2", "lang": "chinese"}
 -             ),
 -             Document(
 -                 page_content="Modern data architecture includes Lakehouse technology",
 -                 metadata={"doc_id": "en_doc_2", "lang": "english"}
 -             ),
 -         ]
 - 
 -         embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]
 - 
 -         vector_store.create(texts=documents, embeddings=embeddings)
 - 
 -         # Test Chinese full-text search
 -         results = vector_store.search_by_full_text("Lakehouse", top_k=4)
 -         assert len(results) >= 2  # Should find at least documents with "Lakehouse"
 - 
 -         # Test English full-text search
 -         results = vector_store.search_by_full_text("solutions", top_k=2)
 -         assert len(results) >= 1  # Should find English documents with "solutions"
 - 
 -         # Test mixed search
 -         results = vector_store.search_by_full_text("数据架构", top_k=2)
 -         assert len(results) >= 1  # Should find Chinese documents with this phrase
 - 
 -         # Clean up
 -         vector_store.delete_by_metadata_field("lang", "chinese")
 -         vector_store.delete_by_metadata_field("lang", "english")
 
 
  |