You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

test_clickzetta.py 9.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. import contextlib
  2. import os
  3. import pytest
  4. from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
  5. from core.rag.models.document import Document
  6. from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
  7. class TestClickzettaVector(AbstractVectorTest):
  8. """
  9. Test cases for Clickzetta vector database integration.
  10. """
  11. @pytest.fixture
  12. def vector_store(self):
  13. """Create a Clickzetta vector store instance for testing."""
  14. # Skip test if Clickzetta credentials are not configured
  15. if not os.getenv("CLICKZETTA_USERNAME"):
  16. pytest.skip("CLICKZETTA_USERNAME is not configured")
  17. if not os.getenv("CLICKZETTA_PASSWORD"):
  18. pytest.skip("CLICKZETTA_PASSWORD is not configured")
  19. if not os.getenv("CLICKZETTA_INSTANCE"):
  20. pytest.skip("CLICKZETTA_INSTANCE is not configured")
  21. config = ClickzettaConfig(
  22. username=os.getenv("CLICKZETTA_USERNAME", ""),
  23. password=os.getenv("CLICKZETTA_PASSWORD", ""),
  24. instance=os.getenv("CLICKZETTA_INSTANCE", ""),
  25. service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
  26. workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
  27. vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
  28. schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
  29. batch_size=10, # Small batch size for testing
  30. enable_inverted_index=True,
  31. analyzer_type="chinese",
  32. analyzer_mode="smart",
  33. vector_distance_function="cosine_distance",
  34. )
  35. with setup_mock_redis():
  36. vector = ClickzettaVector(collection_name="test_collection_" + str(os.getpid()), config=config)
  37. yield vector
  38. # Cleanup: delete the test collection
  39. with contextlib.suppress(Exception):
  40. vector.delete()
  41. def test_clickzetta_vector_basic_operations(self, vector_store):
  42. """Test basic CRUD operations on Clickzetta vector store."""
  43. # Prepare test data
  44. texts = [
  45. "这是第一个测试文档,包含一些中文内容。",
  46. "This is the second test document with English content.",
  47. "第三个文档混合了English和中文内容。",
  48. ]
  49. embeddings = [
  50. [0.1, 0.2, 0.3, 0.4],
  51. [0.5, 0.6, 0.7, 0.8],
  52. [0.9, 1.0, 1.1, 1.2],
  53. ]
  54. documents = [
  55. Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
  56. for i, text in enumerate(texts)
  57. ]
  58. # Test create (initial insert)
  59. vector_store.create(texts=documents, embeddings=embeddings)
  60. # Test text_exists
  61. assert vector_store.text_exists("doc_0")
  62. assert not vector_store.text_exists("doc_999")
  63. # Test search_by_vector
  64. query_vector = [0.1, 0.2, 0.3, 0.4]
  65. results = vector_store.search_by_vector(query_vector, top_k=2)
  66. assert len(results) > 0
  67. assert results[0].page_content == texts[0] # Should match the first document
  68. # Test search_by_full_text (Chinese)
  69. results = vector_store.search_by_full_text("中文", top_k=3)
  70. assert len(results) >= 2 # Should find documents with Chinese content
  71. # Test search_by_full_text (English)
  72. results = vector_store.search_by_full_text("English", top_k=3)
  73. assert len(results) >= 2 # Should find documents with English content
  74. # Test delete_by_ids
  75. vector_store.delete_by_ids(["doc_0"])
  76. assert not vector_store.text_exists("doc_0")
  77. assert vector_store.text_exists("doc_1")
  78. # Test delete_by_metadata_field
  79. vector_store.delete_by_metadata_field("source", "test")
  80. assert not vector_store.text_exists("doc_1")
  81. assert not vector_store.text_exists("doc_2")
  82. def test_clickzetta_vector_advanced_search(self, vector_store):
  83. """Test advanced search features of Clickzetta vector store."""
  84. # Prepare test data with more complex metadata
  85. documents = []
  86. embeddings = []
  87. for i in range(10):
  88. doc = Document(
  89. page_content=f"Document {i}: " + get_example_text(),
  90. metadata={
  91. "doc_id": f"adv_doc_{i}",
  92. "category": "technical" if i % 2 == 0 else "general",
  93. "document_id": f"doc_{i // 3}", # Group documents
  94. "importance": i,
  95. },
  96. )
  97. documents.append(doc)
  98. # Create varied embeddings
  99. embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])
  100. vector_store.create(texts=documents, embeddings=embeddings)
  101. # Test vector search with document filter
  102. query_vector = [0.5, 1.0, 1.5, 2.0]
  103. results = vector_store.search_by_vector(query_vector, top_k=5, document_ids_filter=["doc_0", "doc_1"])
  104. assert len(results) > 0
  105. # All results should belong to doc_0 or doc_1 groups
  106. for result in results:
  107. assert result.metadata["document_id"] in ["doc_0", "doc_1"]
  108. # Test score threshold
  109. results = vector_store.search_by_vector(query_vector, top_k=10, score_threshold=0.5)
  110. # Check that all results have a score above threshold
  111. for result in results:
  112. assert result.metadata.get("score", 0) >= 0.5
  113. def test_clickzetta_batch_operations(self, vector_store):
  114. """Test batch insertion operations."""
  115. # Prepare large batch of documents
  116. batch_size = 25
  117. documents = []
  118. embeddings = []
  119. for i in range(batch_size):
  120. doc = Document(
  121. page_content=f"Batch document {i}: This is a test document for batch processing.",
  122. metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"},
  123. )
  124. documents.append(doc)
  125. embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])
  126. # Test batch insert
  127. vector_store.add_texts(documents=documents, embeddings=embeddings)
  128. # Verify all documents were inserted
  129. for i in range(batch_size):
  130. assert vector_store.text_exists(f"batch_doc_{i}")
  131. # Clean up
  132. vector_store.delete_by_metadata_field("batch", "test_batch")
  133. def test_clickzetta_edge_cases(self, vector_store):
  134. """Test edge cases and error handling."""
  135. # Test empty operations
  136. vector_store.create(texts=[], embeddings=[])
  137. vector_store.add_texts(documents=[], embeddings=[])
  138. vector_store.delete_by_ids([])
  139. # Test special characters in content
  140. special_doc = Document(
  141. page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
  142. metadata={"doc_id": "special_doc", "test": "edge_case"},
  143. )
  144. embeddings = [[0.1, 0.2, 0.3, 0.4]]
  145. vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
  146. assert vector_store.text_exists("special_doc")
  147. # Test search with special characters
  148. results = vector_store.search_by_full_text("quotes", top_k=1)
  149. if results: # Full-text search might not be available
  150. assert len(results) > 0
  151. # Clean up
  152. vector_store.delete_by_ids(["special_doc"])
  153. def test_clickzetta_full_text_search_modes(self, vector_store):
  154. """Test different full-text search capabilities."""
  155. # Prepare documents with various language content
  156. documents = [
  157. Document(
  158. page_content="云器科技提供强大的Lakehouse解决方案", metadata={"doc_id": "cn_doc_1", "lang": "chinese"}
  159. ),
  160. Document(
  161. page_content="Clickzetta provides powerful Lakehouse solutions",
  162. metadata={"doc_id": "en_doc_1", "lang": "english"},
  163. ),
  164. Document(
  165. page_content="Lakehouse是现代数据架构的重要组成部分", metadata={"doc_id": "cn_doc_2", "lang": "chinese"}
  166. ),
  167. Document(
  168. page_content="Modern data architecture includes Lakehouse technology",
  169. metadata={"doc_id": "en_doc_2", "lang": "english"},
  170. ),
  171. ]
  172. embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]
  173. vector_store.create(texts=documents, embeddings=embeddings)
  174. # Test Chinese full-text search
  175. results = vector_store.search_by_full_text("Lakehouse", top_k=4)
  176. assert len(results) >= 2 # Should find at least documents with "Lakehouse"
  177. # Test English full-text search
  178. results = vector_store.search_by_full_text("solutions", top_k=2)
  179. assert len(results) >= 1 # Should find English documents with "solutions"
  180. # Test mixed search
  181. results = vector_store.search_by_full_text("数据架构", top_k=2)
  182. assert len(results) >= 1 # Should find Chinese documents with this phrase
  183. # Clean up
  184. vector_store.delete_by_metadata_field("lang", "chinese")
  185. vector_store.delete_by_metadata_field("lang", "english")