# Integration tests for the Clickzetta vector store backend.
  1. import os
  2. import pytest
  3. from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
  4. from core.rag.models.document import Document
  5. from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
  6. class TestClickzettaVector(AbstractVectorTest):
  7. """
  8. Test cases for Clickzetta vector database integration.
  9. """
  10. @pytest.fixture
  11. def vector_store(self):
  12. """Create a Clickzetta vector store instance for testing."""
  13. # Skip test if Clickzetta credentials are not configured
  14. if not os.getenv("CLICKZETTA_USERNAME"):
  15. pytest.skip("CLICKZETTA_USERNAME is not configured")
  16. if not os.getenv("CLICKZETTA_PASSWORD"):
  17. pytest.skip("CLICKZETTA_PASSWORD is not configured")
  18. if not os.getenv("CLICKZETTA_INSTANCE"):
  19. pytest.skip("CLICKZETTA_INSTANCE is not configured")
  20. config = ClickzettaConfig(
  21. username=os.getenv("CLICKZETTA_USERNAME", ""),
  22. password=os.getenv("CLICKZETTA_PASSWORD", ""),
  23. instance=os.getenv("CLICKZETTA_INSTANCE", ""),
  24. service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
  25. workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
  26. vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
  27. schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
  28. batch_size=10, # Small batch size for testing
  29. enable_inverted_index=True,
  30. analyzer_type="chinese",
  31. analyzer_mode="smart",
  32. vector_distance_function="cosine_distance",
  33. )
  34. with setup_mock_redis():
  35. vector = ClickzettaVector(collection_name="test_collection_" + str(os.getpid()), config=config)
  36. yield vector
  37. # Cleanup: delete the test collection
  38. try:
  39. vector.delete()
  40. except Exception:
  41. pass
  42. def test_clickzetta_vector_basic_operations(self, vector_store):
  43. """Test basic CRUD operations on Clickzetta vector store."""
  44. # Prepare test data
  45. texts = [
  46. "这是第一个测试文档,包含一些中文内容。",
  47. "This is the second test document with English content.",
  48. "第三个文档混合了English和中文内容。",
  49. ]
  50. embeddings = [
  51. [0.1, 0.2, 0.3, 0.4],
  52. [0.5, 0.6, 0.7, 0.8],
  53. [0.9, 1.0, 1.1, 1.2],
  54. ]
  55. documents = [
  56. Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
  57. for i, text in enumerate(texts)
  58. ]
  59. # Test create (initial insert)
  60. vector_store.create(texts=documents, embeddings=embeddings)
  61. # Test text_exists
  62. assert vector_store.text_exists("doc_0")
  63. assert not vector_store.text_exists("doc_999")
  64. # Test search_by_vector
  65. query_vector = [0.1, 0.2, 0.3, 0.4]
  66. results = vector_store.search_by_vector(query_vector, top_k=2)
  67. assert len(results) > 0
  68. assert results[0].page_content == texts[0] # Should match the first document
  69. # Test search_by_full_text (Chinese)
  70. results = vector_store.search_by_full_text("中文", top_k=3)
  71. assert len(results) >= 2 # Should find documents with Chinese content
  72. # Test search_by_full_text (English)
  73. results = vector_store.search_by_full_text("English", top_k=3)
  74. assert len(results) >= 2 # Should find documents with English content
  75. # Test delete_by_ids
  76. vector_store.delete_by_ids(["doc_0"])
  77. assert not vector_store.text_exists("doc_0")
  78. assert vector_store.text_exists("doc_1")
  79. # Test delete_by_metadata_field
  80. vector_store.delete_by_metadata_field("source", "test")
  81. assert not vector_store.text_exists("doc_1")
  82. assert not vector_store.text_exists("doc_2")
  83. def test_clickzetta_vector_advanced_search(self, vector_store):
  84. """Test advanced search features of Clickzetta vector store."""
  85. # Prepare test data with more complex metadata
  86. documents = []
  87. embeddings = []
  88. for i in range(10):
  89. doc = Document(
  90. page_content=f"Document {i}: " + get_example_text(),
  91. metadata={
  92. "doc_id": f"adv_doc_{i}",
  93. "category": "technical" if i % 2 == 0 else "general",
  94. "document_id": f"doc_{i // 3}", # Group documents
  95. "importance": i,
  96. },
  97. )
  98. documents.append(doc)
  99. # Create varied embeddings
  100. embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])
  101. vector_store.create(texts=documents, embeddings=embeddings)
  102. # Test vector search with document filter
  103. query_vector = [0.5, 1.0, 1.5, 2.0]
  104. results = vector_store.search_by_vector(query_vector, top_k=5, document_ids_filter=["doc_0", "doc_1"])
  105. assert len(results) > 0
  106. # All results should belong to doc_0 or doc_1 groups
  107. for result in results:
  108. assert result.metadata["document_id"] in ["doc_0", "doc_1"]
  109. # Test score threshold
  110. results = vector_store.search_by_vector(query_vector, top_k=10, score_threshold=0.5)
  111. # Check that all results have a score above threshold
  112. for result in results:
  113. assert result.metadata.get("score", 0) >= 0.5
  114. def test_clickzetta_batch_operations(self, vector_store):
  115. """Test batch insertion operations."""
  116. # Prepare large batch of documents
  117. batch_size = 25
  118. documents = []
  119. embeddings = []
  120. for i in range(batch_size):
  121. doc = Document(
  122. page_content=f"Batch document {i}: This is a test document for batch processing.",
  123. metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"},
  124. )
  125. documents.append(doc)
  126. embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])
  127. # Test batch insert
  128. vector_store.add_texts(documents=documents, embeddings=embeddings)
  129. # Verify all documents were inserted
  130. for i in range(batch_size):
  131. assert vector_store.text_exists(f"batch_doc_{i}")
  132. # Clean up
  133. vector_store.delete_by_metadata_field("batch", "test_batch")
  134. def test_clickzetta_edge_cases(self, vector_store):
  135. """Test edge cases and error handling."""
  136. # Test empty operations
  137. vector_store.create(texts=[], embeddings=[])
  138. vector_store.add_texts(documents=[], embeddings=[])
  139. vector_store.delete_by_ids([])
  140. # Test special characters in content
  141. special_doc = Document(
  142. page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
  143. metadata={"doc_id": "special_doc", "test": "edge_case"},
  144. )
  145. embeddings = [[0.1, 0.2, 0.3, 0.4]]
  146. vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
  147. assert vector_store.text_exists("special_doc")
  148. # Test search with special characters
  149. results = vector_store.search_by_full_text("quotes", top_k=1)
  150. if results: # Full-text search might not be available
  151. assert len(results) > 0
  152. # Clean up
  153. vector_store.delete_by_ids(["special_doc"])
  154. def test_clickzetta_full_text_search_modes(self, vector_store):
  155. """Test different full-text search capabilities."""
  156. # Prepare documents with various language content
  157. documents = [
  158. Document(
  159. page_content="云器科技提供强大的Lakehouse解决方案", metadata={"doc_id": "cn_doc_1", "lang": "chinese"}
  160. ),
  161. Document(
  162. page_content="Clickzetta provides powerful Lakehouse solutions",
  163. metadata={"doc_id": "en_doc_1", "lang": "english"},
  164. ),
  165. Document(
  166. page_content="Lakehouse是现代数据架构的重要组成部分", metadata={"doc_id": "cn_doc_2", "lang": "chinese"}
  167. ),
  168. Document(
  169. page_content="Modern data architecture includes Lakehouse technology",
  170. metadata={"doc_id": "en_doc_2", "lang": "english"},
  171. ),
  172. ]
  173. embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]
  174. vector_store.create(texts=documents, embeddings=embeddings)
  175. # Test Chinese full-text search
  176. results = vector_store.search_by_full_text("Lakehouse", top_k=4)
  177. assert len(results) >= 2 # Should find at least documents with "Lakehouse"
  178. # Test English full-text search
  179. results = vector_store.search_by_full_text("solutions", top_k=2)
  180. assert len(results) >= 1 # Should find English documents with "solutions"
  181. # Test mixed search
  182. results = vector_store.search_by_full_text("数据架构", top_k=2)
  183. assert len(results) >= 1 # Should find Chinese documents with this phrase
  184. # Clean up
  185. vector_store.delete_by_metadata_field("lang", "chinese")
  186. vector_store.delete_by_metadata_field("lang", "english")