Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

test_clickzetta.py 9.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. import os
  2. import pytest
  3. from core.rag.datasource.vdb.clickzetta.clickzetta_vector import ClickzettaConfig, ClickzettaVector
  4. from core.rag.models.document import Document
  5. from tests.integration_tests.vdb.test_vector_store import AbstractVectorTest, get_example_text, setup_mock_redis
  6. class TestClickzettaVector(AbstractVectorTest):
  7. """
  8. Test cases for Clickzetta vector database integration.
  9. """
  10. @pytest.fixture
  11. def vector_store(self):
  12. """Create a Clickzetta vector store instance for testing."""
  13. # Skip test if Clickzetta credentials are not configured
  14. if not os.getenv("CLICKZETTA_USERNAME"):
  15. pytest.skip("CLICKZETTA_USERNAME is not configured")
  16. if not os.getenv("CLICKZETTA_PASSWORD"):
  17. pytest.skip("CLICKZETTA_PASSWORD is not configured")
  18. if not os.getenv("CLICKZETTA_INSTANCE"):
  19. pytest.skip("CLICKZETTA_INSTANCE is not configured")
  20. config = ClickzettaConfig(
  21. username=os.getenv("CLICKZETTA_USERNAME", ""),
  22. password=os.getenv("CLICKZETTA_PASSWORD", ""),
  23. instance=os.getenv("CLICKZETTA_INSTANCE", ""),
  24. service=os.getenv("CLICKZETTA_SERVICE", "api.clickzetta.com"),
  25. workspace=os.getenv("CLICKZETTA_WORKSPACE", "quick_start"),
  26. vcluster=os.getenv("CLICKZETTA_VCLUSTER", "default_ap"),
  27. schema=os.getenv("CLICKZETTA_SCHEMA", "dify_test"),
  28. batch_size=10, # Small batch size for testing
  29. enable_inverted_index=True,
  30. analyzer_type="chinese",
  31. analyzer_mode="smart",
  32. vector_distance_function="cosine_distance",
  33. )
  34. with setup_mock_redis():
  35. vector = ClickzettaVector(
  36. collection_name="test_collection_" + str(os.getpid()),
  37. config=config
  38. )
  39. yield vector
  40. # Cleanup: delete the test collection
  41. try:
  42. vector.delete()
  43. except Exception:
  44. pass
  45. def test_clickzetta_vector_basic_operations(self, vector_store):
  46. """Test basic CRUD operations on Clickzetta vector store."""
  47. # Prepare test data
  48. texts = [
  49. "这是第一个测试文档,包含一些中文内容。",
  50. "This is the second test document with English content.",
  51. "第三个文档混合了English和中文内容。",
  52. ]
  53. embeddings = [
  54. [0.1, 0.2, 0.3, 0.4],
  55. [0.5, 0.6, 0.7, 0.8],
  56. [0.9, 1.0, 1.1, 1.2],
  57. ]
  58. documents = [
  59. Document(page_content=text, metadata={"doc_id": f"doc_{i}", "source": "test"})
  60. for i, text in enumerate(texts)
  61. ]
  62. # Test create (initial insert)
  63. vector_store.create(texts=documents, embeddings=embeddings)
  64. # Test text_exists
  65. assert vector_store.text_exists("doc_0")
  66. assert not vector_store.text_exists("doc_999")
  67. # Test search_by_vector
  68. query_vector = [0.1, 0.2, 0.3, 0.4]
  69. results = vector_store.search_by_vector(query_vector, top_k=2)
  70. assert len(results) > 0
  71. assert results[0].page_content == texts[0] # Should match the first document
  72. # Test search_by_full_text (Chinese)
  73. results = vector_store.search_by_full_text("中文", top_k=3)
  74. assert len(results) >= 2 # Should find documents with Chinese content
  75. # Test search_by_full_text (English)
  76. results = vector_store.search_by_full_text("English", top_k=3)
  77. assert len(results) >= 2 # Should find documents with English content
  78. # Test delete_by_ids
  79. vector_store.delete_by_ids(["doc_0"])
  80. assert not vector_store.text_exists("doc_0")
  81. assert vector_store.text_exists("doc_1")
  82. # Test delete_by_metadata_field
  83. vector_store.delete_by_metadata_field("source", "test")
  84. assert not vector_store.text_exists("doc_1")
  85. assert not vector_store.text_exists("doc_2")
  86. def test_clickzetta_vector_advanced_search(self, vector_store):
  87. """Test advanced search features of Clickzetta vector store."""
  88. # Prepare test data with more complex metadata
  89. documents = []
  90. embeddings = []
  91. for i in range(10):
  92. doc = Document(
  93. page_content=f"Document {i}: " + get_example_text(),
  94. metadata={
  95. "doc_id": f"adv_doc_{i}",
  96. "category": "technical" if i % 2 == 0 else "general",
  97. "document_id": f"doc_{i // 3}", # Group documents
  98. "importance": i,
  99. }
  100. )
  101. documents.append(doc)
  102. # Create varied embeddings
  103. embeddings.append([0.1 * i, 0.2 * i, 0.3 * i, 0.4 * i])
  104. vector_store.create(texts=documents, embeddings=embeddings)
  105. # Test vector search with document filter
  106. query_vector = [0.5, 1.0, 1.5, 2.0]
  107. results = vector_store.search_by_vector(
  108. query_vector,
  109. top_k=5,
  110. document_ids_filter=["doc_0", "doc_1"]
  111. )
  112. assert len(results) > 0
  113. # All results should belong to doc_0 or doc_1 groups
  114. for result in results:
  115. assert result.metadata["document_id"] in ["doc_0", "doc_1"]
  116. # Test score threshold
  117. results = vector_store.search_by_vector(
  118. query_vector,
  119. top_k=10,
  120. score_threshold=0.5
  121. )
  122. # Check that all results have a score above threshold
  123. for result in results:
  124. assert result.metadata.get("score", 0) >= 0.5
  125. def test_clickzetta_batch_operations(self, vector_store):
  126. """Test batch insertion operations."""
  127. # Prepare large batch of documents
  128. batch_size = 25
  129. documents = []
  130. embeddings = []
  131. for i in range(batch_size):
  132. doc = Document(
  133. page_content=f"Batch document {i}: This is a test document for batch processing.",
  134. metadata={"doc_id": f"batch_doc_{i}", "batch": "test_batch"}
  135. )
  136. documents.append(doc)
  137. embeddings.append([0.1 * (i % 10), 0.2 * (i % 10), 0.3 * (i % 10), 0.4 * (i % 10)])
  138. # Test batch insert
  139. vector_store.add_texts(documents=documents, embeddings=embeddings)
  140. # Verify all documents were inserted
  141. for i in range(batch_size):
  142. assert vector_store.text_exists(f"batch_doc_{i}")
  143. # Clean up
  144. vector_store.delete_by_metadata_field("batch", "test_batch")
  145. def test_clickzetta_edge_cases(self, vector_store):
  146. """Test edge cases and error handling."""
  147. # Test empty operations
  148. vector_store.create(texts=[], embeddings=[])
  149. vector_store.add_texts(documents=[], embeddings=[])
  150. vector_store.delete_by_ids([])
  151. # Test special characters in content
  152. special_doc = Document(
  153. page_content="Special chars: 'quotes', \"double\", \\backslash, \n newline",
  154. metadata={"doc_id": "special_doc", "test": "edge_case"}
  155. )
  156. embeddings = [[0.1, 0.2, 0.3, 0.4]]
  157. vector_store.add_texts(documents=[special_doc], embeddings=embeddings)
  158. assert vector_store.text_exists("special_doc")
  159. # Test search with special characters
  160. results = vector_store.search_by_full_text("quotes", top_k=1)
  161. if results: # Full-text search might not be available
  162. assert len(results) > 0
  163. # Clean up
  164. vector_store.delete_by_ids(["special_doc"])
  165. def test_clickzetta_full_text_search_modes(self, vector_store):
  166. """Test different full-text search capabilities."""
  167. # Prepare documents with various language content
  168. documents = [
  169. Document(
  170. page_content="云器科技提供强大的Lakehouse解决方案",
  171. metadata={"doc_id": "cn_doc_1", "lang": "chinese"}
  172. ),
  173. Document(
  174. page_content="Clickzetta provides powerful Lakehouse solutions",
  175. metadata={"doc_id": "en_doc_1", "lang": "english"}
  176. ),
  177. Document(
  178. page_content="Lakehouse是现代数据架构的重要组成部分",
  179. metadata={"doc_id": "cn_doc_2", "lang": "chinese"}
  180. ),
  181. Document(
  182. page_content="Modern data architecture includes Lakehouse technology",
  183. metadata={"doc_id": "en_doc_2", "lang": "english"}
  184. ),
  185. ]
  186. embeddings = [[0.1, 0.2, 0.3, 0.4] for _ in documents]
  187. vector_store.create(texts=documents, embeddings=embeddings)
  188. # Test Chinese full-text search
  189. results = vector_store.search_by_full_text("Lakehouse", top_k=4)
  190. assert len(results) >= 2 # Should find at least documents with "Lakehouse"
  191. # Test English full-text search
  192. results = vector_store.search_by_full_text("solutions", top_k=2)
  193. assert len(results) >= 1 # Should find English documents with "solutions"
  194. # Test mixed search
  195. results = vector_store.search_by_full_text("数据架构", top_k=2)
  196. assert len(results) >= 1 # Should find Chinese documents with this phrase
  197. # Clean up
  198. vector_store.delete_by_metadata_field("lang", "chinese")
  199. vector_store.delete_by_metadata_field("lang", "english")