
t_chunk.py

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from ragflow_sdk import RAGFlow
from common import HOST_ADDRESS
from time import sleep
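

# The commented-out polling loops below never observe parsing progress: they
# re-check `doc.progress` on the local objects returned by upload_documents(),
# which are not refreshed after async_parse_documents() is called. A minimal
# sketch of a working wait, assuming the SDK's DataSet.list_documents(id=...)
# returns the live server-side state (verify against the installed SDK
# version):
def wait_for_parsing(ds, document_ids, timeout=100):
    """Poll the dataset until every document reports progress == 1."""
    for _ in range(timeout):
        # Re-fetch each document so progress reflects the server state.
        fresh = [ds.list_documents(id=doc_id)[0] for doc_id in document_ids]
        if all(doc.progress == 1 for doc in fresh):
            return
        sleep(1)
    raise Exception("Run time ERROR: Document parsing did not complete in time.")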


def test_parse_document_with_txt(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_parse_document")
    name = 'ragflow_test.txt'
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    docs = ds.upload_documents([{"display_name": name, "blob": blob}])
    doc = docs[0]
    ds.async_parse_documents(document_ids=[doc.id])
    '''
    for n in range(100):
        if doc.progress == 1:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Document parsing did not complete in time.")
    '''


def test_parse_and_cancel_document(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_parse_and_cancel_document")
    name = 'ragflow_test.txt'
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    docs = ds.upload_documents([{"display_name": name, "blob": blob}])
    doc = docs[0]
    ds.async_parse_documents(document_ids=[doc.id])
    sleep(1)
    if 0 < doc.progress < 1:
        ds.async_cancel_parse_documents(document_ids=[doc.id])
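
# Note: the progress check above reads the stale local `doc`, so the cancel
# branch is unlikely to fire. A sketch of the same flow with a server-side
# refresh, again assuming DataSet.list_documents(id=...) is available:
#
#     fresh = ds.list_documents(id=doc.id)[0]
#     if 0 < fresh.progress < 1:
#         ds.async_cancel_parse_documents(document_ids=[doc.id])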


def test_bulk_parse_documents(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_bulk_parse_and_cancel_documents")
    with open("test_data/ragflow.txt", "rb") as file:
        blob = file.read()
    documents = [
        {'display_name': 'test1.txt', 'blob': blob},
        {'display_name': 'test2.txt', 'blob': blob},
        {'display_name': 'test3.txt', 'blob': blob}
    ]
    docs = ds.upload_documents(documents)
    ids = [doc.id for doc in docs]
    ds.async_parse_documents(ids)
    '''
    for n in range(100):
        all_completed = all(doc.progress == 1 for doc in docs)
        if all_completed:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Bulk document parsing did not complete in time.")
    '''


def test_list_chunks_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_list_chunks_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_list_chunks_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    ids = [doc.id for doc in docs]
    ds.async_parse_documents(ids)
    '''
    for n in range(100):
        all_completed = all(doc.progress == 1 for doc in docs)
        if all_completed:
            break
        sleep(1)
    else:
        raise Exception("Run time ERROR: Chunk document parsing did not complete in time.")
    '''
    doc = docs[0]
    doc.list_chunks()
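
# The call above discards its result. A stricter variant could inspect the
# returned chunks; the keyword/paging arguments are assumptions based on the
# RAGFlow list-chunks API and should be checked against the installed SDK:
#
#     chunks = doc.list_chunks(keywords="ragflow", page=1, page_size=12)
#     assert isinstance(chunks, list)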


def test_add_chunk_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_add_chunk_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_add_chunk_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    doc.add_chunk(content="This is a chunk addition test")


def test_delete_chunk_with_success(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_delete_chunk_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_delete_chunk_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # Give the new chunk time to become searchable before deleting it.
    sleep(5)
    doc.delete_chunks([chunk.id])


def test_update_chunk_content(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_update_chunk_content_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_update_chunk_content_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # For Elasticsearch, the chunk is not searchable for a short time (~2s).
    sleep(3)
    chunk.update({"content": "This is an updated content"})


def test_update_chunk_available(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="test_update_chunk_available_with_success")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_update_chunk_available_with_success.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    chunk = doc.add_chunk(content="This is a chunk addition test")
    # For Elasticsearch, the chunk is not searchable for a short time (~2s).
    sleep(3)
    chunk.update({"available": 0})


def test_retrieve_chunks(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="retrieval")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    '''
    # chunk_size = 1024 * 1024
    # chunks = [blob[i:i + chunk_size] for i in range(0, len(blob), chunk_size)]
    documents = [
        {'display_name': f'chunk_{i}.txt', 'blob': chunk} for i, chunk in enumerate(chunks)
    ]
    '''
    documents = [{"display_name": "test_retrieve_chunks.txt", "blob": blob}]
    docs = ds.upload_documents(documents)
    doc = docs[0]
    doc.add_chunk(content="This is a chunk addition test")
    rag.retrieve(dataset_ids=[ds.id], document_ids=[doc.id])
    rag.delete_datasets(ids=[ds.id])
    # test different parameters for the retrieval
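

# A sketch of the parameterized retrieval test the note above points at. The
# retrieve() keyword arguments below follow the RAGFlow Python API reference,
# but treat them as assumptions and verify against the installed SDK version.
def test_retrieve_chunks_with_parameters(get_api_key_fixture):
    API_KEY = get_api_key_fixture
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="retrieval_with_parameters")
    with open("test_data/ragflow_test.txt", "rb") as file:
        blob = file.read()
    docs = ds.upload_documents([{"display_name": "test_retrieve_chunks_with_parameters.txt", "blob": blob}])
    doc = docs[0]
    doc.add_chunk(content="This is a chunk addition test")
    rag.retrieve(
        question="chunk addition",
        dataset_ids=[ds.id],
        document_ids=[doc.id],
        page=1,
        page_size=10,
        similarity_threshold=0.2,      # minimum similarity for a hit
        vector_similarity_weight=0.3,  # weight of vector vs. keyword score
        top_k=128,
    )
    rag.delete_datasets(ids=[ds.id])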