您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

document.py 6.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. import time
  2. from .base import Base
  3. from .chunk import Chunk
  4. class Document(Base):
  5. def __init__(self, rag, res_dict):
  6. self.id = ""
  7. self.name = ""
  8. self.thumbnail = None
  9. self.knowledgebase_id = None
  10. self.parser_method = ""
  11. self.parser_config = {"pages": [[1, 1000000]]}
  12. self.source_type = "local"
  13. self.type = ""
  14. self.created_by = ""
  15. self.size = 0
  16. self.token_count = 0
  17. self.chunk_count = 0
  18. self.progress = 0.0
  19. self.progress_msg = ""
  20. self.process_begin_at = None
  21. self.process_duration = 0.0
  22. self.run = "0"
  23. self.status = "1"
  24. for k in list(res_dict.keys()):
  25. if k not in self.__dict__:
  26. res_dict.pop(k)
  27. super().__init__(rag, res_dict)
  28. def update(self,update_message:dict) -> bool:
  29. """
  30. Save the document details to the server.
  31. """
  32. res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
  33. res = res.json()
  34. if res.get("code") != 0:
  35. raise Exception(res["message"])
  36. def delete(self) -> bool:
  37. """
  38. Delete the document from the server.
  39. """
  40. res = self.rm('/doc/delete',
  41. {"document_id": self.id})
  42. res = res.json()
  43. if res.get("retmsg") == "success":
  44. return True
  45. raise Exception(res["retmsg"])
  46. def download(self) -> bytes:
  47. """
  48. Download the document content from the server using the Flask API.
  49. :return: The downloaded document content in bytes.
  50. """
  51. # Construct the URL for the API request using the document ID and knowledge base ID
  52. res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")
  53. # Check the response status code to ensure the request was successful
  54. if res.status_code == 200:
  55. # Return the document content as bytes
  56. return res.content
  57. else:
  58. # Handle the error and raise an exception
  59. raise Exception(
  60. f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
  61. )
  62. def async_parse(self):
  63. """
  64. Initiate document parsing asynchronously without waiting for completion.
  65. """
  66. try:
  67. # Construct request data including document ID and run status (assuming 1 means to run)
  68. data = {"document_ids": [self.id], "run": 1}
  69. # Send a POST request to the specified parsing status endpoint to start parsing
  70. res = self.post(f'/doc/run', data)
  71. # Check the server response status code
  72. if res.status_code != 200:
  73. raise Exception(f"Failed to start async parsing: {res.text}")
  74. print("Async parsing started successfully.")
  75. except Exception as e:
  76. # Catch and handle exceptions
  77. print(f"Error occurred during async parsing: {str(e)}")
  78. raise
  79. import time
  80. def join(self, interval=5, timeout=3600):
  81. """
  82. Wait for the asynchronous parsing to complete and yield parsing progress periodically.
  83. :param interval: The time interval (in seconds) for progress reports.
  84. :param timeout: The timeout (in seconds) for the parsing operation.
  85. :return: An iterator yielding parsing progress and messages.
  86. """
  87. start_time = time.time()
  88. while time.time() - start_time < timeout:
  89. # Check the parsing status
  90. res = self.get(f'/doc/{self.id}/status', {"document_ids": [self.id]})
  91. res_data = res.json()
  92. data = res_data.get("data", [])
  93. # Retrieve progress and status message
  94. progress = data.get("progress", 0)
  95. progress_msg = data.get("status", "")
  96. yield progress, progress_msg # Yield progress and message
  97. if progress == 100: # Parsing completed
  98. break
  99. time.sleep(interval)
  100. def cancel(self):
  101. """
  102. Cancel the parsing task for the document.
  103. """
  104. try:
  105. # Construct request data, including document ID and action to cancel (assuming 2 means cancel)
  106. data = {"document_ids": [self.id], "run": 2}
  107. # Send a POST request to the specified parsing status endpoint to cancel parsing
  108. res = self.post(f'/doc/run', data)
  109. # Check the server response status code
  110. if res.status_code != 200:
  111. print("Failed to cancel parsing. Server response:", res.text)
  112. else:
  113. print("Parsing cancelled successfully.")
  114. except Exception as e:
  115. print(f"Error occurred during async parsing cancellation: {str(e)}")
  116. raise
  117. def list_chunks(self, page=1, offset=0, limit=12,size=30, keywords="", available_int=None):
  118. """
  119. List all chunks associated with this document by calling the external API.
  120. Args:
  121. page (int): The page number to retrieve (default 1).
  122. size (int): The number of chunks per page (default 30).
  123. keywords (str): Keywords for searching specific chunks (default "").
  124. available_int (int): Filter for available chunks (optional).
  125. Returns:
  126. list: A list of chunks returned from the API.
  127. """
  128. data = {
  129. "document_id": self.id,
  130. "page": page,
  131. "size": size,
  132. "keywords": keywords,
  133. "offset":offset,
  134. "limit":limit
  135. }
  136. if available_int is not None:
  137. data["available_int"] = available_int
  138. res = self.post(f'/doc/chunk/list', data)
  139. if res.status_code == 200:
  140. res_data = res.json()
  141. if res_data.get("retmsg") == "success":
  142. chunks=[]
  143. for chunk_data in res_data["data"].get("chunks", []):
  144. chunk=Chunk(self.rag,chunk_data)
  145. chunks.append(chunk)
  146. return chunks
  147. else:
  148. raise Exception(f"Error fetching chunks: {res_data.get('retmsg')}")
  149. else:
  150. raise Exception(f"API request failed with status code {res.status_code}")
  151. def add_chunk(self, content: str):
  152. res = self.post('/doc/chunk/create', {"document_id": self.id, "content":content})
  153. if res.status_code == 200:
  154. res_data = res.json().get("data")
  155. chunk_data = res_data.get("chunk")
  156. return Chunk(self.rag,chunk_data)
  157. else:
  158. raise Exception(f"Failed to add chunk: {res.status_code} {res.text}")