# -*- coding: utf-8 -*-
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Adapted, with only minor modifications, from:
# https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py

import json
from typing import Any

from rag.nlp import find_codec


class RAGFlowJsonParser:
    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
        super().__init__()
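        # Note (added comment): the serialized-size budget used for splitting
        # is twice the nominal max_chunk_size, and min_chunk_size falls back
        # to max_chunk_size - 200 (floored at 50) when not given.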
        self.max_chunk_size = max_chunk_size * 2
        self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)

    def __call__(self, binary):
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")
        if self.is_jsonl_format(txt):
            sections = self._parse_jsonl(txt)
        else:
            sections = self._parse_json(txt)
        return sections

    @staticmethod
    def _json_size(data: dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data, ensure_ascii=False))

    @staticmethod
    def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value
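        # Example (added comment): _set_nested_dict(d={}, path=["a", "b"], value=1)
        # mutates d into {"a": {"b": 1}}.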

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data
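        # Example (added comment): ["x", {"y": 1}] becomes {"0": "x", "1": {"y": 1}},
        # applied recursively at every nesting level.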

    def _json_split(
        self,
        data,
        current_path: list[str] | None,
        chunks: list[dict] | None,
    ) -> list[dict]:
        """
        Split json into maximum size dictionaries while preserving structure.
        """
        current_path = current_path or []
        chunks = chunks or [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size
                if size < remaining:
                    # The pair fits: add it to the current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})
                    # Recurse into the value that did not fit
                    self._json_split(value, new_path, chunks)
        else:
            # Handle a single leaf item
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks
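        # Behavior (added comment): sibling keys are packed greedily into
        # chunks[-1]; when a key/value pair would overflow the size budget,
        # a new chunk may be opened and the split recurses into that value,
        # so each chunk keeps the full key path from the root.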

    def split_json(
        self,
        json_data,
        convert_lists: bool = False,
    ) -> list[dict]:
        """Splits JSON into a list of JSON chunks"""
        if convert_lists:
            preprocessed_data = self._list_to_dict_preprocessing(json_data)
            chunks = self._json_split(preprocessed_data, None, None)
        else:
            chunks = self._json_split(json_data, None, None)
        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks

    def split_text(
        self,
        json_data: dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> list[str]:
        """Splits JSON into a list of JSON formatted strings"""
        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
        # Convert to string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]

    def _parse_json(self, content: str) -> list[str]:
        sections = []
        try:
            json_data = json.loads(content)
            chunks = self.split_json(json_data, True)
            sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
        except json.JSONDecodeError:
            pass
        return sections

    def _parse_jsonl(self, content: str) -> list[str]:
        lines = content.strip().splitlines()
        all_chunks = []
        for line in lines:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
                chunks = self.split_json(data, convert_lists=True)
                all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
            except json.JSONDecodeError:
                continue
        return all_chunks
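
    # Heuristic note (added comment): text is treated as JSONL only when it
    # does NOT parse as a single JSON document and at least `threshold` of the
    # first `sample_limit` non-empty lines parse as standalone JSON values.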
    def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
        lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
        if not lines:
            return False
        try:
            json.loads(txt)
            return False
        except json.JSONDecodeError:
            pass
        sample_limit = min(len(lines), sample_limit)
        sample_lines = lines[:sample_limit]
        valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
        if not valid_lines:
            return False
        return (valid_lines / len(sample_lines)) >= threshold

    def _is_valid_json(self, line: str) -> bool:
        try:
            json.loads(line)
            return True
        except json.JSONDecodeError:
            return False
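

# Minimal usage sketch (added; not part of the upstream module). The sample
# payload is hypothetical and only shows how the parser is invoked on raw
# bytes; encoding detection is handled inside __call__ via find_codec.
if __name__ == "__main__":
    parser = RAGFlowJsonParser(max_chunk_size=200)
    sample = {"title": "demo", "items": [{"id": i, "text": "x" * 40} for i in range(8)]}
    for section in parser(json.dumps(sample).encode("utf-8")):
        print(section)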