Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

json_parser.py 4.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. # The following documents are mainly referenced, and only adaptation modifications have been made
  18. # from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py
  19. import json
  20. from typing import Any
  21. from rag.nlp import find_codec
  22. class RAGFlowJsonParser:
  23. def __init__(
  24. self, max_chunk_size: int = 2000, min_chunk_size: int | None = None
  25. ):
  26. super().__init__()
  27. self.max_chunk_size = max_chunk_size * 2
  28. self.min_chunk_size = (
  29. min_chunk_size
  30. if min_chunk_size is not None
  31. else max(max_chunk_size - 200, 50)
  32. )
  33. def __call__(self, binary):
  34. encoding = find_codec(binary)
  35. txt = binary.decode(encoding, errors="ignore")
  36. json_data = json.loads(txt)
  37. chunks = self.split_json(json_data, True)
  38. sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
  39. return sections
  40. @staticmethod
  41. def _json_size(data: dict) -> int:
  42. """Calculate the size of the serialized JSON object."""
  43. return len(json.dumps(data, ensure_ascii=False))
  44. @staticmethod
  45. def _set_nested_dict(d: dict, path: list[str], value: Any) -> None:
  46. """Set a value in a nested dictionary based on the given path."""
  47. for key in path[:-1]:
  48. d = d.setdefault(key, {})
  49. d[path[-1]] = value
  50. def _list_to_dict_preprocessing(self, data: Any) -> Any:
  51. if isinstance(data, dict):
  52. # Process each key-value pair in the dictionary
  53. return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
  54. elif isinstance(data, list):
  55. # Convert the list to a dictionary with index-based keys
  56. return {
  57. str(i): self._list_to_dict_preprocessing(item)
  58. for i, item in enumerate(data)
  59. }
  60. else:
  61. # Base case: the item is neither a dict nor a list, so return it unchanged
  62. return data
  63. def _json_split(
  64. self,
  65. data,
  66. current_path: list[str] | None,
  67. chunks: list[dict] | None,
  68. ) -> list[dict]:
  69. """
  70. Split json into maximum size dictionaries while preserving structure.
  71. """
  72. current_path = current_path or []
  73. chunks = chunks or [{}]
  74. if isinstance(data, dict):
  75. for key, value in data.items():
  76. new_path = current_path + [key]
  77. chunk_size = self._json_size(chunks[-1])
  78. size = self._json_size({key: value})
  79. remaining = self.max_chunk_size - chunk_size
  80. if size < remaining:
  81. # Add item to current chunk
  82. self._set_nested_dict(chunks[-1], new_path, value)
  83. else:
  84. if chunk_size >= self.min_chunk_size:
  85. # Chunk is big enough, start a new chunk
  86. chunks.append({})
  87. # Iterate
  88. self._json_split(value, new_path, chunks)
  89. else:
  90. # handle single item
  91. self._set_nested_dict(chunks[-1], current_path, data)
  92. return chunks
  93. def split_json(
  94. self,
  95. json_data,
  96. convert_lists: bool = False,
  97. ) -> list[dict]:
  98. """Splits JSON into a list of JSON chunks"""
  99. if convert_lists:
  100. preprocessed_data = self._list_to_dict_preprocessing(json_data)
  101. chunks = self._json_split(preprocessed_data, None, None)
  102. else:
  103. chunks = self._json_split(json_data, None, None)
  104. # Remove the last chunk if it's empty
  105. if not chunks[-1]:
  106. chunks.pop()
  107. return chunks
  108. def split_text(
  109. self,
  110. json_data: dict[str, Any],
  111. convert_lists: bool = False,
  112. ensure_ascii: bool = True,
  113. ) -> list[str]:
  114. """Splits JSON into a list of JSON formatted strings"""
  115. chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)
  116. # Convert to string
  117. return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]