@@ -22,24 +22,22 @@ import json
 from typing import Any
 
 from rag.nlp import find_codec
 
 
 class RAGFlowJsonParser:
-    def __init__(
-        self, max_chunk_size: int = 2000, min_chunk_size: int | None = None
-    ):
+    def __init__(self, max_chunk_size: int = 2000, min_chunk_size: int | None = None):
         super().__init__()
         self.max_chunk_size = max_chunk_size * 2
-        self.min_chunk_size = (
-            min_chunk_size
-            if min_chunk_size is not None
-            else max(max_chunk_size - 200, 50)
-        )
+        self.min_chunk_size = min_chunk_size if min_chunk_size is not None else max(max_chunk_size - 200, 50)
 
     def __call__(self, binary):
         encoding = find_codec(binary)
         txt = binary.decode(encoding, errors="ignore")
-        json_data = json.loads(txt)
-        chunks = self.split_json(json_data, True)
-        sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
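+        # Route to the JSONL or plain-JSON path based on the detected format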
+        if self.is_jsonl_format(txt):
+            sections = self._parse_jsonl(txt)
+        else:
+            sections = self._parse_json(txt)
         return sections
 
     @staticmethod
@@ -60,14 +58,11 @@ class RAGFlowJsonParser:
             return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
         elif isinstance(data, list):
             # Convert the list to a dictionary with index-based keys
-            return {
-                str(i): self._list_to_dict_preprocessing(item)
-                for i, item in enumerate(data)
-            }
+            return {str(i): self._list_to_dict_preprocessing(item) for i, item in enumerate(data)}
         else:
             # Base case: the item is neither a dict nor a list, so return it unchanged
             return data
 
     def _json_split(
         self,
         data,
@@ -131,3 +126,54 @@ class RAGFlowJsonParser:
 
         # Convert to string
         return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
+
+    def _parse_json(self, content: str) -> list[str]:
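+        # Parse the whole payload as one JSON document; invalid JSON yields no sections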
+        sections = []
+        try:
+            json_data = json.loads(content)
+            chunks = self.split_json(json_data, True)
+            sections = [json.dumps(line, ensure_ascii=False) for line in chunks if line]
+        except json.JSONDecodeError:
+            pass
+        return sections
+
+    def _parse_jsonl(self, content: str) -> list[str]:
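+        # Parse each non-empty line as an independent JSON document, skipping malformed lines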
+        lines = content.strip().splitlines()
+        all_chunks = []
+        for line in lines:
+            if not line.strip():
+                continue
+            try:
+                data = json.loads(line)
+                chunks = self.split_json(data, convert_lists=True)
+                all_chunks.extend(json.dumps(chunk, ensure_ascii=False) for chunk in chunks if chunk)
+            except json.JSONDecodeError:
+                continue
+        return all_chunks
+
+    def is_jsonl_format(self, txt: str, sample_limit: int = 10, threshold: float = 0.8) -> bool:
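+        # Heuristic: the text is JSONL when it does not parse as a single JSON
+        # document but most of the sampled lines parse individually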
+        lines = [line.strip() for line in txt.strip().splitlines() if line.strip()]
+        if not lines:
+            return False
+
+        try:
+            json.loads(txt)
+            return False
+        except json.JSONDecodeError:
+            pass
+
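+        # Sample the first few lines and count how many parse as standalone JSON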
+        sample_limit = min(len(lines), sample_limit)
+        sample_lines = lines[:sample_limit]
+        valid_lines = sum(1 for line in sample_lines if self._is_valid_json(line))
+
+        if not valid_lines:
+            return False
+
+        return (valid_lines / len(sample_lines)) >= threshold
+
+    def _is_valid_json(self, line: str) -> bool:
+        try:
+            json.loads(line)
+            return True
+        except json.JSONDecodeError:
+            return False
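
A minimal usage sketch of the new code paths (hypothetical payloads; assumes a RAGFlow environment where RAGFlowJsonParser and rag.nlp.find_codec are importable):

    parser = RAGFlowJsonParser(max_chunk_size=2000)

    # A whole-document JSON payload takes the _parse_json path
    print(parser(b'{"title": "doc", "items": [1, 2, 3]}'))

    # Four of five lines parse as standalone JSON (4/5 = 0.8 meets the default
    # threshold), so is_jsonl_format returns True and _parse_jsonl skips the
    # malformed line instead of failing the whole parse
    print(parser(b'{"a": 1}\n{"b": 2}\n{"c": 3}\nnot json\n{"d": 4}'))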