瀏覽代碼

feat: document extractor chardet encoding (#20269)

Signed-off-by: -LAN- <laipz8200@outlook.com>
tags/1.4.1
-LAN- 5 月之前
父節點
當前提交
9c9d3d7bd0
沒有連結到貢獻者的電子郵件帳戶。

+ 62
- 10
api/core/workflow/nodes/document_extractor/node.py 查看文件

 from collections.abc import Mapping, Sequence
 from typing import Any, cast

+import chardet
 import docx
 import pandas as pd
 import pypandoc # type: ignore


 def _extract_text_from_plain_text(file_content: bytes) -> str:
     try:
-        return file_content.decode("utf-8", "ignore")
-    except UnicodeDecodeError as e:
-        raise TextExtractionError("Failed to decode plain text file") from e
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        return file_content.decode(encoding, errors="ignore")
+    except (UnicodeDecodeError, LookupError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            return file_content.decode("utf-8", errors="ignore")
+        except UnicodeDecodeError:
+            raise TextExtractionError(f"Failed to decode plain text file: {e}") from e




 def _extract_text_from_json(file_content: bytes) -> str:
     try:
-        json_data = json.loads(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        json_data = json.loads(file_content.decode(encoding, errors="ignore"))
         return json.dumps(json_data, indent=2, ensure_ascii=False)
-    except (UnicodeDecodeError, json.JSONDecodeError) as e:
-        raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
+    except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
+            return json.dumps(json_data, indent=2, ensure_ascii=False)
+        except (UnicodeDecodeError, json.JSONDecodeError):
+            raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e




 def _extract_text_from_yaml(file_content: bytes) -> str:
     """Extract the content from yaml file"""
     try:
-        yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
         return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
-    except (UnicodeDecodeError, yaml.YAMLError) as e:
-        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
+    except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
+            return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
+        except (UnicodeDecodeError, yaml.YAMLError):
+            raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e




def _extract_text_from_pdf(file_content: bytes) -> str: def _extract_text_from_pdf(file_content: bytes) -> str:


 def _extract_text_from_csv(file_content: bytes) -> str:
     try:
-        csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        try:
+            csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
+        except (UnicodeDecodeError, LookupError):
+            # If decoding fails, try with utf-8 as last resort
+            csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
+
         csv_reader = csv.reader(csv_file)
         rows = list(csv_reader)



+ 1
- 1
api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py 查看文件

temp_file.write(non_utf8_content) temp_file.write(non_utf8_content)
temp_file.seek(0) temp_file.seek(0)
text = _extract_text_from_plain_text(temp_file.read()) text = _extract_text_from_plain_text(temp_file.read())
- assert text == "Hello, world."
+ assert text == "Hello, world©."




@patch("pypdfium2.PdfDocument") @patch("pypdfium2.PdfDocument")

Loading…
取消
儲存