| @@ -5,6 +5,7 @@ import json | |||
| import docx | |||
| import pandas as pd | |||
| import pypdfium2 | |||
| import yaml | |||
| from unstructured.partition.email import partition_email | |||
| from unstructured.partition.epub import partition_epub | |||
| from unstructured.partition.msg import partition_msg | |||
| @@ -101,6 +102,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str: | |||
| return _extract_text_from_msg(file_content) | |||
| case "application/json": | |||
| return _extract_text_from_json(file_content) | |||
| case "application/x-yaml" | "text/yaml": | |||
| return _extract_text_from_yaml(file_content) | |||
| case _: | |||
| raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}") | |||
| @@ -112,6 +115,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) | |||
| return _extract_text_from_plain_text(file_content) | |||
| case ".json": | |||
| return _extract_text_from_json(file_content) | |||
| case ".yaml" | ".yml": | |||
| return _extract_text_from_yaml(file_content) | |||
| case ".pdf": | |||
| return _extract_text_from_pdf(file_content) | |||
| case ".doc" | ".docx": | |||
| @@ -149,6 +154,15 @@ def _extract_text_from_json(file_content: bytes) -> str: | |||
| raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e | |||
def _extract_text_from_yaml(file_content: bytes) -> str:
    """Return the YAML payload re-serialized as text.

    The bytes are decoded as UTF-8, every document in the stream is parsed
    with the safe loader, and the parsed documents are dumped back to a
    single YAML string (non-ASCII characters kept as-is, mapping key order
    preserved).

    Raises:
        TextExtractionError: if the bytes are not valid UTF-8 or the
            content cannot be parsed/serialized as YAML.
    """
    try:
        decoded_text = file_content.decode("utf-8")
        # safe_load_all hands back a lazy iterator, so parsing actually
        # happens while dump_all consumes it; both calls stay inside the
        # try so parse errors are translated as well.
        documents = yaml.safe_load_all(decoded_text)
        extracted = yaml.dump_all(documents, allow_unicode=True, sort_keys=False)
    except (UnicodeDecodeError, yaml.YAMLError) as exc:
        raise TextExtractionError(f"Failed to decode or parse YAML file: {exc}") from exc
    return extracted
| def _extract_text_from_pdf(file_content: bytes) -> str: | |||
| try: | |||
| pdf_file = io.BytesIO(file_content) | |||