| 
														 | 
														 | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														from collections.abc import Mapping, Sequence | 
														 | 
														 | 
														from collections.abc import Mapping, Sequence | 
													
													
												
													
														 | 
														 | 
														from typing import Any, cast | 
														 | 
														 | 
														from typing import Any, cast | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														import chardet | 
													
													
												
													
														 | 
														 | 
														import docx | 
														 | 
														 | 
														import docx | 
													
													
												
													
														 | 
														 | 
														import pandas as pd | 
														 | 
														 | 
														import pandas as pd | 
													
													
												
													
														 | 
														 | 
														import pypandoc  # type: ignore | 
														 | 
														 | 
														import pypandoc  # type: ignore | 
													
													
												
											
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														def _extract_text_from_plain_text(file_content: bytes) -> str: | 
														 | 
														 | 
														def _extract_text_from_plain_text(file_content: bytes) -> str: | 
													
													
												
													
														 | 
														 | 
														    try: | 
														 | 
														 | 
														    try: | 
													
													
												
													
														 | 
														 | 
														        return file_content.decode("utf-8", "ignore") | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														    except UnicodeDecodeError as e: | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														        raise TextExtractionError("Failed to decode plain text file") from e | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Detect encoding using chardet | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        result = chardet.detect(file_content) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        encoding = result["encoding"] | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Fallback to utf-8 if detection fails | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        if not encoding: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            encoding = "utf-8" | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        return file_content.decode(encoding, errors="ignore") | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														    except (UnicodeDecodeError, LookupError) as e: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # If decoding fails, try with utf-8 as last resort | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        try: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            return file_content.decode("utf-8", errors="ignore") | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        except UnicodeDecodeError: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            raise TextExtractionError(f"Failed to decode plain text file: {e}") from e | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														def _extract_text_from_json(file_content: bytes) -> str: | 
														 | 
														 | 
														def _extract_text_from_json(file_content: bytes) -> str: | 
													
													
												
													
														 | 
														 | 
														    try: | 
														 | 
														 | 
														    try: | 
													
													
												
													
														 | 
														 | 
														        json_data = json.loads(file_content.decode("utf-8", "ignore")) | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Detect encoding using chardet | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        result = chardet.detect(file_content) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        encoding = result["encoding"] | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Fallback to utf-8 if detection fails | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        if not encoding: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            encoding = "utf-8" | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        json_data = json.loads(file_content.decode(encoding, errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														        return json.dumps(json_data, indent=2, ensure_ascii=False) | 
														 | 
														 | 
														        return json.dumps(json_data, indent=2, ensure_ascii=False) | 
													
													
												
													
														 | 
														 | 
														    except (UnicodeDecodeError, json.JSONDecodeError) as e: | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														        raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														    except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # If decoding fails, try with utf-8 as last resort | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        try: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            json_data = json.loads(file_content.decode("utf-8", errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            return json.dumps(json_data, indent=2, ensure_ascii=False) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        except (UnicodeDecodeError, json.JSONDecodeError): | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														def _extract_text_from_yaml(file_content: bytes) -> str: | 
														 | 
														 | 
														def _extract_text_from_yaml(file_content: bytes) -> str: | 
													
													
												
													
														 | 
														 | 
														    """Extract the content from yaml file""" | 
														 | 
														 | 
														    """Extract the content from yaml file""" | 
													
													
												
													
														 | 
														 | 
														    try: | 
														 | 
														 | 
														    try: | 
													
													
												
													
														 | 
														 | 
														        yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore")) | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Detect encoding using chardet | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        result = chardet.detect(file_content) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        encoding = result["encoding"] | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Fallback to utf-8 if detection fails | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        if not encoding: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            encoding = "utf-8" | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														        return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) | 
														 | 
														 | 
														        return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) | 
													
													
												
													
														 | 
														 | 
														    except (UnicodeDecodeError, yaml.YAMLError) as e: | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														    except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # If decoding fails, try with utf-8 as last resort | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        try: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        except (UnicodeDecodeError, yaml.YAMLError): | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														def _extract_text_from_pdf(file_content: bytes) -> str: | 
														 | 
														 | 
														def _extract_text_from_pdf(file_content: bytes) -> str: | 
													
													
												
											
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														def _extract_text_from_csv(file_content: bytes) -> str: | 
														 | 
														 | 
														def _extract_text_from_csv(file_content: bytes) -> str: | 
													
													
												
													
														 | 
														 | 
														    try: | 
														 | 
														 | 
														    try: | 
													
													
												
													
														 | 
														 | 
														        csv_file = io.StringIO(file_content.decode("utf-8", "ignore")) | 
														 | 
														 | 
														 | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Detect encoding using chardet | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        result = chardet.detect(file_content) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        encoding = result["encoding"] | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        # Fallback to utf-8 if detection fails | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        if not encoding: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            encoding = "utf-8" | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        try: | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            csv_file = io.StringIO(file_content.decode(encoding, errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														        except (UnicodeDecodeError, LookupError): | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            # If decoding fails, try with utf-8 as last resort | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														            csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore")) | 
													
													
												
													
														 | 
														 | 
														 | 
														 | 
														 | 
														
  | 
													
													
												
													
														 | 
														 | 
														        csv_reader = csv.reader(csv_file) | 
														 | 
														 | 
														        csv_reader = csv.reader(csv_file) | 
													
													
												
													
														 | 
														 | 
														        rows = list(csv_reader) | 
														 | 
														 | 
														        rows = list(csv_reader) | 
													
													
												
													
														 | 
														 | 
														
  | 
														 | 
														 | 
														
  |