|
|
|
@@ -4,8 +4,8 @@ import json |
|
|
|
|
|
|
|
import docx |
|
|
|
import pandas as pd |
|
|
|
import pypdfium2 |
|
|
|
import yaml |
|
|
|
import pypdfium2 # type: ignore |
|
|
|
import yaml # type: ignore |
|
|
|
from unstructured.partition.api import partition_via_api |
|
|
|
from unstructured.partition.email import partition_email |
|
|
|
from unstructured.partition.epub import partition_epub |
|
|
|
@@ -237,15 +237,17 @@ def _extract_text_from_csv(file_content: bytes) -> str: |
|
|
|
|
|
|
|
def _extract_text_from_excel(file_content: bytes) -> str:
    """Extract text from an Excel workbook as Markdown tables, one per sheet.

    Every sheet in the workbook is parsed with pandas, rows that are entirely
    NaN are dropped, and the remainder is rendered as a Markdown table. The
    tables are concatenated in sheet order, each followed by a blank line.

    Conversion is best-effort per sheet: a sheet that cannot be parsed or
    rendered is skipped (with a logged warning) so the remaining sheets are
    still returned.

    Args:
        file_content: Raw bytes of the Excel file.

    Returns:
        The concatenated Markdown tables, or an empty string if no sheet
        could be converted.

    Raises:
        TextExtractionError: If the workbook itself cannot be opened or any
            other unexpected failure occurs outside per-sheet conversion.
    """
    # Imported locally so this function does not depend on a module-level
    # logger being defined elsewhere in the file.
    import logging

    logger = logging.getLogger(__name__)

    try:
        tables = []
        # The context manager closes the underlying reader once all sheets
        # have been processed.
        with pd.ExcelFile(io.BytesIO(file_content)) as excel_file:
            for sheet_name in excel_file.sheet_names:
                try:
                    df = excel_file.parse(sheet_name=sheet_name)
                    # Drop rows where all elements are NaN
                    df = df.dropna(how="all")
                    tables.append(df.to_markdown(index=False))
                except Exception:
                    # Deliberate best-effort: one bad sheet must not discard
                    # the rest, but the failure is recorded instead of being
                    # silently swallowed.
                    logger.warning(
                        "Skipping Excel sheet %r: failed to convert to Markdown",
                        sheet_name,
                        exc_info=True,
                    )
        # Each table is followed by a blank line so adjacent tables stay
        # separated in the rendered Markdown.
        return "".join(f"{table}\n\n" for table in tables)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e