@@ -2,6 +2,7 @@ import csv
 import io
 import json
 import logging
+import operator
 import os
 import tempfile
 from typing import cast
@@ -10,6 +11,8 @@ import docx
 import pandas as pd
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
+from docx.table import Table
+from docx.text.paragraph import Paragraph
 
 from configs import dify_config
 from core.file import File, FileTransferMethod, file_manager
@@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
         doc_file = io.BytesIO(file_content)
         doc = docx.Document(doc_file)
         text = []
-        # Process paragraphs
-        for paragraph in doc.paragraphs:
-            if paragraph.text.strip():
-                text.append(paragraph.text)
 
-        # Process tables
-        for table in doc.tables:
-            # Table header
-            try:
-                # table maybe cause errors so ignore it.
-                if len(table.rows) > 0 and table.rows[0].cells is not None:
+        # Keep track of paragraph and table positions
+        content_items: list[tuple[int, str, Table | Paragraph]] = []
+
+        # Process paragraphs and tables
+        for i, paragraph in enumerate(doc.paragraphs):
+            if paragraph.text.strip():
+                content_items.append((i, "paragraph", paragraph))
+
+        for i, table in enumerate(doc.tables):
+            content_items.append((i, "table", table))
+
+        # Sort content items based on their original position
+        content_items.sort(key=operator.itemgetter(0))
+
+        # Process sorted content
+        for _, item_type, item in content_items:
+            if item_type == "paragraph":
+                if isinstance(item, Table):
+                    continue
+                text.append(item.text)
+            elif item_type == "table":
+                # Process tables
+                if not isinstance(item, Table):
+                    continue
+                try:
                     # Check if any cell in the table has text
                     has_content = False
-                    for row in table.rows:
+                    for row in item.rows:
                         if any(cell.text.strip() for cell in row.cells):
                             has_content = True
                             break
 
                     if has_content:
-                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
-                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
-                        for row in table.rows[1:]:
-                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
+                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
+                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
+                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
+
+                        for row in item.rows[1:]:
+                            # Replace newlines with <br> in each cell
+                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
+                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
+
                         text.append(markdown_table)
-            except Exception as e:
-                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
-                continue
+                except Exception as e:
+                    logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
+                    continue
 
         return "\n".join(text)
 
     except Exception as e:
         raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
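
For reference, the reworked extraction does two things: it collects paragraphs and tables as (index, kind, item) tuples and orders them with operator.itemgetter(0), and it renders each non-empty table as a markdown table in which newlines inside cells are replaced with <br> so a multi-line cell cannot break the row layout. Below is a minimal standalone sketch of those two steps, with plain strings and nested lists standing in for python-docx paragraph and table objects; the sample data is hypothetical and only illustrates the shape of the output.

import operator

# Hypothetical stand-ins for python-docx objects: paragraph text as plain
# strings, tables as lists of rows, each row a list of cell strings.
paragraphs = ["Intro paragraph", "Closing paragraph"]
tables = [[["Name", "Notes"], ["Alice", "line one\nline two"]]]

# Collect (index, kind, item) tuples, mirroring content_items in the diff.
content_items = []
for i, paragraph in enumerate(paragraphs):
    if paragraph.strip():
        content_items.append((i, "paragraph", paragraph))
for i, table in enumerate(tables):
    content_items.append((i, "table", table))

# Sort by the enumeration index (the first tuple element).
content_items.sort(key=operator.itemgetter(0))

text = []
for _, item_type, item in content_items:
    if item_type == "paragraph":
        text.append(item)
    elif item_type == "table":
        # Header row, separator row, then body rows; newlines become <br>.
        cell_texts = [cell.replace("\n", "<br>") for cell in item[0]]
        markdown_table = f"| {' | '.join(cell_texts)} |\n"
        markdown_table += f"| {' | '.join(['---'] * len(item[0]))} |\n"
        for row in item[1:]:
            row_cells = [cell.replace("\n", "<br>") for cell in row]
            markdown_table += "| " + " | ".join(row_cells) + " |\n"
        text.append(markdown_table)

print("\n".join(text))
# Intro paragraph
# | Name | Notes |
# | --- | --- |
# | Alice | line one<br>line two |
#
# Closing paragraph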
|
|
|
|