Browse Source

Fix newline characters in tables during document parsing (#12112)

Co-authored-by: hisir <admin@qq.com>
tags/0.15.0
hisir 9 months ago
parent
commit
41f39bf3fc
No account linked to committer's email address
1 changed file with 42 additions and 18 deletions
  1. 42
    18
      api/core/workflow/nodes/document_extractor/node.py

+ 42
- 18
api/core/workflow/nodes/document_extractor/node.py View File

import io import io
import json import json
import logging import logging
import operator
import os import os
import tempfile import tempfile
from typing import cast from typing import cast
import pandas as pd import pandas as pd
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
import yaml # type: ignore import yaml # type: ignore
from docx.table import Table
from docx.text.paragraph import Paragraph


from configs import dify_config from configs import dify_config
from core.file import File, FileTransferMethod, file_manager from core.file import File, FileTransferMethod, file_manager
doc_file = io.BytesIO(file_content) doc_file = io.BytesIO(file_content)
doc = docx.Document(doc_file) doc = docx.Document(doc_file)
text = [] text = []
# Process paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text.append(paragraph.text)


# Process tables
for table in doc.tables:
# Table header
try:
# A table may raise errors while being read; if so, log a warning and skip it.
if len(table.rows) > 0 and table.rows[0].cells is not None:
# Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = []

# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))

for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))

# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))

# Process sorted content
for _, item_type, item in content_items:
if item_type == "paragraph":
if isinstance(item, Table):
continue
text.append(item.text)
elif item_type == "table":
# Process tables
if not isinstance(item, Table):
continue
try:
# Check if any cell in the table has text # Check if any cell in the table has text
has_content = False has_content = False
for row in table.rows:
for row in item.rows:
if any(cell.text.strip() for cell in row.cells): if any(cell.text.strip() for cell in row.cells):
has_content = True has_content = True
break break


if has_content: if has_content:
markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
for row in table.rows[1:]:
markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
markdown_table = f"| {' | '.join(cell_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"

for row in item.rows[1:]:
# Replace newlines with <br> in each cell
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += "| " + " | ".join(row_cells) + " |\n"

text.append(markdown_table) text.append(markdown_table)
except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
continue
except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
continue


return "\n".join(text) return "\n".join(text)

except Exception as e: except Exception as e:
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e



Loading…
Cancel
Save