Przeglądaj źródła

fix: doc can not extract tables (#11879)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Co-authored-by: akinobu-i <akinobu-i@users.noreply.github.com>
tags/0.14.2
yihong 10 miesięcy temu
rodzic
commit
ac635c70cd
No account linked to committer's email address

+ 37
- 1
api/core/workflow/nodes/document_extractor/node.py Wyświetl plik

import csv import csv
import io import io
import json import json
import logging
import os import os
import tempfile import tempfile


from .entities import DocumentExtractorNodeData from .entities import DocumentExtractorNodeData
from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError from .exc import DocumentExtractorError, FileDownloadError, TextExtractionError, UnsupportedFileTypeError


logger = logging.getLogger(__name__)



class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]): class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
""" """




def _markdown_cell(text: str) -> str:
    """Make cell text safe to embed in a single Markdown table cell."""
    # A raw "|" would start a new column and a raw line break would end the row,
    # so escape the former and turn the latter into an inline <br>.
    return text.replace("|", "\\|").replace("\r\n", "<br>").replace("\n", "<br>")


def _docx_table_to_markdown(table) -> str:
    """Render one DOCX table as a Markdown table; return "" when it holds no text."""
    # Read every cell exactly once instead of re-walking table.rows / row.cells.
    grid = [[cell.text for cell in row.cells] for row in table.rows]

    # Skip tables without rows or whose cells are all blank.
    if not any(cell_text.strip() for row in grid for cell_text in row):
        return ""

    header, *body = grid
    lines = [
        "| " + " | ".join(_markdown_cell(cell_text) for cell_text in header) + " |",
        "| " + " | ".join(["---"] * len(header)) + " |",
    ]
    lines.extend("| " + " | ".join(_markdown_cell(cell_text) for cell_text in row) + " |" for row in body)
    # Every table line, including the last one, is newline-terminated.
    return "\n".join(lines) + "\n"


def _extract_text_from_doc(file_content: bytes) -> str:
    """
    Extract text from a DOC/DOCX file.

    Only paragraphs and tables are supported for now; add more element types if needed.
    Non-blank paragraphs are emitted first, followed by each non-empty table rendered
    as a Markdown table whose first row is used as the header. Note that tables are
    therefore appended after all paragraphs rather than interleaved with them.

    :param file_content: raw bytes of the document
    :return: the extracted pieces joined with newlines
    :raises TextExtractionError: if the document cannot be opened or processed
    """
    try:
        doc = docx.Document(io.BytesIO(file_content))

        # Process paragraphs, dropping blank ones.
        text = [paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()]

        # Process tables.
        for table in doc.tables:
            try:
                markdown_table = _docx_table_to_markdown(table)
            except Exception as e:
                # A single table may cause errors; skip it and keep the rest of the document.
                logger.warning("Failed to extract table from DOC/DOCX: %s", e)
                continue
            if markdown_table:
                text.append(markdown_table)

        return "\n".join(text)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e



Ładowanie…
Anuluj
Zapisz