Browse Source

reopen PR for #14411 (#16148)

tags/1.1.1
cyflhn 7 months ago
parent
commit
1789437cc5
No account linked to committer's email address

+ 17
- 11
api/core/workflow/nodes/document_extractor/node.py View File

import io import io
import json import json
import logging import logging
import operator
import os import os
import tempfile import tempfile
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
import pandas as pd import pandas as pd
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
import yaml # type: ignore import yaml # type: ignore
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table from docx.table import Table
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph


raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e




def paser_docx_part(block, doc: Document, content_items, i):
    """Wrap one document body element and record it in ``content_items``.

    Args:
        block: A body element. Only ``CT_P`` and ``CT_Tbl`` instances are
            handled; any other element type is ignored.
        doc: The parent document passed to the ``Paragraph`` / ``Table``
            wrapper as its second argument.
        content_items: List mutated in place. A tuple of
            ``(i, "paragraph" | "table", wrapper)`` is appended for each
            handled element.
        i: Index stored as the first element of the appended tuple.

    Returns:
        None. The result is delivered through ``content_items``.
    """
    # NOTE(review): "paser" looks like a typo of "parse"; the name is kept
    # unchanged because the caller refers to it by this spelling.
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table element: nothing to record.
        return
    content_items.append(entry)


def _extract_text_from_docx(file_content: bytes) -> str: def _extract_text_from_docx(file_content: bytes) -> str:
""" """
Extract text from a DOCX file. Extract text from a DOCX file.
# Keep track of paragraph and table positions # Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = [] content_items: list[tuple[int, str, Table | Paragraph]] = []


# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))

for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))

# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
it = iter(doc.element.body)
part = next(it, None)
i = 0
while part is not None:
paser_docx_part(part, doc, content_items, i)
i = i + 1
part = next(it, None)


# Process sorted content # Process sorted content
for _, item_type, item in content_items: for _, item_type, item in content_items:

+ 7
- 1
api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py View File

from unittest.mock import Mock, patch from unittest.mock import Mock, patch


import pytest import pytest
from docx.oxml.text.paragraph import CT_P


from core.file import File, FileTransferMethod from core.file import File, FileTransferMethod
from core.variables import ArrayFileSegment from core.variables import ArrayFileSegment
mock_paragraph2 = Mock() mock_paragraph2 = Mock()
mock_paragraph2.text = "Paragraph 2" mock_paragraph2.text = "Paragraph 2"
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2] mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]

mock_ct_p1 = Mock(spec=CT_P)
mock_ct_p1.text = "Paragraph 1"
mock_ct_p2 = Mock(spec=CT_P)
mock_ct_p2.text = "Paragraph 2"
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
mock_document.return_value.element = mock_element
text = _extract_text_from_docx(b"PK\x03\x04") text = _extract_text_from_docx(b"PK\x03\x04")
assert text == "Paragraph 1\nParagraph 2" assert text == "Paragraph 1\nParagraph 2"



Loading…
Cancel
Save