|
|
|
|
|
|
|
|
import io |
|
|
import io |
|
|
import json |
|
|
import json |
|
|
import logging |
|
|
import logging |
|
|
import operator |
|
|
|
|
|
import os |
|
|
import os |
|
|
import tempfile |
|
|
import tempfile |
|
|
from collections.abc import Mapping, Sequence |
|
|
from collections.abc import Mapping, Sequence |
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import pandas as pd |
|
|
import pypdfium2 # type: ignore |
|
|
import pypdfium2 # type: ignore |
|
|
import yaml # type: ignore |
|
|
import yaml # type: ignore |
|
|
|
|
|
from docx.document import Document |
|
|
|
|
|
from docx.oxml.table import CT_Tbl |
|
|
|
|
|
from docx.oxml.text.paragraph import CT_P |
|
|
from docx.table import Table |
|
|
from docx.table import Table |
|
|
from docx.text.paragraph import Paragraph |
|
|
from docx.text.paragraph import Paragraph |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e |
|
|
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def paser_docx_part(block, doc: Document, content_items, i):
    """
    Wrap one body-level element and record it in ``content_items``.

    :param block: element to classify; only ``CT_P`` and ``CT_Tbl``
        instances are recorded, anything else is ignored
    :param doc: document passed as the second argument when constructing
        the ``Paragraph`` / ``Table`` wrapper
    :param content_items: list that receives an
        ``(i, "paragraph" | "table", wrapper)`` tuple
    :param i: position value stored as the first tuple element
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table element: nothing to record.
        return
    content_items.append(entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_docx(file_content: bytes) -> str: |
|
|
def _extract_text_from_docx(file_content: bytes) -> str: |
|
|
""" |
|
|
""" |
|
|
Extract text from a DOCX file. |
|
|
Extract text from a DOCX file. |
|
|
|
|
|
|
|
|
# Keep track of paragraph and table positions |
|
|
# Keep track of paragraph and table positions |
|
|
content_items: list[tuple[int, str, Table | Paragraph]] = [] |
|
|
content_items: list[tuple[int, str, Table | Paragraph]] = [] |
|
|
|
|
|
|
|
|
# Process paragraphs and tables |
|
|
|
|
|
for i, paragraph in enumerate(doc.paragraphs): |
|
|
|
|
|
if paragraph.text.strip(): |
|
|
|
|
|
content_items.append((i, "paragraph", paragraph)) |
|
|
|
|
|
|
|
|
|
|
|
for i, table in enumerate(doc.tables): |
|
|
|
|
|
content_items.append((i, "table", table)) |
|
|
|
|
|
|
|
|
|
|
|
# Sort content items based on their original position |
|
|
|
|
|
content_items.sort(key=operator.itemgetter(0)) |
|
|
|
|
|
|
|
|
it = iter(doc.element.body) |
|
|
|
|
|
part = next(it, None) |
|
|
|
|
|
i = 0 |
|
|
|
|
|
while part is not None: |
|
|
|
|
|
paser_docx_part(part, doc, content_items, i) |
|
|
|
|
|
i = i + 1 |
|
|
|
|
|
part = next(it, None) |
|
|
|
|
|
|
|
|
# Process sorted content |
|
|
# Process sorted content |
|
|
for _, item_type, item in content_items: |
|
|
for _, item_type, item in content_items: |