|
|
|
|
|
|
|
|
|
|
|
|
|
|
from docx import Document |
|
|
from docx import Document |
|
|
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError |
|
|
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError |
|
|
|
|
|
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship |
|
|
|
|
|
from docx.opc.oxml import parse_xml |
|
|
from markdown import markdown |
|
|
from markdown import markdown |
|
|
from PIL import Image |
|
|
from PIL import Image |
|
|
from tika import parser |
|
|
from tika import parser |
|
|
|
|
|
|
|
|
if not embed: |
|
|
if not embed: |
|
|
return None |
|
|
return None |
|
|
embed = embed[0] |
|
|
embed = embed[0] |
|
|
related_part = document.part.related_parts[embed] |
|
|
|
|
|
try: |
|
|
try: |
|
|
|
|
|
related_part = document.part.related_parts[embed] |
|
|
image_blob = related_part.image.blob |
|
|
image_blob = related_part.image.blob |
|
|
except UnrecognizedImageError: |
|
|
except UnrecognizedImageError: |
|
|
logging.info("Unrecognized image format. Skipping image.") |
|
|
logging.info("Unrecognized image format. Skipping image.") |
|
|
|
|
|
|
|
|
except UnicodeDecodeError: |
|
|
except UnicodeDecodeError: |
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.") |
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.") |
|
|
return None |
|
|
return None |
|
|
|
|
|
except Exception: |
|
|
|
|
|
logging.info("The recognized image stream appears to be corrupted. Skipping image.") |
|
|
|
|
|
return None |
|
|
try: |
|
|
try: |
|
|
image = Image.open(BytesIO(image_blob)).convert('RGB') |
|
|
image = Image.open(BytesIO(image_blob)).convert('RGB') |
|
|
return image |
|
|
return image |
|
|
|
|
|
|
|
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) |
|
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) |
|
|
return sections, tbls |
|
|
return sections, tbls |
|
|
|
|
|
|
|
|
|
|
|
def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Build a |_SerializedRelationships| collection from *rels_item_xml*.

    Each ``Relationship`` element of the parsed XML is wrapped in a
    |_SerializedRelationship| (constructed with *baseURI*) and appended to
    the collection, except for elements whose ``target_ref`` is exactly
    ``'../NULL'`` or ``'NULL'``, which are left out.

    An empty collection is returned when *rels_item_xml* is |None|.
    """
    collection = _SerializedRelationships()

    # Nothing to parse: hand back the empty collection as-is.
    if rels_item_xml is None:
        return collection

    root = parse_xml(rels_item_xml)
    for node in root.Relationship_lst:
        # Drop placeholder relationships that point at a "NULL" target.
        if node.target_ref not in ('../NULL', 'NULL'):
            collection._srels.append(_SerializedRelationship(baseURI, node))

    return collection
|
|
|
|
|
|
|
|
def chunk(filename, binary=None, from_page=0, to_page=100000, |
|
|
def chunk(filename, binary=None, from_page=0, to_page=100000, |
|
|
lang="Chinese", callback=None, **kwargs): |
|
|
lang="Chinese", callback=None, **kwargs): |
|
|
|
|
|
|
|
|
except Exception: |
|
|
except Exception: |
|
|
vision_model = None |
|
|
vision_model = None |
|
|
|
|
|
|
|
|
|
|
|
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 |
|
|
|
|
|
_SerializedRelationships.load_from_xml = load_from_xml_v2 |
|
|
sections, tables = Docx()(filename, binary) |
|
|
sections, tables = Docx()(filename, binary) |
|
|
|
|
|
|
|
|
if vision_model: |
|
|
if vision_model: |