|
|
|
@@ -67,6 +67,111 @@ class Docx(DocxParser): |
|
|
|
line = re.sub(r"\u3000", " ", line).strip() |
|
|
|
return line |
|
|
|
|
|
|
|
def __get_nearest_title(self, table_index, filename):
    """Build a breadcrumb of the headings that enclose a top-level table.

    The document body is walked in order up to the ``table_index``-th
    top-level table. From there the paragraphs are scanned backwards: the
    closest non-empty "Heading N" paragraph (N <= 7) becomes the innermost
    title, and every earlier heading with a strictly smaller level is added
    as an ancestor until a level-1 heading is reached or the paragraphs run
    out.

    Args:
        table_index: Zero-based index of the table among the direct
            ``tbl`` children of the document body.
        filename: Name used as the first breadcrumb element; a trailing
            alphabetic extension (for example ``.docx``) is stripped.
            An empty or missing name becomes ``"Untitled Document"``.

    Returns:
        A string such as ``"report > Chapter 1 > Section 1.2"``, or ``""``
        when the table cannot be located, no heading precedes it, or the
        body cannot be traversed.
    """
    import re

    from docx.text.paragraph import Paragraph

    heading_pattern = re.compile(r"Heading\s*(\d+)", re.I)
    max_heading_level = 7  # Deeper "Heading N" styles are never used as the nearest title.

    # Derive the document name from the filename parameter.
    doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename or "")
    if not doc_name:
        doc_name = "Untitled Document"

    # Collect, in document order, the paragraphs that precede the target table.
    preceding_paragraphs = []
    tables_seen = 0
    table_found = False
    try:
        for block in self.doc._element.body:
            tag = block.tag
            # Skip nodes whose tag is not a string (e.g. XML comments in lxml)
            # instead of letting them abort the whole lookup.
            if not isinstance(tag, str):
                continue
            # Compare the exact local name rather than a suffix of the
            # namespaced tag, so only real paragraph/table elements match.
            local_name = tag.rsplit("}", 1)[-1]
            if local_name == "p":
                preceding_paragraphs.append(Paragraph(block, self.doc))
            elif local_name == "tbl":
                if tables_seen == table_index:
                    table_found = True
                    break  # Nothing after the table can be one of its titles.
                tables_seen += 1
    except Exception as e:
        # Best effort: a caption is optional, so fail soft with no title.
        logging.error(f"Error collecting blocks: {e}")
        return ""

    if not table_found:
        return ""  # Target table not found

    # Single backward pass: the first acceptable heading is the nearest
    # title; afterwards only headings with a strictly smaller level number
    # (i.e. higher in the outline) are accepted as ancestors.
    titles = []
    current_level = None
    for paragraph in reversed(preceding_paragraphs):
        try:
            style = paragraph.style
            style_name = style.name if style is not None else None
            if not style_name:
                continue
            match = heading_pattern.search(style_name)
            if match is None:
                continue
            # Use the digits that follow "Heading", not the first digit run
            # found anywhere in the style name.
            level = int(match.group(1))
            title_text = paragraph.text.strip()
        except Exception as e:
            stage = "heading level" if current_level is None else "parent heading"
            logging.error(f"Error parsing {stage}: {e}")
            continue

        if not title_text:  # Avoid empty titles
            continue
        if current_level is None:
            if level > max_heading_level:
                continue
        elif level >= current_level:
            continue

        titles.append(title_text)
        current_level = level
        if current_level <= 1:
            break  # A top-level heading has no parent to look for.

    if not titles:
        return ""

    # Titles were gathered innermost-first; present them outermost-first.
    titles.reverse()
    return " > ".join([doc_name] + titles)
|
|
|
|
|
|
|
def __call__(self, filename, binary=None, from_page=0, to_page=100000): |
|
|
|
self.doc = Document( |
|
|
|
filename) if not binary else Document(BytesIO(binary)) |
|
|
|
@@ -108,8 +213,11 @@ class Docx(DocxParser): |
|
|
|
new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines] |
|
|
|
|
|
|
|
tbls = [] |
|
|
|
for tb in self.doc.tables: |
|
|
|
for i, tb in enumerate(self.doc.tables): |
|
|
|
title = self.__get_nearest_title(i, filename) |
|
|
|
html = "<table>" |
|
|
|
if title: |
|
|
|
html += f"<caption>Table Location: {title}</caption>" |
|
|
|
for r in tb.rows: |
|
|
|
html += "<tr>" |
|
|
|
i = 0 |