|
|
|
|
|
|
|
|
|
|
|
|
|
|
from docx import Document |
|
|
from docx import Document |
|
|
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError |
|
|
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError |
|
|
from markdown import markdown |
|
|
|
|
|
|
|
|
from markdown import markdown |
|
|
from PIL import Image |
|
|
from PIL import Image |
|
|
from tika import parser |
|
|
from tika import parser |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Get the hierarchical title structure before the table""" |
|
|
"""Get the hierarchical title structure before the table""" |
|
|
import re |
|
|
import re |
|
|
from docx.text.paragraph import Paragraph |
|
|
from docx.text.paragraph import Paragraph |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
titles = [] |
|
|
titles = [] |
|
|
blocks = [] |
|
|
blocks = [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Get document name from filename parameter |
|
|
# Get document name from filename parameter |
|
|
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) |
|
|
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) |
|
|
if not doc_name: |
|
|
if not doc_name: |
|
|
doc_name = "Untitled Document" |
|
|
doc_name = "Untitled Document" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Collect all document blocks while maintaining document order |
|
|
# Collect all document blocks while maintaining document order |
|
|
try: |
|
|
try: |
|
|
# Iterate through all paragraphs and tables in document order |
|
|
# Iterate through all paragraphs and tables in document order |
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
logging.error(f"Error collecting blocks: {e}") |
|
|
logging.error(f"Error collecting blocks: {e}") |
|
|
return "" |
|
|
return "" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Find the target table position |
|
|
# Find the target table position |
|
|
target_table_pos = -1 |
|
|
target_table_pos = -1 |
|
|
table_count = 0 |
|
|
table_count = 0 |
|
|
|
|
|
|
|
|
target_table_pos = pos |
|
|
target_table_pos = pos |
|
|
break |
|
|
break |
|
|
table_count += 1 |
|
|
table_count += 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if target_table_pos == -1: |
|
|
if target_table_pos == -1: |
|
|
return "" # Target table not found |
|
|
return "" # Target table not found |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Find the nearest heading paragraph in reverse order |
|
|
# Find the nearest heading paragraph in reverse order |
|
|
nearest_title = None |
|
|
nearest_title = None |
|
|
for i in range(len(blocks)-1, -1, -1): |
|
|
for i in range(len(blocks)-1, -1, -1): |
|
|
block_type, pos, block = blocks[i] |
|
|
block_type, pos, block = blocks[i] |
|
|
if pos >= target_table_pos: # Skip blocks after the table |
|
|
if pos >= target_table_pos: # Skip blocks after the table |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if block_type != 'p': |
|
|
if block_type != 'p': |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): |
|
|
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): |
|
|
try: |
|
|
try: |
|
|
level_match = re.search(r"(\d+)", block.style.name) |
|
|
level_match = re.search(r"(\d+)", block.style.name) |
|
|
|
|
|
|
|
|
break |
|
|
break |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
logging.error(f"Error parsing heading level: {e}") |
|
|
logging.error(f"Error parsing heading level: {e}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if nearest_title: |
|
|
if nearest_title: |
|
|
# Add current title |
|
|
# Add current title |
|
|
titles.append(nearest_title) |
|
|
titles.append(nearest_title) |
|
|
current_level = nearest_title[0] |
|
|
current_level = nearest_title[0] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Find all parent headings, allowing cross-level search |
|
|
# Find all parent headings, allowing cross-level search |
|
|
while current_level > 1: |
|
|
while current_level > 1: |
|
|
found = False |
|
|
found = False |
|
|
|
|
|
|
|
|
block_type, pos, block = blocks[i] |
|
|
block_type, pos, block = blocks[i] |
|
|
if pos >= target_table_pos: # Skip blocks after the table |
|
|
if pos >= target_table_pos: # Skip blocks after the table |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if block_type != 'p': |
|
|
if block_type != 'p': |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): |
|
|
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): |
|
|
try: |
|
|
try: |
|
|
level_match = re.search(r"(\d+)", block.style.name) |
|
|
level_match = re.search(r"(\d+)", block.style.name) |
|
|
if level_match: |
|
|
if level_match: |
|
|
level = int(level_match.group(1)) |
|
|
level = int(level_match.group(1)) |
|
|
# Find any heading with a higher level |
|
|
# Find any heading with a higher level |
|
|
if level < current_level: |
|
|
|
|
|
|
|
|
if level < current_level: |
|
|
title_text = block.text.strip() |
|
|
title_text = block.text.strip() |
|
|
if title_text: # Avoid empty titles |
|
|
if title_text: # Avoid empty titles |
|
|
titles.append((level, title_text)) |
|
|
titles.append((level, title_text)) |
|
|
|
|
|
|
|
|
break |
|
|
break |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
logging.error(f"Error parsing parent heading: {e}") |
|
|
logging.error(f"Error parsing parent heading: {e}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if not found: # Break if no parent heading is found |
|
|
if not found: # Break if no parent heading is found |
|
|
break |
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sort by level (ascending, from highest to lowest) |
|
|
# Sort by level (ascending, from highest to lowest) |
|
|
titles.sort(key=lambda x: x[0]) |
|
|
titles.sort(key=lambda x: x[0]) |
|
|
# Organize titles (from highest to lowest) |
|
|
# Organize titles (from highest to lowest) |
|
|
hierarchy = [doc_name] + [t[1] for t in titles] |
|
|
hierarchy = [doc_name] + [t[1] for t in titles] |
|
|
return " > ".join(hierarchy) |
|
|
return " > ".join(hierarchy) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return "" |
|
|
return "" |
|
|
|
|
|
|
|
|
def __call__(self, filename, binary=None, from_page=0, to_page=100000): |
|
|
def __call__(self, filename, binary=None, from_page=0, to_page=100000): |
|
|
|
|
|
|
|
|
text = sections[0] |
|
|
text = sections[0] |
|
|
else: |
|
|
else: |
|
|
return [] |
|
|
return [] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup |
|
|
from bs4 import BeautifulSoup |
|
|
html_content = markdown(text) |
|
|
html_content = markdown(text) |
|
|
soup = BeautifulSoup(html_content, 'html.parser') |
|
|
soup = BeautifulSoup(html_content, 'html.parser') |
|
|
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')] |
|
|
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')] |
|
|
return html_images |
|
|
return html_images |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_pictures(self, text): |
|
|
def get_pictures(self, text): |
|
|
"""Download and open all images from markdown text.""" |
|
|
"""Download and open all images from markdown text.""" |
|
|
import requests |
|
|
import requests |
|
|
|
|
|
|
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
logging.error(f"Failed to download/open image from {url}: {e}") |
|
|
logging.error(f"Failed to download/open image from {url}: {e}") |
|
|
continue |
|
|
continue |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return images if images else None |
|
|
return images if images else None |
|
|
|
|
|
|
|
|
def __call__(self, filename, binary=None): |
|
|
|
|
|
|
|
|
def __call__(self, filename, binary=None, separate_tables=True): |
|
|
if binary: |
|
|
if binary: |
|
|
encoding = find_codec(binary) |
|
|
encoding = find_codec(binary) |
|
|
txt = binary.decode(encoding, errors="ignore") |
|
|
txt = binary.decode(encoding, errors="ignore") |
|
|
else: |
|
|
else: |
|
|
with open(filename, "r") as f: |
|
|
with open(filename, "r") as f: |
|
|
txt = f.read() |
|
|
txt = f.read() |
|
|
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') |
|
|
|
|
|
|
|
|
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) |
|
|
sections = [] |
|
|
sections = [] |
|
|
tbls = [] |
|
|
tbls = [] |
|
|
for sec in remainder.split("\n"): |
|
|
for sec in remainder.split("\n"): |
|
|
|
|
|
|
|
|
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): |
|
|
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): |
|
|
callback(0.1, "Start to parse.") |
|
|
callback(0.1, "Start to parse.") |
|
|
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) |
|
|
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) |
|
|
sections, tables = markdown_parser(filename, binary) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sections, tables = markdown_parser(filename, binary, separate_tables=False) |
|
|
|
|
|
|
|
|
# Process images for each section |
|
|
# Process images for each section |
|
|
section_images = [] |
|
|
section_images = [] |
|
|
for section_text, _ in sections: |
|
|
for section_text, _ in sections: |
|
|
|
|
|
|
|
|
section_images.append(combined_image) |
|
|
section_images.append(combined_image) |
|
|
else: |
|
|
else: |
|
|
section_images.append(None) |
|
|
section_images.append(None) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
res = tokenize_table(tables, doc, is_english) |
|
|
res = tokenize_table(tables, doc, is_english) |
|
|
callback(0.8, "Finish parsing.") |
|
|
callback(0.8, "Finish parsing.") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"delimiter", "\n!?。;!?")) |
|
|
"delimiter", "\n!?。;!?")) |
|
|
if kwargs.get("section_only", False): |
|
|
if kwargs.get("section_only", False): |
|
|
return chunks |
|
|
return chunks |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) |
|
|
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) |
|
|
else: |
|
|
else: |
|
|
chunks = naive_merge( |
|
|
chunks = naive_merge( |
|
|
|
|
|
|
|
|
return chunks |
|
|
return chunks |
|
|
|
|
|
|
|
|
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) |
|
|
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logging.info("naive_merge({}): {}".format(filename, timer() - st)) |
|
|
logging.info("naive_merge({}): {}".format(filename, timer() - st)) |
|
|
return res |
|
|
return res |
|
|
|
|
|
|