Browse Source

Fix: context loss caused by separating markdown tables from original text (#8844)

### What problem does this PR solve?

Fix context loss caused by separating markdown tables from original
text. #6871, #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.20.0
Yongteng Lei 3 months ago
parent
commit
51a8604dcb
No account linked to committer's email address
2 changed files with 73 additions and 44 deletions
  1. 46
    17
      deepdoc/parser/markdown_parser.py
  2. 27
    27
      rag/app/naive.py

+ 46
- 17
deepdoc/parser/markdown_parser.py View File



import re import re


from markdown import markdown

class RAGFlowMarkdownParser:
    """Markdown parser that locates tables (markdown and embedded HTML) in text.

    ``extract_tables_and_remainder`` either strips the tables out of the text
    (legacy behavior) or, when ``separate_tables=False``, replaces each table
    in place with its rendered HTML so the surrounding context is preserved.
    """

    def __init__(self, chunk_token_num=128):
        # Target token budget per chunk; callers may pass strings, so coerce.
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Split *markdown_text* into remaining text and a list of raw tables.

        Args:
            markdown_text: Raw markdown input.
            separate_tables: When True, tables are removed from the text and
                only returned in ``tables``. When False, markdown tables are
                replaced in place by rendered HTML (HTML tables are kept
                verbatim), so no context is lost around them.

        Returns:
            Tuple ``(working_text, tables)`` where ``working_text`` is the
            input with tables removed/replaced and ``tables`` is a list of
            the raw table strings in the order they were found.
        """
        tables = []
        working_text = markdown_text

        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            # Walk every match of `pattern` over the current text, record the
            # raw table, then either drop it (separate_tables=True) or splice
            # in its rendered form so surrounding context stays intact.
            new_text = ""
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                table_list.append(raw_table)
                if separate_tables:
                    # Remove the table but keep a paragraph break so the
                    # surrounding text does not run together.
                    new_text += working_text[last_end:match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML (or the raw text when the
                    # match is already HTML and needs no rendering).
                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
                last_end = match.end()
            new_text += working_text[last_end:]
            return new_text

        if "|" in markdown_text:  # cheap pre-check: skip regex work when no pipes
            # Standard (bordered) Markdown table: header row, separator row,
            # then one or more data rows, each delimited by pipes.
            border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
                ''', re.VERBOSE)
            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)

            # Borderless Markdown table: rows contain pipes but do not start
            # with one; still requires a separator row of dashes/colons.
            no_border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)

        if "<table>" in working_text.lower():  # cheap pre-check before HTML regex
            # HTML table extraction - handle possible html/body wrapper tags.
            # NOTE(review): the pattern body was lost in transit and has been
            # reconstructed here — confirm against the upstream source.
            html_table_pattern = re.compile(
                r'''
                (?:<html[^>]*>\s*)?(?:<body[^>]*>\s*)?<table[^>]*>.*?</table>(?:\s*</body>)?(?:\s*</html>)?
                ''',
                re.VERBOSE | re.DOTALL | re.IGNORECASE
            )
            # HTML tables are already HTML, so keep them verbatim when not
            # separating (render=False) — this reuses the shared helper
            # instead of a duplicated inline replacement loop.
            working_text = replace_tables_with_rendered_html(html_table_pattern, tables)

        return working_text, tables

+ 27
- 27
rag/app/naive.py View File



from docx import Document from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from markdown import markdown
from markdown import markdown
from PIL import Image from PIL import Image
from tika import parser from tika import parser


"""Get the hierarchical title structure before the table""" """Get the hierarchical title structure before the table"""
import re import re
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
titles = [] titles = []
blocks = [] blocks = []
# Get document name from filename parameter # Get document name from filename parameter
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
if not doc_name: if not doc_name:
doc_name = "Untitled Document" doc_name = "Untitled Document"
# Collect all document blocks while maintaining document order # Collect all document blocks while maintaining document order
try: try:
# Iterate through all paragraphs and tables in document order # Iterate through all paragraphs and tables in document order
except Exception as e: except Exception as e:
logging.error(f"Error collecting blocks: {e}") logging.error(f"Error collecting blocks: {e}")
return "" return ""
# Find the target table position # Find the target table position
target_table_pos = -1 target_table_pos = -1
table_count = 0 table_count = 0
target_table_pos = pos target_table_pos = pos
break break
table_count += 1 table_count += 1
if target_table_pos == -1: if target_table_pos == -1:
return "" # Target table not found return "" # Target table not found
# Find the nearest heading paragraph in reverse order # Find the nearest heading paragraph in reverse order
nearest_title = None nearest_title = None
for i in range(len(blocks)-1, -1, -1): for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i] block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table if pos >= target_table_pos: # Skip blocks after the table
continue continue
if block_type != 'p': if block_type != 'p':
continue continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try: try:
level_match = re.search(r"(\d+)", block.style.name) level_match = re.search(r"(\d+)", block.style.name)
break break
except Exception as e: except Exception as e:
logging.error(f"Error parsing heading level: {e}") logging.error(f"Error parsing heading level: {e}")
if nearest_title: if nearest_title:
# Add current title # Add current title
titles.append(nearest_title) titles.append(nearest_title)
current_level = nearest_title[0] current_level = nearest_title[0]
# Find all parent headings, allowing cross-level search # Find all parent headings, allowing cross-level search
while current_level > 1: while current_level > 1:
found = False found = False
block_type, pos, block = blocks[i] block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table if pos >= target_table_pos: # Skip blocks after the table
continue continue
if block_type != 'p': if block_type != 'p':
continue continue
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try: try:
level_match = re.search(r"(\d+)", block.style.name) level_match = re.search(r"(\d+)", block.style.name)
if level_match: if level_match:
level = int(level_match.group(1)) level = int(level_match.group(1))
# Find any heading with a higher level # Find any heading with a higher level
if level < current_level:
if level < current_level:
title_text = block.text.strip() title_text = block.text.strip()
if title_text: # Avoid empty titles if title_text: # Avoid empty titles
titles.append((level, title_text)) titles.append((level, title_text))
break break
except Exception as e: except Exception as e:
logging.error(f"Error parsing parent heading: {e}") logging.error(f"Error parsing parent heading: {e}")
if not found: # Break if no parent heading is found if not found: # Break if no parent heading is found
break break
# Sort by level (ascending, from highest to lowest) # Sort by level (ascending, from highest to lowest)
titles.sort(key=lambda x: x[0]) titles.sort(key=lambda x: x[0])
# Organize titles (from highest to lowest) # Organize titles (from highest to lowest)
hierarchy = [doc_name] + [t[1] for t in titles] hierarchy = [doc_name] + [t[1] for t in titles]
return " > ".join(hierarchy) return " > ".join(hierarchy)
return "" return ""


def __call__(self, filename, binary=None, from_page=0, to_page=100000): def __call__(self, filename, binary=None, from_page=0, to_page=100000):
text = sections[0] text = sections[0]
else: else:
return [] return []
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
html_content = markdown(text) html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, 'html.parser')
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')] html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
return html_images return html_images
def get_pictures(self, text): def get_pictures(self, text):
"""Download and open all images from markdown text.""" """Download and open all images from markdown text."""
import requests import requests
except Exception as e: except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}") logging.error(f"Failed to download/open image from {url}: {e}")
continue continue
return images if images else None return images if images else None


def __call__(self, filename, binary=None):
def __call__(self, filename, binary=None, separate_tables=True):
if binary: if binary:
encoding = find_codec(binary) encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore") txt = binary.decode(encoding, errors="ignore")
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
txt = f.read() txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = [] sections = []
tbls = [] tbls = []
for sec in remainder.split("\n"): for sec in remainder.split("\n"):
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary)
sections, tables = markdown_parser(filename, binary, separate_tables=False)
# Process images for each section # Process images for each section
section_images = [] section_images = []
for section_text, _ in sections: for section_text, _ in sections:
section_images.append(combined_image) section_images.append(combined_image)
else: else:
section_images.append(None) section_images.append(None)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")


"delimiter", "\n!?。;!?")) "delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False): if kwargs.get("section_only", False):
return chunks return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else: else:
chunks = naive_merge( chunks = naive_merge(
return chunks return chunks


res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st)) logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res return res



Loading…
Cancel
Save