### What problem does this PR solve?

Fix context loss caused by separating Markdown tables from the original text. #6871, #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

3 months ago · 51a8604dcb
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
 import re
 from markdown import markdown
 class RAGFlowMarkdownParser:
    """Markdown helper that locates tables in Markdown text.

    NOTE(review): this listing appears to be a diff view whose +/- markers
    were lost, so several statements show up twice in a row (what looks like
    the pre-change line followed by its post-change replacement). The
    comments below flag those pairs; confirm against the real file.
    """
    def __init__(self, chunk_token_num=128):
        # Coerced to int on construction; not read by any other code visible here.
        self.chunk_token_num = int(chunk_token_num)
    # NOTE(review): the next two lines look like the old and the new signature
    # of the same method; the second one adds the `separate_tables` flag.
    def extract_tables_and_remainder(self, markdown_text):
    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Find bordered Markdown, borderless Markdown and HTML tables in the text.

        Args:
            markdown_text: The text to scan.
            separate_tables: When true, each matched table is cut out of the
                text and two newlines are left in its place. When false, a
                matched Markdown table is replaced in place by its HTML
                rendering, and a matched HTML table is kept in place.

        Returns:
            A ``(text, tables)`` tuple. ``tables`` collects the raw matched
            table strings. NOTE(review): two ``return`` statements are visible
            at the end (``remainder`` vs. ``working_text``); presumably only
            the ``working_text`` one belongs to the post-change code.
        """
        tables = []
        # `remainder` is only touched by the findall/sub statements further down,
        # which look like pre-change lines; `working_text` is what the helper
        # functions below read and rebuild.
        remainder = markdown_text
        working_text = markdown_text
        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            # Walks every match of `pattern` in `working_text` (read from the
            # enclosing scope), records the raw match in `table_list`, and
            # builds a new string with the match either dropped or replaced.
            new_text = ""
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                table_list.append(raw_table)
                if separate_tables:
                    # Skip this match (i.e., remove it)
                    new_text += working_text[last_end:match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML
                    # With render=False the raw Markdown is kept instead; no
                    # visible caller passes render=False.
                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
                last_end = match.end()
            # Append whatever follows the last match.
            new_text += working_text[last_end:]
            return new_text
        if "|" in markdown_text: # for optimize performance
            # Standard Markdown table
            # NOTE(review): inside the pattern below, the first three lines and
            # the following three lines are identical apart from trailing
            # whitespace -- presumably a whitespace-only old/new pair from the
            # diff. Taken literally, the pattern would require the header and
            # delimiter rows twice; confirm against the real file.
            border_table_pattern = re.compile(
                r'''
                (?:\n|^)                     
                (?:\|.*?\|.*?\|.*?\n)        
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) 
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
            ''', re.VERBOSE)
            # NOTE(review): the findall/extend/sub trio below looks like the
            # pre-change extraction; the helper call after it also appends each
            # match to `tables`, so keeping both would record tables twice.
            border_tables = border_table_pattern.findall(markdown_text)
            tables.extend(border_tables)
            remainder = border_table_pattern.sub('', remainder)
            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
            # Borderless Markdown table
            # NOTE(review): the leading `(?:\n|^)` line appears twice in the
            # pattern below (differing only in trailing whitespace) -- likely
            # the same old/new diff pairing as above.
            no_border_table_pattern = re.compile(
                r'''
                (?:\n|^)                 
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
            # NOTE(review): same pattern as for bordered tables -- the three
            # `remainder` statements look like pre-change lines, followed by
            # the helper call that works on `working_text`.
            no_border_tables = no_border_table_pattern.findall(remainder)
            tables.extend(no_border_tables)
            remainder = no_border_table_pattern.sub('', remainder)
            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
        # NOTE(review): two consecutive `if` headers -- presumably the old
        # (`remainder`) and new (`working_text`) version of the same guard.
        if "<table>" in remainder.lower(): # for optimize performance
        if "<table>" in working_text.lower(): # for optimize performance
            #HTML table extraction - handle possible html/body wrapper tags
            # NOTE(review): the pattern text is empty in this listing; it was
            # presumably lost when the page was extracted. Confirm against the
            # real file before relying on this block.
            html_table_pattern = re.compile(
            r'''
            ''',
            re.VERBOSE | re.DOTALL | re.IGNORECASE
            )
            # NOTE(review): findall/extend/sub on `remainder` again looks like
            # the pre-change code path.
            html_tables = html_table_pattern.findall(remainder)
            tables.extend(html_tables)
            remainder = html_table_pattern.sub('', remainder)
            def replace_html_tables():
                # Same walk as replace_tables_with_rendered_html, but it writes
                # the result back to `working_text` via `nonlocal` and never
                # renders anything: a match is either dropped or kept verbatim.
                nonlocal working_text
                new_text = ""
                last_end = 0
                for match in html_table_pattern.finditer(working_text):
                    raw_table = match.group()
                    tables.append(raw_table)
                    if separate_tables:
                        new_text += working_text[last_end:match.start()] + "\n\n"
                    else:
                        new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
                    last_end = match.end()
                new_text += working_text[last_end:]
                working_text = new_text
            replace_html_tables()
        # NOTE(review): two returns in a row -- presumably old then new; taken
        # literally, the second one would be unreachable.
        return remainder, tables
        return working_text, tables
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
 from markdown import markdown 
 from markdown import markdown
 from PIL import Image
 from tika import parser
        """Get the hierarchical title structure before the table"""
        import re
        from docx.text.paragraph import Paragraph
        titles = []
        blocks = []
        # Get document name from filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"
        # Collect all document blocks while maintaining document order
        try:
            # Iterate through all paragraphs and tables in document order
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""
        # Find the target table position
        target_table_pos = -1
        table_count = 0
                    target_table_pos = pos
                    break
                table_count += 1
        if target_table_pos == -1:
            return ""  # Target table not found
        # Find the nearest heading paragraph in reverse order
        nearest_title = None
        for i in range(len(blocks)-1, -1, -1):
            block_type, pos, block = blocks[i]
            if pos >= target_table_pos:  # Skip blocks after the table
                continue
            if block_type != 'p':
                continue
            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                                break
                except Exception as e:
                    logging.error(f"Error parsing heading level: {e}")
        if nearest_title:
            # Add current title
            titles.append(nearest_title)
            current_level = nearest_title[0]
            # Find all parent headings, allowing cross-level search
            while current_level > 1:
                found = False
                    block_type, pos, block = blocks[i]
                    if pos >= target_table_pos:  # Skip blocks after the table
                        continue
                    if block_type != 'p':
                        continue
                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                        try:
                            level_match = re.search(r"(\d+)", block.style.name)
                            if level_match:
                                level = int(level_match.group(1))
                                # Find any heading with a higher level
                                if level < current_level:  
                                if level < current_level:
                                    title_text = block.text.strip()
                                    if title_text:  # Avoid empty titles
                                        titles.append((level, title_text))
                                        break
                        except Exception as e:
                            logging.error(f"Error parsing parent heading: {e}")
                if not found:  # Break if no parent heading is found
                    break
            # Sort by level (ascending, from highest to lowest)
            titles.sort(key=lambda x: x[0])
            # Organize titles (from highest to lowest)
            hierarchy = [doc_name] + [t[1] for t in titles]
            return " > ".join(hierarchy)
        return ""
    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
            text = sections[0]
        else:
            return []
        from bs4 import BeautifulSoup
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return html_images
    def get_pictures(self, text):
        """Download and open all images from markdown text."""
        import requests
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
                continue
        return images if images else None
    def __call__(self, filename, binary=None):
    def __call__(self, filename, binary=None, separate_tables=True):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
        sections = []
        tbls = []
        for sec in remainder.split("\n"):
    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables = markdown_parser(filename, binary)
        sections, tables = markdown_parser(filename, binary, separate_tables=False)
        # Process images for each section
        section_images = []
        for section_text, _ in sections:
                section_images.append(combined_image)
            else:
                section_images.append(None)
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")
                                            "delimiter", "\n!?。；！？"))
        if kwargs.get("section_only", False):
            return chunks
        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
    else:
        chunks = naive_merge(
            return chunks
        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    return res