Enhance the recognition of both borderless and bordered Markdown tables. Add support for extracting HTML tables, including various scenarios with nested HTML tags. Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>

před 8 měsíci · 5b0e38060a
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text):
        """Separate the tables in a Markdown document from the rest of its text.

        Three kinds of table are recognised, in this order:

        1. Bordered Markdown tables (every row starts with ``|``).
        2. Borderless Markdown tables (rows contain ``|`` but do not start
           with it).
        3. HTML ``<table>`` elements, optionally wrapped in ``<body>`` or
           ``<html><body>`` tags.

        Each pass runs on the text left over by the previous one, so a span
        is never reported twice.

        Args:
            markdown_text: The Markdown source to scan.

        Returns:
            A ``(remainder, tables)`` tuple. ``tables`` is a list of the
            matched table strings exactly as they appear in the input
            (including the newline that precedes a table, when there is one).
            ``remainder`` is the input with every matched table removed.

        Note:
            The Markdown patterns require every row, including the last one,
            to end with a newline; a table whose final row is not
            newline-terminated is not matched.
        """
        tables = []
        remainder = markdown_text

        # Cheap substring test so pipe-free documents skip both regex passes.
        if "|" in remainder:
            # Standard (bordered) Markdown table: header row, separator row,
            # then one or more body rows.
            border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
                ''', re.VERBOSE)
            tables.extend(border_table_pattern.findall(remainder))
            remainder = border_table_pattern.sub('', remainder)

            # Borderless Markdown table: same shape, but rows start with a
            # non-space character instead of a leading pipe.
            no_border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
            tables.extend(no_border_table_pattern.findall(remainder))
            remainder = no_border_table_pattern.sub('', remainder)

        # Cheap substring test before the (DOTALL) HTML regex. Look for
        # "<table" rather than "<table>" so that opening tags carrying
        # attributes, e.g. <table border="1">, are not skipped; the regex
        # below already accepts them via <table[^>]*>.
        if "<table" in remainder.lower():
            # HTML table extraction - handle possible html/body wrapper tags
            html_table_pattern = re.compile(
                r'''
                (?:\n|^)
                \s*
                (?:
                    # case1: <html><body><table>...</table></body></html>
                    (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
                    |
                    # case2: <body><table>...</table></body>
                    (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                    |
                    # case3: only<table>...</table>
                    (?:<table[^>]*>.*?</table>)
                )
                \s*
                (?=\n|$)
                ''',
                re.VERBOSE | re.DOTALL | re.IGNORECASE
            )
            tables.extend(html_table_pattern.findall(remainder))
            remainder = html_table_pattern.sub('', remainder)

        return remainder, tables