Procházet zdrojové kódy

Feat:Optimize the table extraction logic in the Markdown parser: (#5663)

Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with
nested HTML tags. Improve performance by using conditional checks to
reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:
Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with
nested HTML tags.
Improve performance by using conditional checks to reduce unnecessary
regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
tags/v0.17.1
liwenju0 před 8 měsíci
rodič
revize
5b0e38060a
Žádný účet není propojen s e-mailovou adresou tvůrce revize
1 změnil soubory, kde provedl 48 přidání a 19 odebrání
  1. 48
    19
      deepdoc/parser/markdown_parser.py

+ 48
- 19
deepdoc/parser/markdown_parser.py Zobrazit soubor

@@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
self.chunk_token_num = int(chunk_token_num)

def extract_tables_and_remainder(self, markdown_text):
# Standard Markdown table
table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
tables = []
remainder = markdown_text
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE)
tables = table_pattern.findall(markdown_text)
remainder = table_pattern.sub('', markdown_text)
border_tables = border_table_pattern.findall(markdown_text)
tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)

# Borderless Markdown table
no_border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)

# Borderless Markdown table
no_border_table_pattern = re.compile(
if "<table>" in remainder.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
(?:\n|^)
\s*
(?:
# case1: <html><body><table>...</table></body></html>
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
# case2: <body><table>...</table></body>
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
# case3: only<table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
html_tables = html_table_pattern.findall(remainder)
tables.extend(html_tables)
remainder = html_table_pattern.sub('', remainder)

return remainder, tables

Načítá se…
Zrušit
Uložit