Просмотр исходного кода

Feat: Optimize the table extraction logic in the Markdown parser (#5663)

Enhance the recognition of both borderless and bordered Markdown tables.
Add support for extracting HTML tables, including various scenarios with
nested HTML tags. Improve performance by using conditional checks to
reduce unnecessary regular expression matching.

### What problem does this PR solve?

Optimize the table extraction logic in the Markdown parser:

- Enhance the recognition of both borderless and bordered Markdown tables.
- Add support for extracting HTML tables, including various scenarios with
  nested HTML tags.
- Improve performance by using conditional checks to reduce unnecessary
  regular expression matching.

### Type of change

- [x] Performance Improvement

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
tags/v0.17.1
liwenju0 7 месяцев назад
Родитель
Коммит
5b0e38060a
Аккаунт пользователя с таким Email не найден
1 изменённый файл: 48 добавлений и 19 удалений
  1. 48
    19
      deepdoc/parser/markdown_parser.py

+ 48
- 19
deepdoc/parser/markdown_parser.py Просмотреть файл

@@ -22,27 +22,56 @@ class RAGFlowMarkdownParser:
self.chunk_token_num = int(chunk_token_num)

def extract_tables_and_remainder(self, markdown_text):
# Standard Markdown table
table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
tables = []
remainder = markdown_text
if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE)
tables = table_pattern.findall(markdown_text)
remainder = table_pattern.sub('', markdown_text)
border_tables = border_table_pattern.findall(markdown_text)
tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)

# Borderless Markdown table
no_border_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)

# Borderless Markdown table
no_border_table_pattern = re.compile(
if "<table>" in remainder.lower(): # for optimize performance
# HTML table extraction: handle optional <html>/<body> wrapper tags
html_table_pattern = re.compile(
r'''
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
(?:\n|^)
\s*
(?:
# case1: <html><body><table>...</table></body></html>
(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
|
# case2: <body><table>...</table></body>
(?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
|
# case3: only<table>...</table>
(?:<table[^>]*>.*?</table>)
)
\s*
(?=\n|$)
''',
re.VERBOSE | re.DOTALL | re.IGNORECASE
)
html_tables = html_table_pattern.findall(remainder)
tables.extend(html_tables)
remainder = html_table_pattern.sub('', remainder)

return remainder, tables

Загрузка…
Отмена
Сохранить