| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 | 
							- # -*- coding: utf-8 -*-
 - #
 - #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 - #
 - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - 
 - import re
 - 
 - import mistune
 - from markdown import markdown
 - 
 - 
 - class RAGFlowMarkdownParser:
 -     def __init__(self, chunk_token_num=128):
 -         self.chunk_token_num = int(chunk_token_num)
 - 
 -     def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
 -         tables = []
 -         working_text = markdown_text
 - 
 -         def replace_tables_with_rendered_html(pattern, table_list, render=True):
 -             new_text = ""
 -             last_end = 0
 -             for match in pattern.finditer(working_text):
 -                 raw_table = match.group()
 -                 table_list.append(raw_table)
 -                 if separate_tables:
 -                     # Skip this match (i.e., remove it)
 -                     new_text += working_text[last_end : match.start()] + "\n\n"
 -                 else:
 -                     # Replace with rendered HTML
 -                     html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
 -                     new_text += working_text[last_end : match.start()] + html_table + "\n\n"
 -                 last_end = match.end()
 -             new_text += working_text[last_end:]
 -             return new_text
 - 
 -         if "|" in markdown_text:  # for optimize performance
 -             # Standard Markdown table
 -             border_table_pattern = re.compile(
 -                 r"""
 -                 (?:\n|^)
 -                 (?:\|.*?\|.*?\|.*?\n)
 -                 (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
 -                 (?:\|.*?\|.*?\|.*?\n)+
 -             """,
 -                 re.VERBOSE,
 -             )
 -             working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
 - 
 -             # Borderless Markdown table
 -             no_border_table_pattern = re.compile(
 -                 r"""
 -                 (?:\n|^)
 -                 (?:\S.*?\|.*?\n)
 -                 (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
 -                 (?:\S.*?\|.*?\n)+
 -                 """,
 -                 re.VERBOSE,
 -             )
 -             working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
 - 
 -         if "<table>" in working_text.lower():  # for optimize performance
 -             # HTML table extraction - handle possible html/body wrapper tags
 -             html_table_pattern = re.compile(
 -                 r"""
 -             (?:\n|^)
 -             \s*
 -             (?:
 -                 # case1: <html><body><table>...</table></body></html>
 -                 (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
 -                 |
 -                 # case2: <body><table>...</table></body>
 -                 (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
 -                 |
 -                 # case3: only<table>...</table>
 -                 (?:<table[^>]*>.*?</table>)
 -             )
 -             \s*
 -             (?=\n|$)
 -             """,
 -                 re.VERBOSE | re.DOTALL | re.IGNORECASE,
 -             )
 - 
 -             def replace_html_tables():
 -                 nonlocal working_text
 -                 new_text = ""
 -                 last_end = 0
 -                 for match in html_table_pattern.finditer(working_text):
 -                     raw_table = match.group()
 -                     tables.append(raw_table)
 -                     if separate_tables:
 -                         new_text += working_text[last_end : match.start()] + "\n\n"
 -                     else:
 -                         new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
 -                     last_end = match.end()
 -                 new_text += working_text[last_end:]
 -                 working_text = new_text
 - 
 -             replace_html_tables()
 - 
 -         return working_text, tables
 - 
 - 
 - class MarkdownElementExtractor:
 -     def __init__(self, markdown_content):
 -         self.markdown_content = markdown_content
 -         self.lines = markdown_content.split("\n")
 -         self.ast_parser = mistune.create_markdown(renderer="ast")
 -         self.ast_nodes = self.ast_parser(markdown_content)
 - 
 -     def extract_elements(self):
 -         """Extract individual elements (headers, code blocks, lists, etc.)"""
 -         sections = []
 - 
 -         i = 0
 -         while i < len(self.lines):
 -             line = self.lines[i]
 - 
 -             if re.match(r"^#{1,6}\s+.*$", line):
 -                 # header
 -                 element = self._extract_header(i)
 -                 sections.append(element["content"])
 -                 i = element["end_line"] + 1
 -             elif line.strip().startswith("```"):
 -                 # code block
 -                 element = self._extract_code_block(i)
 -                 sections.append(element["content"])
 -                 i = element["end_line"] + 1
 -             elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
 -                 # list block
 -                 element = self._extract_list_block(i)
 -                 sections.append(element["content"])
 -                 i = element["end_line"] + 1
 -             elif line.strip().startswith(">"):
 -                 # blockquote
 -                 element = self._extract_blockquote(i)
 -                 sections.append(element["content"])
 -                 i = element["end_line"] + 1
 -             elif line.strip():
 -                 # text block (paragraphs and inline elements until next block element)
 -                 element = self._extract_text_block(i)
 -                 sections.append(element["content"])
 -                 i = element["end_line"] + 1
 -             else:
 -                 i += 1
 - 
 -         sections = [section for section in sections if section.strip()]
 -         return sections
 - 
 -     def _extract_header(self, start_pos):
 -         return {
 -             "type": "header",
 -             "content": self.lines[start_pos],
 -             "start_line": start_pos,
 -             "end_line": start_pos,
 -         }
 - 
 -     def _extract_code_block(self, start_pos):
 -         end_pos = start_pos
 -         content_lines = [self.lines[start_pos]]
 - 
 -         # Find the end of the code block
 -         for i in range(start_pos + 1, len(self.lines)):
 -             content_lines.append(self.lines[i])
 -             end_pos = i
 -             if self.lines[i].strip().startswith("```"):
 -                 break
 - 
 -         return {
 -             "type": "code_block",
 -             "content": "\n".join(content_lines),
 -             "start_line": start_pos,
 -             "end_line": end_pos,
 -         }
 - 
 -     def _extract_list_block(self, start_pos):
 -         end_pos = start_pos
 -         content_lines = []
 - 
 -         i = start_pos
 -         while i < len(self.lines):
 -             line = self.lines[i]
 -             # check if this line is a list item or continuation of a list
 -             if (
 -                 re.match(r"^\s*[-*+]\s+.*$", line)
 -                 or re.match(r"^\s*\d+\.\s+.*$", line)
 -                 or (i > start_pos and not line.strip())
 -                 or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
 -                 or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
 -                 or (i > start_pos and re.match(r"^\s+\w+.*$", line))
 -             ):
 -                 content_lines.append(line)
 -                 end_pos = i
 -                 i += 1
 -             else:
 -                 break
 - 
 -         return {
 -             "type": "list_block",
 -             "content": "\n".join(content_lines),
 -             "start_line": start_pos,
 -             "end_line": end_pos,
 -         }
 - 
 -     def _extract_blockquote(self, start_pos):
 -         end_pos = start_pos
 -         content_lines = []
 - 
 -         i = start_pos
 -         while i < len(self.lines):
 -             line = self.lines[i]
 -             if line.strip().startswith(">") or (i > start_pos and not line.strip()):
 -                 content_lines.append(line)
 -                 end_pos = i
 -                 i += 1
 -             else:
 -                 break
 - 
 -         return {
 -             "type": "blockquote",
 -             "content": "\n".join(content_lines),
 -             "start_line": start_pos,
 -             "end_line": end_pos,
 -         }
 - 
 -     def _extract_text_block(self, start_pos):
 -         """Extract a text block (paragraphs, inline elements) until next block element"""
 -         end_pos = start_pos
 -         content_lines = [self.lines[start_pos]]
 - 
 -         i = start_pos + 1
 -         while i < len(self.lines):
 -             line = self.lines[i]
 -             # stop if we encounter a block element
 -             if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
 -                 break
 -             elif not line.strip():
 -                 # check if the next line is a block element
 -                 if i + 1 < len(self.lines) and (
 -                     re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
 -                     or self.lines[i + 1].strip().startswith("```")
 -                     or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
 -                     or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
 -                     or self.lines[i + 1].strip().startswith(">")
 -                 ):
 -                     break
 -                 else:
 -                     content_lines.append(line)
 -                     end_pos = i
 -                     i += 1
 -             else:
 -                 content_lines.append(line)
 -                 end_pos = i
 -                 i += 1
 - 
 -         return {
 -             "type": "text_block",
 -             "content": "\n".join(content_lines),
 -             "start_line": start_pos,
 -             "end_line": end_pos,
 -         }
 
 
  |