# -*- coding: utf-8 -*- # # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # import re import mistune from markdown import markdown class RAGFlowMarkdownParser: def __init__(self, chunk_token_num=128): self.chunk_token_num = int(chunk_token_num) def extract_tables_and_remainder(self, markdown_text, separate_tables=True): tables = [] working_text = markdown_text def replace_tables_with_rendered_html(pattern, table_list, render=True): new_text = "" last_end = 0 for match in pattern.finditer(working_text): raw_table = match.group() table_list.append(raw_table) if separate_tables: # Skip this match (i.e., remove it) new_text += working_text[last_end : match.start()] + "\n\n" else: # Replace with rendered HTML html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table new_text += working_text[last_end : match.start()] + html_table + "\n\n" last_end = match.end() new_text += working_text[last_end:] return new_text if "|" in markdown_text: # for optimize performance # Standard Markdown table border_table_pattern = re.compile( r""" (?:\n|^) (?:\|.*?\|.*?\|.*?\n) (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) (?:\|.*?\|.*?\|.*?\n)+ """, re.VERBOSE, ) working_text = replace_tables_with_rendered_html(border_table_pattern, tables) # Borderless Markdown table no_border_table_pattern = re.compile( r""" (?:\n|^) (?:\S.*?\|.*?\n) (?:(?:\s*[:-]+[-| :]*\s*).*?\n) (?:\S.*?\|.*?\n)+ """, re.VERBOSE, ) working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables) if "" in working_text.lower(): # for optimize performance # HTML table extraction - handle possible html/body wrapper tags html_table_pattern = re.compile( r""" (?:\n|^) \s* (?: # case1:

...

(?:]*>\s*]*>\s*]*>.*?\s*\s*) | # case2: ...

(?:]*>\s*]*>.*?\s*) | # case3: only...

(?:]*>.*?) ) \s* (?=\n|$) """, re.VERBOSE | re.DOTALL | re.IGNORECASE, ) def replace_html_tables(): nonlocal working_text new_text = "" last_end = 0 for match in html_table_pattern.finditer(working_text): raw_table = match.group() tables.append(raw_table) if separate_tables: new_text += working_text[last_end : match.start()] + "\n\n" else: new_text += working_text[last_end : match.start()] + raw_table + "\n\n" last_end = match.end() new_text += working_text[last_end:] working_text = new_text replace_html_tables() return working_text, tables class MarkdownElementExtractor: def __init__(self, markdown_content): self.markdown_content = markdown_content self.lines = markdown_content.split("\n") self.ast_parser = mistune.create_markdown(renderer="ast") self.ast_nodes = self.ast_parser(markdown_content) def extract_elements(self): """Extract individual elements (headers, code blocks, lists, etc.)""" sections = [] i = 0 while i < len(self.lines): line = self.lines[i] if re.match(r"^#{1,6}\s+.*$", line): # header element = self._extract_header(i) sections.append(element["content"]) i = element["end_line"] + 1 elif line.strip().startswith("```"): # code block element = self._extract_code_block(i) sections.append(element["content"]) i = element["end_line"] + 1 elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line): # list block element = self._extract_list_block(i) sections.append(element["content"]) i = element["end_line"] + 1 elif line.strip().startswith(">"): # blockquote element = self._extract_blockquote(i) sections.append(element["content"]) i = element["end_line"] + 1 elif line.strip(): # text block (paragraphs and inline elements until next block element) element = self._extract_text_block(i) sections.append(element["content"]) i = element["end_line"] + 1 else: i += 1 sections = [section for section in sections if section.strip()] return sections def _extract_header(self, start_pos): return { "type": "header", "content": self.lines[start_pos], "start_line": start_pos, "end_line": start_pos, } def _extract_code_block(self, start_pos): end_pos = start_pos content_lines = [self.lines[start_pos]] # Find the end of the code block for i in range(start_pos + 1, len(self.lines)): content_lines.append(self.lines[i]) end_pos = i if self.lines[i].strip().startswith("```"): break return { "type": "code_block", "content": "\n".join(content_lines), "start_line": start_pos, "end_line": end_pos, } def _extract_list_block(self, start_pos): end_pos = start_pos content_lines = [] i = start_pos while i < len(self.lines): line = self.lines[i] # check if this line is a list item or continuation of a list if ( re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or (i > start_pos and not line.strip()) or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line)) or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line)) or (i > start_pos and re.match(r"^\s+\w+.*$", line)) ): content_lines.append(line) end_pos = i i += 1 else: break return { "type": "list_block", "content": "\n".join(content_lines), "start_line": start_pos, "end_line": end_pos, } def _extract_blockquote(self, start_pos): end_pos = start_pos content_lines = [] i = start_pos while i < len(self.lines): line = self.lines[i] if line.strip().startswith(">") or (i > start_pos and not line.strip()): content_lines.append(line) end_pos = i i += 1 else: break return { "type": "blockquote", "content": "\n".join(content_lines), "start_line": start_pos, "end_line": end_pos, } def _extract_text_block(self, start_pos): """Extract a text block (paragraphs, inline elements) until next block element""" end_pos = start_pos content_lines = [self.lines[start_pos]] i = start_pos + 1 while i < len(self.lines): line = self.lines[i] # stop if we encounter a block element if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"): break elif not line.strip(): # check if the next line is a block element if i + 1 < len(self.lines) and ( re.match(r"^#{1,6}\s+.*$", self.lines[i + 1]) or self.lines[i + 1].strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1]) or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1]) or self.lines[i + 1].strip().startswith(">") ): break else: content_lines.append(line) end_pos = i i += 1 else: content_lines.append(line) end_pos = i i += 1 return { "type": "text_block", "content": "\n".join(content_lines), "start_line": start_pos, "end_line": end_pos, }