Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. import re
  18. import mistune
  19. from markdown import markdown
  20. class RAGFlowMarkdownParser:
  21. def __init__(self, chunk_token_num=128):
  22. self.chunk_token_num = int(chunk_token_num)
  23. def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
  24. tables = []
  25. working_text = markdown_text
  26. def replace_tables_with_rendered_html(pattern, table_list, render=True):
  27. new_text = ""
  28. last_end = 0
  29. for match in pattern.finditer(working_text):
  30. raw_table = match.group()
  31. table_list.append(raw_table)
  32. if separate_tables:
  33. # Skip this match (i.e., remove it)
  34. new_text += working_text[last_end : match.start()] + "\n\n"
  35. else:
  36. # Replace with rendered HTML
  37. html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
  38. new_text += working_text[last_end : match.start()] + html_table + "\n\n"
  39. last_end = match.end()
  40. new_text += working_text[last_end:]
  41. return new_text
  42. if "|" in markdown_text: # for optimize performance
  43. # Standard Markdown table
  44. border_table_pattern = re.compile(
  45. r"""
  46. (?:\n|^)
  47. (?:\|.*?\|.*?\|.*?\n)
  48. (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
  49. (?:\|.*?\|.*?\|.*?\n)+
  50. """,
  51. re.VERBOSE,
  52. )
  53. working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
  54. # Borderless Markdown table
  55. no_border_table_pattern = re.compile(
  56. r"""
  57. (?:\n|^)
  58. (?:\S.*?\|.*?\n)
  59. (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
  60. (?:\S.*?\|.*?\n)+
  61. """,
  62. re.VERBOSE,
  63. )
  64. working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
  65. if "<table>" in working_text.lower(): # for optimize performance
  66. # HTML table extraction - handle possible html/body wrapper tags
  67. html_table_pattern = re.compile(
  68. r"""
  69. (?:\n|^)
  70. \s*
  71. (?:
  72. # case1: <html><body><table>...</table></body></html>
  73. (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
  74. |
  75. # case2: <body><table>...</table></body>
  76. (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
  77. |
  78. # case3: only<table>...</table>
  79. (?:<table[^>]*>.*?</table>)
  80. )
  81. \s*
  82. (?=\n|$)
  83. """,
  84. re.VERBOSE | re.DOTALL | re.IGNORECASE,
  85. )
  86. def replace_html_tables():
  87. nonlocal working_text
  88. new_text = ""
  89. last_end = 0
  90. for match in html_table_pattern.finditer(working_text):
  91. raw_table = match.group()
  92. tables.append(raw_table)
  93. if separate_tables:
  94. new_text += working_text[last_end : match.start()] + "\n\n"
  95. else:
  96. new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
  97. last_end = match.end()
  98. new_text += working_text[last_end:]
  99. working_text = new_text
  100. replace_html_tables()
  101. return working_text, tables
  102. class MarkdownElementExtractor:
  103. def __init__(self, markdown_content):
  104. self.markdown_content = markdown_content
  105. self.lines = markdown_content.split("\n")
  106. self.ast_parser = mistune.create_markdown(renderer="ast")
  107. self.ast_nodes = self.ast_parser(markdown_content)
  108. def extract_elements(self):
  109. """Extract individual elements (headers, code blocks, lists, etc.)"""
  110. sections = []
  111. i = 0
  112. while i < len(self.lines):
  113. line = self.lines[i]
  114. if re.match(r"^#{1,6}\s+.*$", line):
  115. # header
  116. element = self._extract_header(i)
  117. sections.append(element["content"])
  118. i = element["end_line"] + 1
  119. elif line.strip().startswith("```"):
  120. # code block
  121. element = self._extract_code_block(i)
  122. sections.append(element["content"])
  123. i = element["end_line"] + 1
  124. elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
  125. # list block
  126. element = self._extract_list_block(i)
  127. sections.append(element["content"])
  128. i = element["end_line"] + 1
  129. elif line.strip().startswith(">"):
  130. # blockquote
  131. element = self._extract_blockquote(i)
  132. sections.append(element["content"])
  133. i = element["end_line"] + 1
  134. elif line.strip():
  135. # text block (paragraphs and inline elements until next block element)
  136. element = self._extract_text_block(i)
  137. sections.append(element["content"])
  138. i = element["end_line"] + 1
  139. else:
  140. i += 1
  141. sections = [section for section in sections if section.strip()]
  142. return sections
  143. def _extract_header(self, start_pos):
  144. return {
  145. "type": "header",
  146. "content": self.lines[start_pos],
  147. "start_line": start_pos,
  148. "end_line": start_pos,
  149. }
  150. def _extract_code_block(self, start_pos):
  151. end_pos = start_pos
  152. content_lines = [self.lines[start_pos]]
  153. # Find the end of the code block
  154. for i in range(start_pos + 1, len(self.lines)):
  155. content_lines.append(self.lines[i])
  156. end_pos = i
  157. if self.lines[i].strip().startswith("```"):
  158. break
  159. return {
  160. "type": "code_block",
  161. "content": "\n".join(content_lines),
  162. "start_line": start_pos,
  163. "end_line": end_pos,
  164. }
  165. def _extract_list_block(self, start_pos):
  166. end_pos = start_pos
  167. content_lines = []
  168. i = start_pos
  169. while i < len(self.lines):
  170. line = self.lines[i]
  171. # check if this line is a list item or continuation of a list
  172. if (
  173. re.match(r"^\s*[-*+]\s+.*$", line)
  174. or re.match(r"^\s*\d+\.\s+.*$", line)
  175. or (i > start_pos and not line.strip())
  176. or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
  177. or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
  178. or (i > start_pos and re.match(r"^\s+\w+.*$", line))
  179. ):
  180. content_lines.append(line)
  181. end_pos = i
  182. i += 1
  183. else:
  184. break
  185. return {
  186. "type": "list_block",
  187. "content": "\n".join(content_lines),
  188. "start_line": start_pos,
  189. "end_line": end_pos,
  190. }
  191. def _extract_blockquote(self, start_pos):
  192. end_pos = start_pos
  193. content_lines = []
  194. i = start_pos
  195. while i < len(self.lines):
  196. line = self.lines[i]
  197. if line.strip().startswith(">") or (i > start_pos and not line.strip()):
  198. content_lines.append(line)
  199. end_pos = i
  200. i += 1
  201. else:
  202. break
  203. return {
  204. "type": "blockquote",
  205. "content": "\n".join(content_lines),
  206. "start_line": start_pos,
  207. "end_line": end_pos,
  208. }
  209. def _extract_text_block(self, start_pos):
  210. """Extract a text block (paragraphs, inline elements) until next block element"""
  211. end_pos = start_pos
  212. content_lines = [self.lines[start_pos]]
  213. i = start_pos + 1
  214. while i < len(self.lines):
  215. line = self.lines[i]
  216. # stop if we encounter a block element
  217. if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
  218. break
  219. elif not line.strip():
  220. # check if the next line is a block element
  221. if i + 1 < len(self.lines) and (
  222. re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
  223. or self.lines[i + 1].strip().startswith("```")
  224. or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
  225. or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
  226. or self.lines[i + 1].strip().startswith(">")
  227. ):
  228. break
  229. else:
  230. content_lines.append(line)
  231. end_pos = i
  232. i += 1
  233. else:
  234. content_lines.append(line)
  235. end_pos = i
  236. i += 1
  237. return {
  238. "type": "text_block",
  239. "content": "\n".join(content_lines),
  240. "start_line": start_pos,
  241. "end_line": end_pos,
  242. }