Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

markdown_parser.py 4.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. import re
  18. from markdown import markdown
  19. class RAGFlowMarkdownParser:
  20. def __init__(self, chunk_token_num=128):
  21. self.chunk_token_num = int(chunk_token_num)
  22. def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
  23. tables = []
  24. working_text = markdown_text
  25. def replace_tables_with_rendered_html(pattern, table_list, render=True):
  26. new_text = ""
  27. last_end = 0
  28. for match in pattern.finditer(working_text):
  29. raw_table = match.group()
  30. table_list.append(raw_table)
  31. if separate_tables:
  32. # Skip this match (i.e., remove it)
  33. new_text += working_text[last_end:match.start()] + "\n\n"
  34. else:
  35. # Replace with rendered HTML
  36. html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
  37. new_text += working_text[last_end:match.start()] + html_table + "\n\n"
  38. last_end = match.end()
  39. new_text += working_text[last_end:]
  40. return new_text
  41. if "|" in markdown_text: # for optimize performance
  42. # Standard Markdown table
  43. border_table_pattern = re.compile(
  44. r'''
  45. (?:\n|^)
  46. (?:\|.*?\|.*?\|.*?\n)
  47. (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
  48. (?:\|.*?\|.*?\|.*?\n)+
  49. ''', re.VERBOSE)
  50. working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
  51. # Borderless Markdown table
  52. no_border_table_pattern = re.compile(
  53. r'''
  54. (?:\n|^)
  55. (?:\S.*?\|.*?\n)
  56. (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
  57. (?:\S.*?\|.*?\n)+
  58. ''', re.VERBOSE)
  59. working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
  60. if "<table>" in working_text.lower(): # for optimize performance
  61. #HTML table extraction - handle possible html/body wrapper tags
  62. html_table_pattern = re.compile(
  63. r'''
  64. (?:\n|^)
  65. \s*
  66. (?:
  67. # case1: <html><body><table>...</table></body></html>
  68. (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
  69. |
  70. # case2: <body><table>...</table></body>
  71. (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
  72. |
  73. # case3: only<table>...</table>
  74. (?:<table[^>]*>.*?</table>)
  75. )
  76. \s*
  77. (?=\n|$)
  78. ''',
  79. re.VERBOSE | re.DOTALL | re.IGNORECASE
  80. )
  81. def replace_html_tables():
  82. nonlocal working_text
  83. new_text = ""
  84. last_end = 0
  85. for match in html_table_pattern.finditer(working_text):
  86. raw_table = match.group()
  87. tables.append(raw_table)
  88. if separate_tables:
  89. new_text += working_text[last_end:match.start()] + "\n\n"
  90. else:
  91. new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
  92. last_end = match.end()
  93. new_text += working_text[last_end:]
  94. working_text = new_text
  95. replace_html_tables()
  96. return working_text, tables