您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. import re
  18. class RAGFlowMarkdownParser:
  19. def __init__(self, chunk_token_num=128):
  20. self.chunk_token_num = int(chunk_token_num)
  21. def extract_tables_and_remainder(self, markdown_text):
  22. tables = []
  23. remainder = markdown_text
  24. if "|" in markdown_text: # for optimize performance
  25. # Standard Markdown table
  26. border_table_pattern = re.compile(
  27. r'''
  28. (?:\n|^)
  29. (?:\|.*?\|.*?\|.*?\n)
  30. (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
  31. (?:\|.*?\|.*?\|.*?\n)+
  32. ''', re.VERBOSE)
  33. border_tables = border_table_pattern.findall(markdown_text)
  34. tables.extend(border_tables)
  35. remainder = border_table_pattern.sub('', remainder)
  36. # Borderless Markdown table
  37. no_border_table_pattern = re.compile(
  38. r'''
  39. (?:\n|^)
  40. (?:\S.*?\|.*?\n)
  41. (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
  42. (?:\S.*?\|.*?\n)+
  43. ''', re.VERBOSE)
  44. no_border_tables = no_border_table_pattern.findall(remainder)
  45. tables.extend(no_border_tables)
  46. remainder = no_border_table_pattern.sub('', remainder)
  47. if "<table>" in remainder.lower(): # for optimize performance
  48. #HTML table extraction - handle possible html/body wrapper tags
  49. html_table_pattern = re.compile(
  50. r'''
  51. (?:\n|^)
  52. \s*
  53. (?:
  54. # case1: <html><body><table>...</table></body></html>
  55. (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
  56. |
  57. # case2: <body><table>...</table></body>
  58. (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
  59. |
  60. # case3: only<table>...</table>
  61. (?:<table[^>]*>.*?</table>)
  62. )
  63. \s*
  64. (?=\n|$)
  65. ''',
  66. re.VERBOSE | re.DOTALL | re.IGNORECASE
  67. )
  68. html_tables = html_table_pattern.findall(remainder)
  69. tables.extend(html_tables)
  70. remainder = html_table_pattern.sub('', remainder)
  71. return remainder, tables