You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

markdown_parser.py 1.6KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. # -*- coding: utf-8 -*-
  2. # Licensed under the Apache License, Version 2.0 (the "License");
  3. # you may not use this file except in compliance with the License.
  4. # You may obtain a copy of the License at
  5. #
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. #
  8. # Unless required by applicable law or agreed to in writing, software
  9. # distributed under the License is distributed on an "AS IS" BASIS,
  10. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11. # See the License for the specific language governing permissions and
  12. # limitations under the License.
  13. #
  14. import re
  class RAGFlowMarkdownParser:
      """Separate the tables in a Markdown document from the rest of its text.

      Two table flavours are recognised, in this order:

      * bordered tables, where every row starts with ``|``;
      * borderless tables, where rows start with a non-space character and
        contain at least one ``|``.

      Both flavours need a header row, a separator row and at least one body
      row.
      """

      # Compiled once at class-definition time instead of on every call.
      # re.VERBOSE ignores the layout whitespace and the trailing comments.
      _BORDERED_TABLE_RE = re.compile(
          r'''
          (?:\n|^)                               # start of a line
          (?:\|.*?\|.*?\|.*?\n)                  # header row with three or more pipes
          (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)    # separator row made of dashes and colons
          (?:\|.*?\|.*?\|.*?(?:\n|$))+           # body rows, the last may end the input
          ''', re.VERBOSE)

      _BORDERLESS_TABLE_RE = re.compile(
          r'''
          (?:\n|^)                               # start of a line
          (?:\S.*?\|.*?\n)                       # header row containing a pipe
          (?:(?:\s*[:-]+[-| :]*\s*).*?\n)        # separator row starting with dashes or colons
          (?:\S.*?\|.*?(?:\n|$))+                # body rows, the last may end the input
          ''', re.VERBOSE)

      def __init__(self, chunk_token_num=128):
          """Store the chunk size.

          Args:
              chunk_token_num: Value coerced with ``int()`` and kept on the
                  instance; raises whatever ``int()`` raises for bad input.
          """
          self.chunk_token_num = int(chunk_token_num)

      @staticmethod
      def _cut_tables(pattern, text):
          """Remove every match of *pattern* from *text*.

          Returns:
              A ``(remainder, tables)`` tuple where *tables* holds the matched
              strings in document order.
          """
          tables = []
          text_length = len(text)

          def _collect(match):
              tables.append(match.group(0))
              # The match swallows the newline before the table and the one
              # after its last row. When text exists on both sides, put one
              # newline back so the neighbouring lines are not glued together.
              if match.start() > 0 and match.end() < text_length:
                  return "\n"
              return ""

          remainder = pattern.sub(_collect, text)
          return remainder, tables

      def extract_tables_and_remainder(self, markdown_text):
          """Pull all Markdown tables out of *markdown_text*.

          Bordered tables are extracted first; borderless tables are then
          searched for in what is left.

          Args:
              markdown_text: The Markdown source as a string.

          Returns:
              A ``(remainder, tables)`` tuple. *remainder* is the text with the
              tables removed. *tables* lists the raw matched table strings
              (including the line break that precedes a table that is not at
              the very start of the text), bordered tables first.
          """
          # Standard Markdown table
          remainder, tables = self._cut_tables(
              self._BORDERED_TABLE_RE, markdown_text)

          # Borderless Markdown table
          remainder, borderless_tables = self._cut_tables(
              self._BORDERLESS_TABLE_RE, remainder)
          tables.extend(borderless_tables)

          return remainder, tables