Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

markdown_parser.py 1.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. # -*- coding: utf-8 -*-
  2. #
  3. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. #
  17. import re
  18. class RAGFlowMarkdownParser:
  19. def __init__(self, chunk_token_num=128):
  20. self.chunk_token_num = int(chunk_token_num)
  21. def extract_tables_and_remainder(self, markdown_text):
  22. # Standard Markdown table
  23. table_pattern = re.compile(
  24. r'''
  25. (?:\n|^)
  26. (?:\|.*?\|.*?\|.*?\n)
  27. (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
  28. (?:\|.*?\|.*?\|.*?\n)+
  29. ''', re.VERBOSE)
  30. tables = table_pattern.findall(markdown_text)
  31. remainder = table_pattern.sub('', markdown_text)
  32. # Borderless Markdown table
  33. no_border_table_pattern = re.compile(
  34. r'''
  35. (?:\n|^)
  36. (?:\S.*?\|.*?\n)
  37. (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
  38. (?:\S.*?\|.*?\n)+
  39. ''', re.VERBOSE)
  40. no_border_tables = no_border_table_pattern.findall(remainder)
  41. tables.extend(no_border_tables)
  42. remainder = no_border_table_pattern.sub('', remainder)
  43. return remainder, tables