|
1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
- # -*- coding: utf-8 -*-
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import re
-
class RAGFlowMarkdownParser:
    """Separate Markdown tables from the surrounding text.

    Two table flavours are recognised, each consisting of a header row, a
    separator row (dashes/colons) and at least one body row:

    * bordered tables, whose rows start with ``|``;
    * borderless tables, whose rows start with a non-space character and
      contain at least one ``|``.
    """

    # Bordered table. Identical to the historical pattern except that the
    # last body row may be terminated by end-of-text instead of a newline,
    # so a table that closes the document is captured in full.
    _BORDERED_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\|.*?\|.*?\|.*?\n)
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    # Borderless table, with the same end-of-text allowance on body rows.
    _BORDERLESS_TABLE_RE = re.compile(
        r'''
        (?:\n|^)
        (?:\S.*?\|.*?\n)
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
        (?:\S.*?\|.*?(?:\n|\Z))+
        ''', re.VERBOSE)

    def __init__(self, chunk_token_num=128):
        """Store the chunk size, coerced to ``int``.

        Args:
            chunk_token_num: Chunk size in tokens; any value accepted by
                ``int()`` (for example ``128`` or ``"128"``).
        """
        self.chunk_token_num = int(chunk_token_num)

    @staticmethod
    def _cut_tables(pattern, text):
        """Remove every match of ``pattern`` from ``text``.

        Returns:
            ``(remainder, tables)`` where ``tables`` holds the raw text of
            each match in document order and ``remainder`` is ``text`` with
            those matches removed.
        """
        tables = []
        kept = []
        cursor = 0
        for match in pattern.finditer(text):
            start, end = match.span()
            raw = match.group()
            kept.append(text[cursor:start])
            tables.append(raw)
            # The match swallows the newline that ended the previous line.
            # If text sits directly before and after the table, dropping
            # that newline would glue the two lines together, so restore it.
            # In every other situation the output is left exactly as a plain
            # substitution with '' would produce it.
            fuses_lines = (
                raw.startswith("\n")
                and start > 0
                and text[start - 1] != "\n"
                and end < len(text)
                and text[end] != "\n"
            )
            if fuses_lines:
                kept.append("\n")
            cursor = end
        kept.append(text[cursor:])
        return "".join(kept), tables

    def extract_tables_and_remainder(self, markdown_text):
        """Split ``markdown_text`` into non-table text and a list of tables.

        Bordered tables are extracted first; borderless tables are then
        looked for in what is left, and appended after the bordered ones.

        Args:
            markdown_text: The Markdown source as a string.

        Returns:
            A ``(remainder, tables)`` tuple. ``remainder`` is the input with
            all recognised tables removed. ``tables`` is a list of the raw
            matched table strings; a table that did not start at the very
            beginning of the text includes the newline that preceded it.
        """
        remainder, tables = self._cut_tables(
            self._BORDERED_TABLE_RE, markdown_text)
        remainder, borderless_tables = self._cut_tables(
            self._BORDERLESS_TABLE_RE, remainder)
        tables.extend(borderless_tables)
        return remainder, tables
|