# -*- coding: utf-8 -*-
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import re


class RAGFlowMarkdownParser:
    """Separate Markdown tables from the surrounding text.

    Two table flavours are recognised: "bordered" tables, whose rows start
    with a pipe, and "borderless" tables, whose rows merely contain a pipe.
    Both need a header row, a separator row and at least one body row.
    """

    # Compiled once at class-creation time instead of on every call.
    _BORDERED_TABLE_RE = re.compile(
        r'''
        (?:\n|^)                                # start of text or end of the previous line
        (?:\|.*?\|.*?\|.*?\n)                   # header row
        (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)     # separator row
        (?:\|.*?\|.*?\|.*?(?:\n|\Z))+           # body rows; the last one may end the text
        ''', re.VERBOSE)

    _BORDERLESS_TABLE_RE = re.compile(
        r'''
        (?:\n|^)                                # start of text or end of the previous line
        (?:\S.*?\|.*?\n)                        # header row
        (?:(?:\s*[:-]+[-| :]*\s*).*?\n)         # separator row
        (?:\S.*?\|.*?(?:\n|\Z))+                # body rows; the last one may end the text
        ''', re.VERBOSE)

    def __init__(self, chunk_token_num=128):
        """Store the chunk size.

        Args:
            chunk_token_num: Value coerced with ``int()`` and kept on the
                instance as ``chunk_token_num``.
        """
        self.chunk_token_num = int(chunk_token_num)

    @staticmethod
    def _pull_tables(pattern, text):
        """Cut every match of *pattern* out of *text* in a single pass.

        Returns ``(remainder, tables)`` where *tables* holds the matched
        substrings in document order.
        """
        tables = []
        kept = []
        last_kept_char = ""
        cursor = 0
        for match in pattern.finditer(text):
            tables.append(match.group(0))
            chunk = text[cursor:match.start()]
            if chunk:
                kept.append(chunk)
                last_kept_char = chunk[-1]
            cursor = match.end()
            next_char = text[cursor:cursor + 1]
            # A match swallows the newline that ended the preceding line.
            # If plain text sits directly on both sides of the table, put one
            # newline back so the two lines are not fused into one.
            if last_kept_char not in ("", "\n") and next_char not in ("", "\n"):
                kept.append("\n")
                last_kept_char = "\n"
        kept.append(text[cursor:])
        return "".join(kept), tables

    def extract_tables_and_remainder(self, markdown_text):
        """Split *markdown_text* into non-table text and table blocks.

        Bordered tables are removed first; borderless tables are then
        searched for in what is left.

        Args:
            markdown_text: Markdown source as a string.

        Returns:
            A ``(remainder, tables)`` tuple. ``remainder`` is the text with
            all detected tables removed. ``tables`` lists the raw matched
            table strings, bordered ones first, each including the leading
            newline that preceded it when the table did not start the text.
        """
        remainder, tables = self._pull_tables(
            self._BORDERED_TABLE_RE, markdown_text)
        remainder, borderless_tables = self._pull_tables(
            self._BORDERLESS_TABLE_RE, remainder)
        tables.extend(borderless_tables)
        return remainder, tables