|
|
|
@@ -32,7 +32,6 @@ from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, Mark |
|
|
|
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper |
|
|
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser |
|
|
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table |
|
|
|
from rag.utils import num_tokens_from_string |
|
|
|
|
|
|
|
|
|
|
|
class Docx(DocxParser): |
|
|
|
@@ -335,17 +334,13 @@ class Markdown(MarkdownParser): |
|
|
|
sections = [] |
|
|
|
tbls = [] |
|
|
|
for sec in remainder.split("\n"): |
|
|
|
if num_tokens_from_string(sec) > 3 * self.chunk_token_num: |
|
|
|
sections.append((sec[:int(len(sec) / 2)], "")) |
|
|
|
sections.append((sec[int(len(sec) / 2):], "")) |
|
|
|
if sec.strip().find("#") == 0: |
|
|
|
sections.append((sec, "")) |
|
|
|
elif sections and sections[-1][0].strip().find("#") == 0: |
|
|
|
sec_, _ = sections.pop(-1) |
|
|
|
sections.append((sec_ + "\n" + sec, "")) |
|
|
|
else: |
|
|
|
if sec.strip().find("#") == 0: |
|
|
|
sections.append((sec, "")) |
|
|
|
elif sections and sections[-1][0].strip().find("#") == 0: |
|
|
|
sec_, _ = sections.pop(-1) |
|
|
|
sections.append((sec_ + "\n" + sec, "")) |
|
|
|
else: |
|
|
|
sections.append((sec, "")) |
|
|
|
sections.append((sec, "")) |
|
|
|
for table in tables: |
|
|
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) |
|
|
|
return sections, tbls |