|
|
|
@@ -16,7 +16,7 @@ from io import BytesIO |
|
|
|
import re |
|
|
|
|
|
|
|
from deepdoc.parser.utils import get_text |
|
|
|
from rag.app import laws |
|
|
|
from rag.app import naive |
|
|
|
from rag.nlp import rag_tokenizer, tokenize |
|
|
|
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser |
|
|
|
|
|
|
|
@@ -67,7 +67,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, |
|
|
|
|
|
|
|
if re.search(r"\.docx$", filename, re.IGNORECASE): |
|
|
|
callback(0.1, "Start to parse.") |
|
|
|
sections = [txt for txt in laws.Docx()(filename, binary) if txt] |
|
|
|
sections, tbls = naive.Docx()(filename, binary) |
|
|
|
sections = [s for s, _ in sections if s] |
|
|
|
for (_, html), _ in tbls: |
|
|
|
sections.append(html) |
|
|
|
callback(0.8, "Finish parsing.") |
|
|
|
|
|
|
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE): |