|
|
|
@@ -12,6 +12,7 @@ |
|
|
|
# |
|
|
|
import logging |
|
|
|
import re |
|
|
|
import csv |
|
|
|
from copy import deepcopy |
|
|
|
from io import BytesIO |
|
|
|
from timeit import default_timer as timer |
|
|
|
@@ -25,7 +26,6 @@ from docx import Document |
|
|
|
from PIL import Image |
|
|
|
from markdown import markdown |
|
|
|
|
|
|
|
|
|
|
|
class Excel(ExcelParser): |
|
|
|
def __call__(self, fnm, binary=None, callback=None): |
|
|
|
if not binary: |
|
|
|
@@ -320,7 +320,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): |
|
|
|
res.append(beAdoc(deepcopy(doc), q, a, eng)) |
|
|
|
return res |
|
|
|
|
|
|
|
elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE): |
|
|
|
elif re.search(r"\.(txt)$", filename, re.IGNORECASE): |
|
|
|
callback(0.1, "Start to parse.") |
|
|
|
txt = get_text(filename, binary) |
|
|
|
lines = txt.split("\n") |
|
|
|
@@ -359,6 +359,38 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): |
|
|
|
|
|
|
|
return res |
|
|
|
|
|
|
|
elif re.search(r"\.(csv)$", filename, re.IGNORECASE): |
|
|
|
callback(0.1, "Start to parse.") |
|
|
|
txt = get_text(filename, binary) |
|
|
|
lines = txt.split("\n") |
|
|
|
delimiter = "\t" if any("\t" in line for line in lines) else "," |
|
|
|
|
|
|
|
fails = [] |
|
|
|
question, answer = "", "" |
|
|
|
res = [] |
|
|
|
reader = csv.reader(lines, delimiter=delimiter) |
|
|
|
|
|
|
|
for i, row in enumerate(reader): |
|
|
|
if len(row) != 2: |
|
|
|
if question: |
|
|
|
answer += "\n" + lines[i] |
|
|
|
else: |
|
|
|
fails.append(str(i + 1)) |
|
|
|
elif len(row) == 2: |
|
|
|
if question and answer: |
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng)) |
|
|
|
question, answer = row |
|
|
|
if len(res) % 999 == 0: |
|
|
|
callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}".format(len(res)) + ( |
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) |
|
|
|
|
|
|
|
if question: |
|
|
|
res.append(beAdoc(deepcopy(doc), question, answer, eng)) |
|
|
|
|
|
|
|
callback(0.6, ("Extract Q&A: {}".format(len(res)) + ( |
|
|
|
f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) |
|
|
|
return res |
|
|
|
|
|
|
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE): |
|
|
|
callback(0.1, "Start to parse.") |
|
|
|
pdf_parser = Pdf() |