| @@ -51,6 +51,7 @@ class TaskService(CommonService): | |||
| .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))\ | |||
| .where( | |||
| Document.status == StatusEnum.VALID.value, | |||
| Document.run == TaskStatus.RUNNING.value, | |||
| ~(Document.type == FileType.VIRTUAL.value), | |||
| cls.model.progress == 0, | |||
| cls.model.update_time >= tm, | |||
| @@ -42,7 +42,9 @@ class HuPptParser(object): | |||
| BytesIO(fnm)) | |||
| txts = [] | |||
| self.total_page = len(ppt.slides) | |||
| for i, slide in enumerate(ppt.slides[from_page: to_page]): | |||
| for i, slide in enumerate(ppt.slides): | |||
| if i < from_page: continue | |||
| if i >= to_page:break | |||
| texts = [] | |||
| for shape in slide.shapes: | |||
| txt = self.__extract(shape) | |||
| @@ -13,6 +13,9 @@ | |||
| import copy | |||
| import re | |||
| from io import BytesIO | |||
| from PIL import Image | |||
| from rag.nlp import tokenize, is_english | |||
| from rag.nlp import huqie | |||
| from deepdoc.parser import PdfParser, PptParser | |||
| @@ -30,7 +33,7 @@ class Ppt(PptParser): | |||
| for i, slide in enumerate(presentation.slides[from_page: to_page]): | |||
| buffered = BytesIO() | |||
| slide.get_thumbnail(0.5, 0.5).save(buffered, drawing.imaging.ImageFormat.jpeg) | |||
| imgs.append(buffered.getvalue()) | |||
| imgs.append(Image.open(buffered)) | |||
| assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)) | |||
| callback(0.9, "Image extraction finished") | |||
| self.is_english = is_english(txts) | |||
| @@ -58,12 +58,9 @@ class Excel(ExcelParser): | |||
| continue | |||
| data.append(row) | |||
| done += 1 | |||
| if done % 999 == 0: | |||
| callback(done * 0.6 / total, ("Extract records: {}".format(len(res)) + ( | |||
| f"{len(fails)} failure({sheetname}), line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| res.append(pd.DataFrame(np.array(data), columns=headers)) | |||
| callback(0.6, ("Extract records: {}. ".format(done) + ( | |||
| callback(0.3, ("Extract records: {}~{}".format(from_page+1, min(to_page, from_page+rn)) + ( | |||
| f"{len(fails)} failure, line: %s..." % (",".join(fails[:3])) if fails else ""))) | |||
| return res | |||
| @@ -151,7 +148,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese | |||
| headers = lines[0].split(kwargs.get("delimiter", "\t")) | |||
| rows = [] | |||
| for i, line in enumerate(lines[1:]): | |||
| if from_page < from_page:continue | |||
| if i < from_page:continue | |||
| if i >= to_page: break | |||
| row = [l for l in line.split(kwargs.get("delimiter", "\t"))] | |||
| if len(row) != len(headers): | |||
| @@ -191,12 +188,15 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, lang="Chinese | |||
| df[clmns[j]] = cln | |||
| if ty == "text": | |||
| txts.extend([str(c) for c in cln if c]) | |||
| clmns_map = [(py_clmns[j] + fieds_map[clmn_tys[j]], clmns[j]) | |||
| clmns_map = [(py_clmns[i] + fieds_map[clmn_tys[i]], clmns[i]) | |||
| for i in range(len(clmns))] | |||
| eng = lang.lower() == "english"#is_english(txts) | |||
| for ii, row in df.iterrows(): | |||
| d = {} | |||
| d = { | |||
| "docnm_kwd": filename, | |||
| "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) | |||
| } | |||
| row_txt = [] | |||
| for j in range(len(clmns)): | |||
| if row[clmns[j]] is None: | |||
| @@ -91,10 +91,10 @@ def dispatch(): | |||
| tsks.append(task) | |||
| elif r["parser_id"] == "table": | |||
| rn = HuExcelParser.row_number(r["name"], MINIO.get(r["kb_id"], r["location"])) | |||
| for i in range(0, rn, 1000): | |||
| for i in range(0, rn, 3000): | |||
| task = new_task() | |||
| task["from_page"] = i | |||
| task["to_page"] = min(i + 1000, rn) | |||
| task["to_page"] = min(i + 3000, rn) | |||
| tsks.append(task) | |||
| else: | |||
| tsks.append(new_task()) | |||
| @@ -128,8 +128,6 @@ def build(row): | |||
| return | |||
| callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks)) | |||
| docs = [] | |||
| doc = { | |||
| "doc_id": row["doc_id"], | |||
| @@ -179,8 +177,8 @@ def embedding(docs, mdl, parser_config={}, callback=None): | |||
| tk_count += c | |||
| cnts_ = np.array([]) | |||
| for i in range(0, len(cnts), 32): | |||
| vts, c = mdl.encode(cnts[i: i+32]) | |||
| for i in range(0, len(cnts), 8): | |||
| vts, c = mdl.encode(cnts[i: i+8]) | |||
| if len(cnts_) == 0: cnts_ = vts | |||
| else: cnts_ = np.concatenate((cnts_, vts), axis=0) | |||
| tk_count += c | |||
| @@ -226,6 +224,7 @@ def main(comm, mod): | |||
| continue | |||
| # TODO: exception handler | |||
| ## set_progress(r["did"], -1, "ERROR: ") | |||
| callback(msg="Finished slicing files(%d). Start to embedding the content."%len(cks)) | |||
| try: | |||
| tk_count = embedding(cks, embd_mdl, r["parser_config"], callback) | |||
| except Exception as e: | |||