@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs):
     if "max_tokens" in gen_conf:
         gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
     answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
+    stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer))

     if knowledges:
-        answer = retrievaler.insert_citations(answer,
+        answer, idx = retrievaler.insert_citations(answer,
                                               [ck["content_ltks"] for ck in kbinfos["chunks"]],
                                               [ck["vector"] for ck in kbinfos["chunks"]],
                                               embd_mdl,
                                               tkweight=1 - dialog.vector_similarity_weight,
                                               vtweight=dialog.vector_similarity_weight)
+        idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
+        kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
     for c in kbinfos["chunks"]:
         if c.get("vector"): del c["vector"]
     return {"answer": answer, "reference": kbinfos}
@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
-        for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
+        for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
-            tokenize(d, txt, ppt_parser.is_english)
+            d["page_num_int"] = [pn+1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, eng)
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1]))
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
             tokenize(d, txt, eng)
             res.append(d)
         return res
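Both branches now stamp each chunk with page and position metadata. The `position_int` tuple appears to read as `(page, left, right, top, bottom)` covering the whole page image, and assigning a fresh list (rather than appending) means `doc` no longer needs a pre-seeded `position_int` key. A self-contained sketch of the per-page metadata, with a hypothetical stand-in for the parser's PIL image:

```python
import copy
from collections import namedtuple

# Stand-in for the page image the parser yields; only .size is used here.
Img = namedtuple("Img", "size")

doc = {"docnm_kwd": "slides.pptx"}  # trimmed-down base doc
from_page, res = 0, []
for pn, img in enumerate([Img((960, 720)), Img((960, 720))]):
    d = copy.deepcopy(doc)
    pn += from_page
    d["image"] = img
    d["page_num_int"] = [pn + 1]  # 1-based page number
    d["top_int"] = [0]
    # (page, left, right, top, bottom): a box spanning the full page image
    d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
    res.append(d)

print([d["position_int"] for d in res])
# [[(1, 0, 960, 0, 720)], [(2, 0, 960, 0, 720)]]
```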
@@ -243,7 +243,7 @@ class Dealer:
                 res += f" ##{c}$$"
                 seted.add(c)

-        return res
+        return res, seted

     def rerank(self, sres, query, tkweight=0.3,
                vtweight=0.7, cfield="content_ltks"):
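`insert_citations` now returns the annotated answer together with the set of chunk indices it cited, which is exactly what the `chat` hunk above unpacks. A minimal sketch of the `##i$$` marker accumulation; `mark_citations`, `pieces`, and `cited` are hypothetical stand-ins for the method's internals:

```python
def mark_citations(pieces, cited):
    """Append a ##<chunk index>$$ marker after each answer piece and
    collect the indices actually cited, mirroring the new
    (res, seted) return shape of Dealer.insert_citations."""
    res, seted = "", set()
    for piece, chunk_ids in zip(pieces, cited):
        res += piece
        for c in chunk_ids:
            if c in seted:  # cite each chunk at most once
                continue
            res += f" ##{c}$$"
            seted.add(c)
    return res, seted

print(mark_citations(["A claim.", " Another."], [[0], [0, 2]]))
# ('A claim. ##0$$ Another. ##2$$', {0, 2})
```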
@@ -290,7 +290,7 @@ class Dealer:
             start_idx -= 1
             if start_idx >= 0:
                 continue
-            if len(ranks["chunks"]) == page_size:
+            if len(ranks["chunks"]) >= page_size:
                 if aggs:
                     continue
                 break
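Switching the stop condition from `==` to `>=` makes pagination robust if the chunk list ever steps past `page_size` instead of landing on it exactly; strict equality would then never fire and the loop would keep collecting. A hypothetical driver that makes the difference visible (the real loop appends one chunk per iteration, so this exaggerates the jump to show the failure mode; `>=` is simply the defensive choice):

```python
def take(scores, page_size, step, strict):
    """Collect items 'step' at a time until a page is full."""
    chunks = []
    for s in scores:
        full = len(chunks) == page_size if strict else len(chunks) >= page_size
        if full:
            break
        chunks.extend([s] * step)
    return chunks

# len(chunks) jumps 0 -> 2 -> 4, skipping over page_size == 3:
print(len(take([0.9, 0.8, 0.7], page_size=3, step=2, strict=True)))   # 6
print(len(take([0.9, 0.8, 0.7], page_size=3, step=2, strict=False)))  # 4
```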
@@ -322,7 +322,7 @@ class Dealer:
             if dnm not in ranks["doc_aggs"]:
                 ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
             ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
+        ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
         return ranks
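The previously commented-out flattening is restored: the `doc_aggs` dict of per-document hit counts is turned into a list sorted by count, most-cited document first. A runnable sketch of that line with hypothetical data:

```python
# ranks["doc_aggs"] before flattening: doc name -> id and hit count
doc_aggs = {
    "a.pdf": {"doc_id": "d1", "count": 1},
    "b.pdf": {"doc_id": "d2", "count": 3},
}
flat = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
        for k, v in sorted(doc_aggs.items(), key=lambda x: x[1]["count"] * -1)]
print(flat)
# [{'doc_name': 'b.pdf', 'doc_id': 'd2', 'count': 3},
#  {'doc_name': 'a.pdf', 'doc_id': 'd1', 'count': 1}]
```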