| if "max_tokens" in gen_conf: | if "max_tokens" in gen_conf: | ||||
| gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count) | gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count) | ||||
| answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf) | answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf) | ||||
| stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer)) | |||||
| if knowledges: | if knowledges: | ||||
| answer = retrievaler.insert_citations(answer, | |||||
| answer, idx = retrievaler.insert_citations(answer, | |||||
| [ck["content_ltks"] for ck in kbinfos["chunks"]], | [ck["content_ltks"] for ck in kbinfos["chunks"]], | ||||
| [ck["vector"] for ck in kbinfos["chunks"]], | [ck["vector"] for ck in kbinfos["chunks"]], | ||||
| embd_mdl, | embd_mdl, | ||||
| tkweight=1 - dialog.vector_similarity_weight, | tkweight=1 - dialog.vector_similarity_weight, | ||||
| vtweight=dialog.vector_similarity_weight) | vtweight=dialog.vector_similarity_weight) | ||||
| idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx]) | |||||
| kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx] | |||||
| for c in kbinfos["chunks"]: | for c in kbinfos["chunks"]: | ||||
| if c.get("vector"): del c["vector"] | if c.get("vector"): del c["vector"] | ||||
| return {"answer": answer, "reference": kbinfos} | return {"answer": answer, "reference": kbinfos} |
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
-        for txt, img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
+        for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
-            tokenize(d, txt, ppt_parser.is_english)
+            d["page_num_int"] = [pn + 1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, eng)
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for pn, (txt, img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
             d["page_num_int"] = [pn + 1]
             d["top_int"] = [0]
-            d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1]))
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
             tokenize(d, txt, eng)
             res.append(d)
         return res
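Both branches now record a whole-page bounding box per chunk. A small sketch of what the position_int entry encodes, assuming the five-tuple layout is (page_number, left, right, top, bottom) and that img is a PIL image whose .size is (width, height); all values here are made up:

    from PIL import Image

    img = Image.new("RGB", (960, 540))   # hypothetical page rendering
    pn, from_page = 0, 2                 # first page of a run starting at page 2
    pn += from_page

    # One whole-page box: left=0, right=width, top=0, bottom=height.
    position = (pn + 1, 0, img.size[0], 0, img.size[1])
    assert position == (3, 0, 960, 0, 540)

Switching from .append(...) to a list assignment in the PDF branch also presumably sidesteps a KeyError, since the freshly copied doc dict has no position_int list to append to. The remaining hunks touch the retrieval side.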
| res += f" ##{c}$$" | res += f" ##{c}$$" | ||||
| seted.add(c) | seted.add(c) | ||||
| return res | |||||
| return res, seted | |||||
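Every call site of insert_citations now has to unpack a tuple. A toy reduction of the changed contract (the function name and inputs are hypothetical, not the real tokenized chunks):

    def tag_citations(sentences, cited_per_sentence):
        # Appends " ##<chunk index>$$" markers and reports the set of cited chunks.
        res, seted = "", set()
        for sent, cites in zip(sentences, cited_per_sentence):
            res += sent
            for c in cites:
                if c in seted:
                    continue
                res += f" ##{c}$$"
                seted.add(c)
        return res, seted

    text, cited = tag_citations(["A.", " B."], [[0], [0, 2]])
    assert text == "A. ##0$$ B. ##2$$" and cited == {0, 2}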
     def rerank(self, sres, query, tkweight=0.3,
                vtweight=0.7, cfield="content_ltks"):

             start_idx -= 1
             if start_idx >= 0:
                 continue
-            if len(ranks["chunks"]) == page_size:
+            if len(ranks["chunks"]) >= page_size:
                 if aggs:
                     continue
                 break
             if dnm not in ranks["doc_aggs"]:
                 ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
             ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
+        ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
+                             for k, v in sorted(ranks["doc_aggs"].items(), key=lambda x: x[1]["count"] * -1)]
         return ranks
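Two details here are easy to miss. First, >= is the safer stop condition: if the chunk list ever overshoots page_size, an == test never fires again and the loop keeps collecting. Second, the doc_aggs comprehension, previously commented out and leaving an empty list, is restored and sorts documents by hit count in descending order. A sketch with made-up values:

    page_size, chunks = 10, list(range(12))   # hypothetical overshoot past one page
    assert len(chunks) != page_size           # "==" would miss this and never break
    assert len(chunks) >= page_size           # ">=" still stops collecting

    doc_aggs = {"a.pdf": {"doc_id": "1", "count": 2},
                "b.pdf": {"doc_id": "2", "count": 5}}
    ranked = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
              for k, v in sorted(doc_aggs.items(), key=lambda x: x[1]["count"] * -1)]
    assert [d["doc_name"] for d in ranked] == ["b.pdf", "a.pdf"]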