@@ -224,6 +224,8 @@ class Dealer:
     def insert_citations(self, answer, chunks, chunk_v,
                          embd_mdl, tkweight=0.1, vtweight=0.9):
         assert len(chunks) == len(chunk_v)
+        if not chunks:
+            return answer, set([])
         pieces = re.split(r"(```)", answer)
         if len(pieces) >= 3:
             i = 0
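Context for the two added lines: with an empty retrieval result, the method previously fell through to the embedding/matching logic and could only fail later (e.g. when indexing `chunk_v[0]`). A minimal sketch of the behavior the guard establishes — the stub below is hypothetical, not ragflow code:

```python
# Hypothetical stand-in mirroring only the added guard, to show its effect.
def insert_citations_stub(answer, chunks, chunk_v):
    assert len(chunks) == len(chunk_v)
    if not chunks:
        # Nothing to cite against: return the answer untouched with an
        # empty citation set instead of reaching chunk_v[0] and crashing.
        return answer, set()
    # ... the real method tokenizes, embeds, and matches spans here ...
    return answer, {0}

ans, cited = insert_citations_stub("Batching doubled throughput.", [], [])
assert ans == "Batching doubled throughput." and cited == set()
```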
@@ -263,7 +265,7 @@ class Dealer:
         ans_v, _ = embd_mdl.encode(pieces_)
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
-            len(ans_v[0]), len(chunk_v[0]))
+            len(ans_v[0]), len(chunk_v[0]))

         chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
                       for ck in chunks]
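The assert above matters because citation matching relies on cosine similarity between answer pieces and chunk vectors, which is only meaningful when both were produced by the same embedding model. A small illustration with made-up dimensions:

```python
# Made-up vectors: answer pieces embedded at 768 dims, chunks indexed at 1024.
ans_v = [[0.1] * 768]
chunk_v = [[0.2] * 1024]
try:
    assert len(ans_v[0]) == len(chunk_v[0]), \
        "The dimension of query and chunk do not match: {} vs. {}".format(
            len(ans_v[0]), len(chunk_v[0]))
except AssertionError as e:
    print(e)  # The dimension of query and chunk do not match: 768 vs. 1024
```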
@@ -360,29 +362,33 @@ class Dealer:
         ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
         if not question:
             return ranks
-        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size,
+        RERANK_PAGE_LIMIT = 3
+        req = {"kb_ids": kb_ids, "doc_ids": doc_ids, "size": page_size*RERANK_PAGE_LIMIT,
                "question": question, "vector": True, "topk": top,
                "similarity": similarity_threshold,
                "available_int": 1}
+        if page > RERANK_PAGE_LIMIT:
+            req["page"] = page
+            req["size"] = page_size
         sres = self.search(req, index_name(tenant_id), embd_mdl, highlight)
+        ranks["total"] = sres.total

-        if rerank_mdl:
-            sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
-                                                   sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
+        if page <= RERANK_PAGE_LIMIT:
+            if rerank_mdl:
+                sim, tsim, vsim = self.rerank_by_model(rerank_mdl,
+                                                       sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
+            else:
+                sim, tsim, vsim = self.rerank(
+                    sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
+            idx = np.argsort(sim * -1)[(page-1)*page_size:page*page_size]
         else:
-            sim, tsim, vsim = self.rerank(
-                sres, question, 1 - vector_similarity_weight, vector_similarity_weight)
-        idx = np.argsort(sim * -1)
+            sim = tsim = vsim = [1]*len(sres.ids)
+            idx = list(range(len(sres.ids)))

         dim = len(sres.query_vector)
-        start_idx = (page - 1) * page_size
         for i in idx:
             if sim[i] < similarity_threshold:
                 break
-            ranks["total"] += 1
-            start_idx -= 1
-            if start_idx >= 0:
-                continue
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
                     continue
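This hunk splits pagination into two regimes: pages up to `RERANK_PAGE_LIMIT` are served from a single oversized candidate pool (`page_size * RERANK_PAGE_LIMIT`) that is reranked once and sliced locally, while deeper pages skip reranking (`sim` becomes all ones) and delegate paging to the search backend via `req["page"]`. A minimal sketch of the request-shaping arithmetic, with illustrative numbers:

```python
# Illustrative only: mirrors how the patch shapes the search request.
RERANK_PAGE_LIMIT = 3
page_size = 10

def request_params(page):
    if page <= RERANK_PAGE_LIMIT:
        # One pool covers all rerankable pages; the reranked order is then
        # sliced locally: idx[(page - 1) * page_size : page * page_size].
        return {"size": page_size * RERANK_PAGE_LIMIT}
    # Beyond the limit the backend paginates and rerank is skipped.
    return {"page": page, "size": page_size}

print(request_params(2))  # {'size': 30} -> rerank, keep slice [10:20]
print(request_params(5))  # {'page': 5, 'size': 10} -> engine order as-is
```

Note also that `ranks["total"]` now comes from `sres.total` rather than being recounted inside the loop, and the old `start_idx` skip-ahead bookkeeping is gone because slicing `idx` already lands on the requested page.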
@@ -406,7 +412,10 @@ class Dealer:
                 "positions": sres.field[id].get("position_int", "").split("\t")
             }
             if highlight:
-                d["highlight"] = rmSpace(sres.highlight[id])
+                if id in sres.highlight:
+                    d["highlight"] = rmSpace(sres.highlight[id])
+                else:
+                    d["highlight"] = d["content_with_weight"]
             if len(d["positions"]) % 5 == 0:
                 poss = []
                 for i in range(0, len(d["positions"]), 5):
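The highlight hunk stops indexing `sres.highlight` with an id it may not contain: the backend only returns a highlight fragment for chunks whose analyzed text actually matched the query, so the patch checks membership and falls back to the chunk's own content. A sketch with plain dicts standing in for `sres` (hypothetical data):

```python
# Plain dicts standing in for sres.highlight and the chunk contents.
highlight = {"chunk-1": "<em>batch</em> size"}
chunks = {"chunk-1": "batch size", "chunk-2": "warmup steps"}

for cid, content in chunks.items():
    if cid in highlight:
        snippet = highlight[cid]   # engine-provided highlighted fragment
    else:
        snippet = content          # fall back to the raw chunk text
    print(cid, "->", snippet)
# chunk-1 -> <em>batch</em> size
# chunk-2 -> warmup steps   (the old code raised a missing-key error here)
```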