浏览代码

Fix: cite dysfunction for G component. (#7117)

### What problem does this PR solve?

#7097

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.18.0
Kevin Hu 6 个月前
父节点
当前提交
487aed419e
没有帐户链接到提交者的电子邮件
共有 3 个文件被更改，包括 19 次插入和 11 次删除
  1. 5
    0
      agent/component/base.py
  2. 12
    10
      agent/component/generate.py
  3. 2
    1
      agent/component/retrieval.py

+ 5
- 0
agent/component/base.py 查看文件

"params": {} "params": {}
} }
""" """
out = getattr(self._param, self._param.output_var_name)
if isinstance(out, pd.DataFrame) and "chunks" in out:
del out["chunks"]
setattr(self._param, self._param.output_var_name, out)

return """{{ return """{{
"component_name": "{}", "component_name": "{}",
"params": {}, "params": {},

+ 12
- 10
agent/component/generate.py 查看文件

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import json
import re import re
from functools import partial from functools import partial
import pandas as pd import pandas as pd
return list(cpnts) return list(cpnts)


def set_cite(self, retrieval_res, answer): def set_cite(self, retrieval_res, answer):
retrieval_res = retrieval_res.dropna(subset=["vector", "content_ltks"]).reset_index(drop=True)
if "empty_response" in retrieval_res.columns: if "empty_response" in retrieval_res.columns:
retrieval_res["empty_response"].fillna("", inplace=True) retrieval_res["empty_response"].fillna("", inplace=True)
chunks = json.loads(retrieval_res["chunks"][0])
answer, idx = settings.retrievaler.insert_citations(answer, answer, idx = settings.retrievaler.insert_citations(answer,
[ck["content_ltks"] for _, ck in retrieval_res.iterrows()],
[ck["vector"] for _, ck in retrieval_res.iterrows()],
[ck["content_ltks"] for ck in chunks],
[ck["vector"] for ck in chunks],
LLMBundle(self._canvas.get_tenant_id(), LLMType.EMBEDDING, LLMBundle(self._canvas.get_tenant_id(), LLMType.EMBEDDING,
self._canvas.get_embedding_model()), tkweight=0.7, self._canvas.get_embedding_model()), tkweight=0.7,
vtweight=0.3) vtweight=0.3)
doc_ids = set([]) doc_ids = set([])
recall_docs = [] recall_docs = []
for i in idx: for i in idx:
did = retrieval_res.loc[int(i), "doc_id"]
did = chunks[int(i)]["doc_id"]
if did in doc_ids: if did in doc_ids:
continue continue
doc_ids.add(did) doc_ids.add(did)
recall_docs.append({"doc_id": did, "doc_name": retrieval_res.loc[int(i), "docnm_kwd"]})
recall_docs.append({"doc_id": did, "doc_name": chunks[int(i)]["docnm_kwd"]})


del retrieval_res["vector"]
del retrieval_res["content_ltks"]
for c in chunks:
del c["vector"]
del c["content_ltks"]


reference = { reference = {
"chunks": [ck.to_dict() for _, ck in retrieval_res.iterrows()],
"chunks": chunks,
"doc_aggs": recall_docs "doc_aggs": recall_docs
} }


ans = chat_mdl.chat(msg[0]["content"], msg[1:], self._param.gen_conf()) ans = chat_mdl.chat(msg[0]["content"], msg[1:], self._param.gen_conf())
ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL) ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)


if self._param.cite and "content_ltks" in retrieval_res.columns and "vector" in retrieval_res.columns:
if self._param.cite and "chunks" in retrieval_res.columns:
res = self.set_cite(retrieval_res, ans) res = self.set_cite(retrieval_res, ans)
return pd.DataFrame([res]) return pd.DataFrame([res])


answer = ans answer = ans
yield res yield res


if self._param.cite and "content_ltks" in retrieval_res.columns and "vector" in retrieval_res.columns:
if self._param.cite and "chunks" in retrieval_res.columns:
res = self.set_cite(retrieval_res, answer) res = self.set_cite(retrieval_res, answer)
yield res yield res



+ 2
- 1
agent/component/retrieval.py 查看文件

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import json
import logging import logging
from abc import ABC from abc import ABC


df["empty_response"] = self._param.empty_response df["empty_response"] = self._param.empty_response
return df return df


df = pd.DataFrame({"content": kb_prompt(kbinfos, 200000)})
df = pd.DataFrame({"content": kb_prompt(kbinfos, 200000), "chunks": json.dumps(kbinfos["chunks"])})
logging.debug("{} {}".format(query, df)) logging.debug("{} {}".format(query, df))
return df.dropna() return df.dropna()



正在加载...
取消
保存