
Feat: apply LLM to optimize citations. (#5935)

### What problem does this PR solve?

#5905

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Tag: v0.17.2
Committed by Kevin Hu, 7 months ago
Parent commit: caecaa7562

5 changed files with 77 additions and 17 deletions:
1. `agent/component/iterationitem.py` (+4, -0)
2. `agentic_reasoning/prompts.py` (+1, -0)
3. `api/db/services/dialog_service.py` (+25, -12)
4. `rag/prompts.py` (+45, -4)
5. `rag/utils/tavily_conn.py` (+2, -1)

#### `agent/component/iterationitem.py` (+4, -0)

```diff
         ans = parent.get_input()
         ans = parent._param.delimiter.join(ans["content"]) if "content" in ans else ""
         ans = [a.strip() for a in ans.split(parent._param.delimiter)]
+        if not ans:
+            self._idx = -1
+            return pd.DataFrame()
+
         df = pd.DataFrame([{"content": ans[self._idx]}])
         self._idx += 1
         if self._idx >= len(ans):
```
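The new guard prevents `ans[self._idx]` from raising an `IndexError` when the upstream component yields nothing. A minimal, self-contained sketch of the iteration behavior; the class and method names here are hypothetical, not the repo's actual component:

```python
import pandas as pd

# Sketch of the iteration logic above with the new empty-input guard.
class IterationItemSketch:
    def __init__(self, delimiter: str = ","):
        self.delimiter = delimiter
        self._idx = 0

    def next_item(self, raw: str) -> pd.DataFrame:
        items = [a.strip() for a in raw.split(self.delimiter)]
        items = [a for a in items if a]  # drop empty fragments
        if not items:                    # the new guard: nothing to iterate
            self._idx = -1               # -1 marks the iteration as finished
            return pd.DataFrame()        # empty frame instead of IndexError
        df = pd.DataFrame([{"content": items[self._idx]}])
        self._idx += 1
        if self._idx >= len(items):
            self._idx = -1
        return df
```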

#### `agentic_reasoning/prompts.py` (+1, -0)

```diff
         f"- You have a dataset to search, so you just provide a proper search query.\n"
         f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
         "- The language of query MUST be as the same as 'Question' or 'search result'.\n"
+        "- If no helpful information can be found, rewrite the search query to be less and precise keywords.\n"
         "- When done searching, continue your reasoning.\n\n"
         'Please answer the following question. You should think step by step to solve it.\n\n'
     )
```
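For context, the `BEGIN_SEARCH_QUERY`/`END_SEARCH_QUERY` markers in this prompt delimit a search request that the reasoning loop later parses out of the model's output. A sketch of that extraction; the marker string values below are assumptions, since the real constants live in `agentic_reasoning/prompts.py`:

```python
import re

# Assumed marker values; the repo defines the real BEGIN_SEARCH_QUERY and
# END_SEARCH_QUERY constants in agentic_reasoning/prompts.py.
BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
END_SEARCH_QUERY = "<|end_search_query|>"

def extract_search_query(llm_output: str) -> str | None:
    """Return the most recent dataset-search query requested by the model."""
    queries = re.findall(
        re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY),
        llm_output,
        flags=re.DOTALL,
    )
    return queries[-1].strip() if queries else None
```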

#### `api/db/services/dialog_service.py` (+25, -12)

```diff
 from rag.app.resume import forbidden_select_fields4resume
 from rag.app.tag import label_question
 from rag.nlp.search import index_name
-from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format
+from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format, \
+    citation_prompt
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.tavily_conn import Tavily

     gen_conf = dialog.llm_setting

     msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
+    prompt4citation = ""
+    if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
+        prompt4citation = citation_prompt()
     msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
                 for m in messages if m["role"] != "system"])
-    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
     assert len(msg) >= 2, f"message_fit_in has bug: {msg}"
     prompt = msg[0]["content"]

             think = ans[0] + "</think>"
             answer = ans[1]
         if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
-            answer, idx = retriever.insert_citations(answer,
-                                                     [ck["content_ltks"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     [ck["vector"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     embd_mdl,
-                                                     tkweight=1 - dialog.vector_similarity_weight,
-                                                     vtweight=dialog.vector_similarity_weight)
+            answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
+            if not re.search(r"##[0-9]+\$\$", answer):
+                answer, idx = retriever.insert_citations(answer,
+                                                         [ck["content_ltks"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         [ck["vector"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         embd_mdl,
+                                                         tkweight=1 - dialog.vector_similarity_weight,
+                                                         vtweight=dialog.vector_similarity_weight)
+            else:
+                idx = set([])
+                for r in re.finditer(r"##([0-9]+)\$\$", answer):
+                    i = int(r.group(1))
+                    if i < len(kbinfos["chunks"]):
+                        idx.add(i)
         idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
         recall_docs = [
             d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]

     if stream:
         last_ans = ""
         answer = ""
-        for ans in chat_mdl.chat_streamly(prompt, msg[1:], gen_conf):
+        for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
             if thought:
                 ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
             answer = ans
         yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
         yield decorate_answer(thought+answer)
     else:
-        answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
+        answer = chat_mdl.chat(prompt+prompt4citation, msg[1:], gen_conf)
         user_content = msg[-1].get("content", "[content not available]")
         logging.debug("User: {}|Assistant: {}".format(user_content, answer))
         res = decorate_answer(answer)
```
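This hunk is the heart of the PR: `prompt4citation` asks the model to emit `##i$$` citation markers itself, and the old embedding-based `retriever.insert_citations` is kept only as a fallback for answers that contain no valid markers. A simplified, self-contained sketch of the marker-parsing path:

```python
import re

def parse_llm_citations(answer: str, num_chunks: int) -> tuple[str, set[int], bool]:
    """Simplified sketch of the new citation path in dialog_service.py.

    Returns the cleaned answer, the cited chunk indices, and whether the
    embedding-based insert_citations() fallback is still needed.
    """
    # Drop literal '##i$$'/'##j$$' placeholders the model may copy verbatim
    # from the citation prompt's format description.
    answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
    if not re.search(r"##[0-9]+\$\$", answer):
        return answer, set(), True   # no markers at all -> fall back
    idx: set[int] = set()
    for m in re.finditer(r"##([0-9]+)\$\$", answer):
        i = int(m.group(1))
        if i < num_chunks:           # ignore IDs outside the retrieved chunks
            idx.add(i)
    return answer, idx, False

# e.g. parse_llm_citations("Doge is his favorite ##0$$ ##1$$.", 4)
# -> ("Doge is his favorite ##0$$ ##1$$.", {0, 1}, False)
```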

#### `rag/prompts.py` (+45, -4)

```diff
     docs = {d.id: d.meta_fields for d in docs}

     doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
-    for ck in kbinfos["chunks"][:chunks_num]:
-        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
+    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
+        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + f"ID: {i}\n" + ck["content_with_weight"])
         doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})

     knowledges = []
     for nm, cks_meta in doc2chunks.items():
-        txt = f"Document: {nm} \n"
+        txt = f"\nDocument: {nm} \n"
         for k, v in cks_meta["meta"].items():
             txt += f"{k}: {v}\n"
         txt += "Relevant fragments as following:\n"
         for i, chunk in enumerate(cks_meta["chunks"], 1):
-            txt += f"{i}. {chunk}\n"
+            txt += f"{chunk}\n"
         knowledges.append(txt)
     return knowledges


+def citation_prompt():
+    return """
+
+# Citation requirements:
+- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
+- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
+- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
+
+--- Example START ---
+<SYSTEM>: Here is the knowledge base:
+
+Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
+ID: 0
+The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
+
+Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+ID: 1
+Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
+
+Document: Causal effect of Elon Musk tweets on Dogecoin price
+ID: 2
+If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
+
+Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+ID: 3
+The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+
+The above is the knowledge base.
+
+<USER>: What's the Elon's view on dogecoin?
+
+<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
+Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
+Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+
+--- Example END ---
+
+"""


 def keyword_extraction(chat_mdl, content, topn=3):
     prompt = f"""
 Role: You're a text analyzer.
```
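The `kb_prompt` change replaces per-document numbering (`1.`, `2.`, ...) with a global `ID: i` header per chunk, so an ID the model cites maps directly back to `kbinfos["chunks"][i]`. A small sketch with made-up chunk data showing the resulting knowledge-base text:

```python
from collections import defaultdict

# Made-up chunks mirroring the shape of kbinfos["chunks"]; the point is that
# the ID is global across documents, so '##2$$' in an answer resolves to
# chunks[2] regardless of which document that chunk belongs to.
chunks = [
    {"docnm_kwd": "musk_crypto.md", "content_with_weight": "Dogecoin is still his favorite crypto."},
    {"docnm_kwd": "musk_crypto.md", "content_with_weight": "He advised against going all-in."},
    {"docnm_kwd": "doge_price.md", "content_with_weight": "His tweets move the Dogecoin price."},
]

doc2chunks = defaultdict(list)
for i, ck in enumerate(chunks):
    doc2chunks[ck["docnm_kwd"]].append(f"ID: {i}\n" + ck["content_with_weight"])

for nm, cks in doc2chunks.items():
    txt = f"\nDocument: {nm} \nRelevant fragments as following:\n"
    txt += "\n".join(cks) + "\n"
    print(txt)
```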

#### `rag/utils/tavily_conn.py` (+2, -1)

```diff
         try:
             response = self.tavily_client.search(
                 query=query,
-                search_depth="advanced"
+                search_depth="advanced",
+                max_results=6
             )
             return [{"url": res["url"], "title": res["title"], "content": res["content"], "score": res["score"]} for res in response["results"]]
         except Exception as e:
```
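For reference, a minimal usage sketch of the `tavily` client call this hunk modifies; the API key and query below are placeholders:

```python
from tavily import TavilyClient

client = TavilyClient(api_key="YOUR_TAVILY_API_KEY")  # placeholder key
response = client.search(
    query="RAGFlow LLM citations",  # placeholder query
    search_depth="advanced",
    max_results=6,                  # the new cap introduced by this diff
)
for res in response["results"]:
    print(res["score"], res["url"])
```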
