### What problem does this PR solve?

#5905

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
```diff
         ans = parent.get_input()
         ans = parent._param.delimiter.join(ans["content"]) if "content" in ans else ""
         ans = [a.strip() for a in ans.split(parent._param.delimiter)]
+        if not ans:
+            self._idx = -1
+            return pd.DataFrame()
         df = pd.DataFrame([{"content": ans[self._idx]}])
         self._idx += 1
         if self._idx >= len(ans):
```
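For context, here is a minimal, self-contained sketch of the pattern this guard protects: an iterator-style component that emits one delimited item per call and resets its cursor when there is nothing to iterate. The class and method names below are hypothetical stand-ins, not the project's actual component API; only the guard logic mirrors the diff.

```python
import pandas as pd

# Hypothetical stand-in for the iteration component above.
class SplitIterator:
    def __init__(self, delimiter="|"):
        self.delimiter = delimiter
        self._idx = 0

    def next_item(self, raw: str) -> pd.DataFrame:
        items = [a.strip() for a in raw.split(self.delimiter)] if raw else []
        if not items:
            self._idx = -1          # mark the iteration as finished
            return pd.DataFrame()   # empty frame instead of an IndexError
        df = pd.DataFrame([{"content": items[self._idx]}])
        self._idx += 1
        if self._idx >= len(items):
            self._idx = -1          # wrap up after the last item
        return df

it = SplitIterator()
print(it.next_item("a|b"))  # one row: content == "a"
print(it.next_item(""))     # empty DataFrame, no crash
```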
| f"- You have a dataset to search, so you just provide a proper search query.\n" | f"- You have a dataset to search, so you just provide a proper search query.\n" | ||||
| f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n" | f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n" | ||||
| "- The language of query MUST be as the same as 'Question' or 'search result'.\n" | "- The language of query MUST be as the same as 'Question' or 'search result'.\n" | ||||
| "- If no helpful information can be found, rewrite the search query to be less and precise keywords.\n" | |||||
| "- When done searching, continue your reasoning.\n\n" | "- When done searching, continue your reasoning.\n\n" | ||||
| 'Please answer the following question. You should think step by step to solve it.\n\n' | 'Please answer the following question. You should think step by step to solve it.\n\n' | ||||
| ) | ) |
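The prompt above defines a small marker protocol: the model wraps its search request between `BEGIN_SEARCH_QUERY` and `END_SEARCH_QUERY`. A standalone sketch of how such a request can be pulled out of the model output follows; the literal marker values here are assumptions, not necessarily the project's actual constants.

```python
import re
from typing import Optional

# Assumed marker values; the real constants are imported elsewhere in the project.
BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
END_SEARCH_QUERY = "<|end_search_query|>"

def extract_search_query(model_output: str) -> Optional[str]:
    """Hypothetical helper: return the latest search query the model requested."""
    pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY)
    matches = re.findall(pattern, model_output, flags=re.DOTALL)
    return matches[-1].strip() if matches else None

out = f"Let me check. {BEGIN_SEARCH_QUERY}dogecoin musk tweet{END_SEARCH_QUERY}"
print(extract_search_query(out))  # -> "dogecoin musk tweet"
```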
```diff
 from rag.app.resume import forbidden_select_fields4resume
 from rag.app.tag import label_question
 from rag.nlp.search import index_name
-from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format
+from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format, \
+    citation_prompt
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.tavily_conn import Tavily
```
```diff
     gen_conf = dialog.llm_setting
     msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
+    prompt4citation = ""
+    if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
+        prompt4citation = citation_prompt()
     msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
                 for m in messages if m["role"] != "system"])
-    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
     assert len(msg) >= 2, f"message_fit_in has bug: {msg}"
     prompt = msg[0]["content"]
```
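Lowering the budget from 97% to 95% of `max_tokens` leaves headroom for the citation instructions now appended to the system prompt. For intuition, a simplified sketch of the kind of trimming a fit-in helper performs; the token counter and the drop-oldest strategy below are assumptions, not the project's actual `message_fit_in` implementation.

```python
# Simplified, assumed sketch of fitting a message list into a token budget.
def count_tokens(text: str) -> int:
    return max(1, len(text) // 4)  # crude heuristic: ~4 characters per token

def fit_in(msg, budget):
    used = sum(count_tokens(m["content"]) for m in msg)
    # Keep the system prompt (msg[0]) and the final user turn; drop middle history first.
    while used > budget and len(msg) > 2:
        dropped = msg.pop(1)
        used -= count_tokens(dropped["content"])
    return used, msg
```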
```diff
             think = ans[0] + "</think>"
             answer = ans[1]
         if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
-            answer, idx = retriever.insert_citations(answer,
-                                                     [ck["content_ltks"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     [ck["vector"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     embd_mdl,
-                                                     tkweight=1 - dialog.vector_similarity_weight,
-                                                     vtweight=dialog.vector_similarity_weight)
+            answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
+            if not re.search(r"##[0-9]+\$\$", answer):
+                answer, idx = retriever.insert_citations(answer,
+                                                         [ck["content_ltks"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         [ck["vector"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         embd_mdl,
+                                                         tkweight=1 - dialog.vector_similarity_weight,
+                                                         vtweight=dialog.vector_similarity_weight)
+            else:
+                idx = set([])
+                for r in re.finditer(r"##([0-9]+)\$\$", answer):
+                    i = int(r.group(1))
+                    if i < len(kbinfos["chunks"]):
+                        idx.add(i)
             idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
             recall_docs = [
                 d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
```
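The new flow first strips any literal `##i$$`/`##j$$` placeholders the model may have copied verbatim from the instructions, then keeps the model's own `##N$$` citations when present and only falls back to similarity-based `insert_citations` when there are none. A standalone sketch of that parsing step (the answer string and chunk list are made up):

```python
import re

answer = "As instructed ##i$$, Musk likes Dogecoin ##0$$ ##3$$. Beware over-investing ##12$$."
chunks = ["chunk-a", "chunk-b", "chunk-c", "chunk-d"]  # 4 retrieved chunks

# Remove literal "##i$$"/"##j$$" placeholders copied from the prompt, if any.
answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)

if not re.search(r"##[0-9]+\$\$", answer):
    print("no model citations; would fall back to insert_citations()")
else:
    idx = set()
    for m in re.finditer(r"##([0-9]+)\$\$", answer):
        i = int(m.group(1))
        if i < len(chunks):   # ignore out-of-range IDs such as ##12$$
            idx.add(i)
    print(sorted(idx))        # -> [0, 3]
```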
```diff
     if stream:
         last_ans = ""
         answer = ""
-        for ans in chat_mdl.chat_streamly(prompt, msg[1:], gen_conf):
+        for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
             if thought:
                 ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
             answer = ans
             yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
         yield decorate_answer(thought+answer)
     else:
-        answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
+        answer = chat_mdl.chat(prompt+prompt4citation, msg[1:], gen_conf)
         user_content = msg[-1].get("content", "[content not available]")
         logging.debug("User: {}|Assistant: {}".format(user_content, answer))
         res = decorate_answer(answer)
```
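The streaming branch relies on a delta between successive snapshots (`last_ans` / `delta_ans`); the lines computing that delta fall outside this hunk. A simplified sketch of the pattern, with a fake stream standing in for `chat_mdl.chat_streamly`:

```python
def fake_stream():
    # Stand-in for chat_mdl.chat_streamly(...): yields growing snapshots of the answer.
    for snapshot in ["Musk", "Musk likes", "Musk likes Dogecoin ##0$$."]:
        yield snapshot

last_ans = ""
for ans in fake_stream():
    delta_ans = ans[len(last_ans):]  # only the newly generated suffix
    last_ans = ans
    print(repr(delta_ans))           # what would be handed to tts(tts_mdl, delta_ans)
```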
```diff
     docs = {d.id: d.meta_fields for d in docs}
     doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
-    for ck in kbinfos["chunks"][:chunks_num]:
-        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
+    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
+        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + f"ID: {i}\n" + ck["content_with_weight"])
         doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
     knowledges = []
     for nm, cks_meta in doc2chunks.items():
-        txt = f"Document: {nm} \n"
+        txt = f"\nDocument: {nm} \n"
         for k, v in cks_meta["meta"].items():
             txt += f"{k}: {v}\n"
         txt += "Relevant fragments are as follows:\n"
         for i, chunk in enumerate(cks_meta["chunks"], 1):
-            txt += f"{i}. {chunk}\n"
+            txt += f"{chunk}\n"
         knowledges.append(txt)
     return knowledges
```
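With this change, each chunk carries the global `ID: {i}` that the citation prompt refers to, instead of a per-document ordinal. An illustrative rendering of one `kb_prompt()` entry; the document name and meta field are invented, and the URL is taken from the example below.

```python
# Illustrative only: roughly what one knowledge-base entry looks like after this change.
sample = (
    "\nDocument: elon_musk_crypto.html \n"
    "source: Blockworks\n"                    # meta fields, if the document has any
    "Relevant fragments are as follows:\n"
    "URL: https://blockworks.co/news/elon-musk-crypto-dogecoin\n"
    "ID: 0\n"
    "The Tesla co-founder advised against going all-in on dogecoin...\n"
)
print(sample)
```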
```diff
+def citation_prompt():
+    return """
+# Citation requirements:
+- Insert CITATIONS in the format '##i$$ ##j$$', where i and j are the IDs of the content you are citing, encapsulated with '##' and '$$'.
+- Insert the CITATION symbols at the end of a sentence, and use NO MORE than 4 citations.
+- DO NOT insert CITATIONS in the answer if the content is not from retrieved chunks.
+
+--- Example START ---
+<SYSTEM>: Here is the knowledge base:
+
+Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
+ID: 0
+The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
+
+Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+ID: 1
+Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
+
+Document: Causal effect of Elon Musk tweets on Dogecoin price
+ID: 2
+If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
+
+Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+ID: 3
+The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+
+The above is the knowledge base.
+
+<USER>: What's Elon's view on dogecoin?
+
+<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
+Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
+Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+--- Example END ---
+"""
```
```diff
 def keyword_extraction(chat_mdl, content, topn=3):
     prompt = f"""
 Role: You're a text analyzer.
```
```diff
         try:
             response = self.tavily_client.search(
                 query=query,
-                search_depth="advanced"
+                search_depth="advanced",
+                max_results=6
             )
             return [{"url": res["url"], "title": res["title"], "content": res["content"], "score": res["score"]} for res in response["results"]]
         except Exception as e:
```
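For reference, a minimal standalone call against the `tavily-python` client with the same parameters; the API key is a placeholder and error handling is trimmed.

```python
from tavily import TavilyClient

# Standalone sketch of the same search call; "tvly-YOUR-KEY" is a placeholder.
client = TavilyClient(api_key="tvly-YOUR-KEY")
response = client.search(
    query="Elon Musk Dogecoin",
    search_depth="advanced",  # deeper retrieval than the default "basic"
    max_results=6,            # cap the result count, mirroring the change above
)
for res in response["results"]:
    print(res["score"], res["title"], res["url"])
```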