Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import re
  20. from collections import defaultdict
  21. import json_repair
  22. from api import settings
  23. from api.db import LLMType
  24. from rag.settings import TAG_FLD
  25. from rag.utils import encoder, num_tokens_from_string
  26. def chunks_format(reference):
  27. def get_value(d, k1, k2):
  28. return d.get(k1, d.get(k2))
  29. return [
  30. {
  31. "id": get_value(chunk, "chunk_id", "id"),
  32. "content": get_value(chunk, "content", "content_with_weight"),
  33. "document_id": get_value(chunk, "doc_id", "document_id"),
  34. "document_name": get_value(chunk, "docnm_kwd", "document_name"),
  35. "dataset_id": get_value(chunk, "kb_id", "dataset_id"),
  36. "image_id": get_value(chunk, "image_id", "img_id"),
  37. "positions": get_value(chunk, "positions", "position_int"),
  38. "url": chunk.get("url"),
  39. "similarity": chunk.get("similarity"),
  40. "vector_similarity": chunk.get("vector_similarity"),
  41. "term_similarity": chunk.get("term_similarity"),
  42. "doc_type": chunk.get("doc_type_kwd"),
  43. }
  44. for chunk in reference.get("chunks", [])
  45. ]
  46. def llm_id2llm_type(llm_id):
  47. from api.db.services.llm_service import TenantLLMService
  48. llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)
  49. llm_factories = settings.FACTORY_LLM_INFOS
  50. for llm_factory in llm_factories:
  51. for llm in llm_factory["llm"]:
  52. if llm_id == llm["llm_name"]:
  53. return llm["model_type"].strip(",")[-1]
  54. def message_fit_in(msg, max_length=4000):
  55. def count():
  56. nonlocal msg
  57. tks_cnts = []
  58. for m in msg:
  59. tks_cnts.append({"role": m["role"], "count": num_tokens_from_string(m["content"])})
  60. total = 0
  61. for m in tks_cnts:
  62. total += m["count"]
  63. return total
  64. c = count()
  65. if c < max_length:
  66. return c, msg
  67. msg_ = [m for m in msg if m["role"] == "system"]
  68. if len(msg) > 1:
  69. msg_.append(msg[-1])
  70. msg = msg_
  71. c = count()
  72. if c < max_length:
  73. return c, msg
  74. ll = num_tokens_from_string(msg_[0]["content"])
  75. ll2 = num_tokens_from_string(msg_[-1]["content"])
  76. if ll / (ll + ll2) > 0.8:
  77. m = msg_[0]["content"]
  78. m = encoder.decode(encoder.encode(m)[: max_length - ll2])
  79. msg[0]["content"] = m
  80. return max_length, msg
  81. m = msg_[-1]["content"]
  82. m = encoder.decode(encoder.encode(m)[: max_length - ll2])
  83. msg[-1]["content"] = m
  84. return max_length, msg
  85. def kb_prompt(kbinfos, max_tokens):
  86. from api.db.services.document_service import DocumentService
  87. knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
  88. used_token_count = 0
  89. chunks_num = 0
  90. for i, c in enumerate(knowledges):
  91. used_token_count += num_tokens_from_string(c)
  92. chunks_num += 1
  93. if max_tokens * 0.97 < used_token_count:
  94. knowledges = knowledges[:i]
  95. logging.warning(f"Not all the retrieval into prompt: {i + 1}/{len(knowledges)}")
  96. break
  97. docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
  98. docs = {d.id: d.meta_fields for d in docs}
  99. doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
  100. for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
  101. cnt = f"---\nID: {i}\n" + (f"URL: {ck['url']}\n" if "url" in ck else "")
  102. cnt += ck["content_with_weight"]
  103. doc2chunks[ck["docnm_kwd"]]["chunks"].append(cnt)
  104. doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
  105. knowledges = []
  106. for nm, cks_meta in doc2chunks.items():
  107. txt = f"\nDocument: {nm} \n"
  108. for k, v in cks_meta["meta"].items():
  109. txt += f"{k}: {v}\n"
  110. txt += "Relevant fragments as following:\n"
  111. for i, chunk in enumerate(cks_meta["chunks"], 1):
  112. txt += f"{chunk}\n"
  113. knowledges.append(txt)
  114. return knowledges
  115. def citation_prompt():
  116. return """
  117. # Citation requirements:
  118. - Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
  119. - Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
  120. - DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
  121. - DO NOT use standalone Document IDs (e.g., '#ID#').
  122. - Under NO circumstances any other citation styles or formats (e.g., '~~i==', '[i]', '(i)', etc.) be used.
  123. - Citations ALWAYS the '##i$$' format.
  124. - Any failure to adhere to the above rules, including but not limited to incorrect formatting, use of prohibited styles, or unsupported citations, will be considered a error, should skip adding Citation for this sentence.
  125. --- Example START ---
  126. <SYSTEM>: Here is the knowledge base:
  127. Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
  128. URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
  129. ID: 0
  130. The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
  131. Document: Elon Musk's Dogecoin tweet sparks social media frenzy
  132. ID: 1
  133. Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
  134. Document: Causal effect of Elon Musk tweets on Dogecoin price
  135. ID: 2
  136. If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
  137. Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
  138. ID: 3
  139. The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
  140. The above is the knowledge base.
  141. <USER>: What's the Elon's view on dogecoin?
  142. <ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
  143. Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
  144. Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
  145. --- Example END ---
  146. """
  147. def keyword_extraction(chat_mdl, content, topn=3):
  148. prompt = f"""
  149. Role: You're a text analyzer.
  150. Task: extract the most important keywords/phrases of a given piece of text content.
  151. Requirements:
  152. - Summarize the text content, and give top {topn} important keywords/phrases.
  153. - The keywords MUST be in language of the given piece of text content.
  154. - The keywords are delimited by ENGLISH COMMA.
  155. - Keywords ONLY in output.
  156. ### Text Content
  157. {content}
  158. """
  159. msg = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
  160. _, msg = message_fit_in(msg, chat_mdl.max_length)
  161. kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
  162. if isinstance(kwd, tuple):
  163. kwd = kwd[0]
  164. kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
  165. if kwd.find("**ERROR**") >= 0:
  166. return ""
  167. return kwd
  168. def question_proposal(chat_mdl, content, topn=3):
  169. prompt = f"""
  170. Role: You're a text analyzer.
  171. Task: propose {topn} questions about a given piece of text content.
  172. Requirements:
  173. - Understand and summarize the text content, and propose top {topn} important questions.
  174. - The questions SHOULD NOT have overlapping meanings.
  175. - The questions SHOULD cover the main content of the text as much as possible.
  176. - The questions MUST be in language of the given piece of text content.
  177. - One question per line.
  178. - Question ONLY in output.
  179. ### Text Content
  180. {content}
  181. """
  182. msg = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
  183. _, msg = message_fit_in(msg, chat_mdl.max_length)
  184. kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
  185. if isinstance(kwd, tuple):
  186. kwd = kwd[0]
  187. kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
  188. if kwd.find("**ERROR**") >= 0:
  189. return ""
  190. return kwd
def full_question(tenant_id, llm_id, messages, language=None):
    """Rewrite the latest user message into a standalone, context-free question.

    Builds a prompt from the user/assistant turns in *messages* and asks the
    tenant's model to produce a self-contained question, converting relative
    dates (e.g. "tomorrow") to absolute ISO dates based on today.

    :param tenant_id: tenant owning the LLM binding.
    :param llm_id: model id; image2text models get an IMAGE2TEXT bundle.
    :param messages: chat history, list of {"role": ..., "content": ...} dicts.
    :param language: when set, forces the output language.
    :return: the refined question, or the last message's content on model error.
    """
    from api.db.services.llm_service import LLMBundle

    # Bind the right model family for this llm_id.
    if llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
    else:
        chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
    # Flatten the history into "ROLE: content" lines; other roles are dropped.
    conv = []
    for m in messages:
        if m["role"] not in ["user", "assistant"]:
            continue
        conv.append("{}: {}".format(m["role"].upper(), m["content"]))
    conv = "\n".join(conv)
    # Anchor dates so the model can resolve relative references deterministically.
    today = datetime.date.today().isoformat()
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()
    tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
    prompt = f"""
Role: A helpful assistant
Task and steps:
1. Generate a full user question that would follow the conversation.
2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}.
Requirements & Restrictions:
- If the user's latest question is completely, don't do anything, just return the original question.
- DON'T generate anything except a refined question."""
    if language:
        prompt += f"""
- Text generated MUST be in {language}."""
    else:
        prompt += """
- Text generated MUST be in the same language of the original user's question.
"""
    prompt += f"""
######################
-Examples-
######################
# Example 1
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
###############
Output: What's the name of Donald Trump's mother?
------------
# Example 2
## Conversation
USER: What is the name of Donald Trump's father?
ASSISTANT: Fred Trump.
USER: And his mother?
ASSISTANT: Mary Trump.
User: What's her full name?
###############
Output: What's the full name of Donald Trump's mother Mary Trump?
------------
# Example 3
## Conversation
USER: What's the weather today in London?
ASSISTANT: Cloudy.
USER: What's about tomorrow in Rochester?
###############
Output: What's the weather in Rochester on {tomorrow}?
######################
# Real Data
## Conversation
{conv}
###############
"""
    ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2})
    # Strip any chain-of-thought block emitted by reasoning models.
    ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
    # On model error, fall back to the user's original last message.
    return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
  259. def cross_languages(tenant_id, llm_id, query, languages=[]):
  260. from api.db.services.llm_service import LLMBundle
  261. if llm_id and llm_id2llm_type(llm_id) == "image2text":
  262. chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
  263. else:
  264. chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
  265. sys_prompt = """
  266. Act as a streamlined multilingual translator. Strictly output translations separated by ### without any explanations or formatting. Follow these rules:
  267. 1. Accept batch translation requests in format:
  268. [source text]
  269. ===
  270. [target languages separated by commas]
  271. 2. Always maintain:
  272. - Original formatting (tables/lists/spacing)
  273. - Technical terminology accuracy
  274. - Cultural context appropriateness
  275. 3. Output format:
  276. [language1 translation]
  277. ###
  278. [language1 translation]
  279. **Examples:**
  280. Input:
  281. Hello World! Let's discuss AI safety.
  282. ===
  283. Chinese, French, Jappanese
  284. Output:
  285. 你好世界!让我们讨论人工智能安全问题。
  286. ###
  287. Bonjour le monde ! Parlons de la sécurité de l'IA.
  288. ###
  289. こんにちは世界!AIの安全性について話し合いましょう。
  290. """
  291. user_prompt=f"""
  292. Input:
  293. {query}
  294. ===
  295. {', '.join(languages)}
  296. Output:
  297. """
  298. ans = chat_mdl.chat(sys_prompt, [{"role": "user", "content": user_prompt}], {"temperature": 0.2})
  299. ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
  300. if ans.find("**ERROR**") >= 0:
  301. return query
  302. return "\n".join([a for a in re.sub(r"(^Output:|\n+)", "", ans, flags=re.DOTALL).split("===") if a.strip()])
  303. def content_tagging(chat_mdl, content, all_tags, examples, topn=3):
  304. prompt = f"""
  305. Role: You're a text analyzer.
  306. Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set.
  307. Steps::
  308. - Comprehend the tag/label set.
  309. - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON.
  310. - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score.
  311. Requirements
  312. - The tags MUST be from the tag set.
  313. - The output MUST be in JSON format only, the key is tag and the value is its relevance score.
  314. - The relevance score must be range from 1 to 10.
  315. - Keywords ONLY in output.
  316. # TAG SET
  317. {", ".join(all_tags)}
  318. """
  319. for i, ex in enumerate(examples):
  320. prompt += """
  321. # Examples {}
  322. ### Text Content
  323. {}
  324. Output:
  325. {}
  326. """.format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False))
  327. prompt += f"""
  328. # Real Data
  329. ### Text Content
  330. {content}
  331. """
  332. msg = [{"role": "system", "content": prompt}, {"role": "user", "content": "Output: "}]
  333. _, msg = message_fit_in(msg, chat_mdl.max_length)
  334. kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5})
  335. if isinstance(kwd, tuple):
  336. kwd = kwd[0]
  337. kwd = re.sub(r"^.*</think>", "", kwd, flags=re.DOTALL)
  338. if kwd.find("**ERROR**") >= 0:
  339. raise Exception(kwd)
  340. try:
  341. obj = json_repair.loads(kwd)
  342. except json_repair.JSONDecodeError:
  343. try:
  344. result = kwd.replace(prompt[:-1], "").replace("user", "").replace("model", "").strip()
  345. result = "{" + result.split("{")[1].split("}")[0] + "}"
  346. obj = json_repair.loads(result)
  347. except Exception as e:
  348. logging.exception(f"JSON parsing error: {result} -> {e}")
  349. raise e
  350. res = {}
  351. for k, v in obj.items():
  352. try:
  353. res[str(k)] = int(v)
  354. except Exception:
  355. pass
  356. return res
  357. def vision_llm_describe_prompt(page=None) -> str:
  358. prompt_en = """
  359. INSTRUCTION:
  360. Transcribe the content from the provided PDF page image into clean Markdown format.
  361. - Only output the content transcribed from the image.
  362. - Do NOT output this instruction or any other explanation.
  363. - If the content is missing or you do not understand the input, return an empty string.
  364. RULES:
  365. 1. Do NOT generate examples, demonstrations, or templates.
  366. 2. Do NOT output any extra text such as 'Example', 'Example Output', or similar.
  367. 3. Do NOT generate any tables, headings, or content that is not explicitly present in the image.
  368. 4. Transcribe content word-for-word. Do NOT modify, translate, or omit any content.
  369. 5. Do NOT explain Markdown or mention that you are using Markdown.
  370. 6. Do NOT wrap the output in ```markdown or ``` blocks.
  371. 7. Only apply Markdown structure to headings, paragraphs, lists, and tables, strictly based on the layout of the image. Do NOT create tables unless an actual table exists in the image.
  372. 8. Preserve the original language, information, and order exactly as shown in the image.
  373. """
  374. if page is not None:
  375. prompt_en += f"\nAt the end of the transcription, add the page divider: `--- Page {page} ---`."
  376. prompt_en += """
  377. FAILURE HANDLING:
  378. - If you do not detect valid content in the image, return an empty string.
  379. """
  380. return prompt_en
  381. def vision_llm_figure_describe_prompt() -> str:
  382. prompt = """
  383. You are an expert visual data analyst. Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
  384. Tasks:
  385. 1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
  386. 2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
  387. 3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
  388. 4. Analyze and explain any trends, comparisons, or patterns shown in the data.
  389. 5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
  390. 6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
  391. Output format (include only sections relevant to the image content):
  392. - Visual Type: [Type]
  393. - Title: [Title text, if available]
  394. - Axes / Legends / Labels: [Details, if available]
  395. - Data Points: [Extracted data]
  396. - Trends / Insights: [Analysis and interpretation]
  397. - Captions / Annotations: [Text and relevance, if available]
  398. Ensure high accuracy, clarity, and completeness in your analysis, and includes only the information present in the image. Avoid unnecessary statements about missing elements.
  399. """
  400. return prompt