Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

cv_model.py 29KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import json
  18. import os
  19. from abc import ABC
  20. from copy import deepcopy
  21. from io import BytesIO
  22. from urllib.parse import urljoin
  23. import requests
  24. from openai import OpenAI
  25. from openai.lib.azure import AzureOpenAI
  26. from zhipuai import ZhipuAI
  27. from rag.nlp import is_english
  28. from rag.prompts import vision_llm_describe_prompt
  29. from rag.utils import num_tokens_from_string
  30. class Base(ABC):
  31. def __init__(self, **kwargs):
  32. # Configure retry parameters
  33. self.max_retries = kwargs.get("max_retries", int(os.environ.get("LLM_MAX_RETRIES", 5)))
  34. self.base_delay = kwargs.get("retry_interval", float(os.environ.get("LLM_BASE_DELAY", 2.0)))
  35. self.max_rounds = kwargs.get("max_rounds", 5)
  36. self.is_tools = False
  37. self.tools = []
  38. self.toolcall_sessions = {}
  39. def describe(self, image):
  40. raise NotImplementedError("Please implement encode method!")
  41. def describe_with_prompt(self, image, prompt=None):
  42. raise NotImplementedError("Please implement encode method!")
  43. def _form_history(self, system, history, images=[]):
  44. hist = []
  45. if system:
  46. hist.append({"role": "system", "content": system})
  47. for h in history:
  48. if images and h["role"] == "user":
  49. h["content"] = self._image_prompt(h["content"], images)
  50. images = []
  51. hist.append(h)
  52. return hist
  53. def _image_prompt(self, text, images):
  54. if not images:
  55. return text
  56. pmpt = [{"type": "text", "text": text}]
  57. for img in images:
  58. pmpt.append({
  59. "type": "image_url",
  60. "image_url": {
  61. "url": f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img
  62. }
  63. })
  64. return pmpt
  65. def chat(self, system, history, gen_conf, images=[], **kwargs):
  66. try:
  67. response = self.client.chat.completions.create(
  68. model=self.model_name,
  69. messages=self._form_history(system, history, images)
  70. )
  71. return response.choices[0].message.content.strip(), response.usage.total_tokens
  72. except Exception as e:
  73. return "**ERROR**: " + str(e), 0
  74. def chat_streamly(self, system, history, gen_conf, images=[], **kwargs):
  75. ans = ""
  76. tk_count = 0
  77. try:
  78. response = self.client.chat.completions.create(
  79. model=self.model_name,
  80. messages=self._form_history(system, history, images),
  81. stream=True
  82. )
  83. for resp in response:
  84. if not resp.choices[0].delta.content:
  85. continue
  86. delta = resp.choices[0].delta.content
  87. ans = delta
  88. if resp.choices[0].finish_reason == "length":
  89. ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
  90. if resp.choices[0].finish_reason == "stop":
  91. tk_count += resp.usage.total_tokens
  92. yield ans
  93. except Exception as e:
  94. yield ans + "\n**ERROR**: " + str(e)
  95. yield tk_count
  96. @staticmethod
  97. def image2base64(image):
  98. if isinstance(image, bytes):
  99. return base64.b64encode(image).decode("utf-8")
  100. if isinstance(image, BytesIO):
  101. return base64.b64encode(image.getvalue()).decode("utf-8")
  102. buffered = BytesIO()
  103. try:
  104. image.save(buffered, format="JPEG")
  105. except Exception:
  106. image.save(buffered, format="PNG")
  107. return base64.b64encode(buffered.getvalue()).decode("utf-8")
  108. def prompt(self, b64):
  109. return [
  110. {
  111. "role": "user",
  112. "content": self._image_prompt(
  113. "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
  114. if self.lang.lower() == "chinese"
  115. else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out.",
  116. b64
  117. )
  118. }
  119. ]
  120. def vision_llm_prompt(self, b64, prompt=None):
  121. return [
  122. {
  123. "role": "user",
  124. "content": self._image_prompt(prompt if prompt else vision_llm_describe_prompt(), b64)
  125. }
  126. ]
  127. class GptV4(Base):
  128. _FACTORY_NAME = "OpenAI"
  129. def __init__(self, key, model_name="gpt-4-vision-preview", lang="Chinese", base_url="https://api.openai.com/v1", **kwargs):
  130. if not base_url:
  131. base_url = "https://api.openai.com/v1"
  132. self.client = OpenAI(api_key=key, base_url=base_url)
  133. self.model_name = model_name
  134. self.lang = lang
  135. super().__init__(**kwargs)
  136. def describe(self, image):
  137. b64 = self.image2base64(image)
  138. res = self.client.chat.completions.create(
  139. model=self.model_name,
  140. messages=self.prompt(b64),
  141. )
  142. return res.choices[0].message.content.strip(), res.usage.total_tokens
  143. def describe_with_prompt(self, image, prompt=None):
  144. b64 = self.image2base64(image)
  145. res = self.client.chat.completions.create(
  146. model=self.model_name,
  147. messages=self.vision_llm_prompt(b64, prompt),
  148. )
  149. return res.choices[0].message.content.strip(), res.usage.total_tokens
  150. class AzureGptV4(GptV4):
  151. _FACTORY_NAME = "Azure-OpenAI"
  152. def __init__(self, key, model_name, lang="Chinese", **kwargs):
  153. api_key = json.loads(key).get("api_key", "")
  154. api_version = json.loads(key).get("api_version", "2024-02-01")
  155. self.client = AzureOpenAI(api_key=api_key, azure_endpoint=kwargs["base_url"], api_version=api_version)
  156. self.model_name = model_name
  157. self.lang = lang
  158. Base.__init__(self, **kwargs)
  159. class xAICV(GptV4):
  160. _FACTORY_NAME = "xAI"
  161. def __init__(self, key, model_name="grok-3", lang="Chinese", base_url=None, **kwargs):
  162. if not base_url:
  163. base_url = "https://api.x.ai/v1"
  164. super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
  165. class QWenCV(GptV4):
  166. _FACTORY_NAME = "Tongyi-Qianwen"
  167. def __init__(self, key, model_name="qwen-vl-chat-v1", lang="Chinese", base_url=None, **kwargs):
  168. if not base_url:
  169. base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
  170. super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
  171. class HunyuanCV(GptV4):
  172. _FACTORY_NAME = "Tencent Hunyuan"
  173. def __init__(self, key, model_name, lang="Chinese", base_url=None, **kwargs):
  174. if not base_url:
  175. base_url = "https://api.hunyuan.cloud.tencent.com/v1"
  176. super().__init__(key, model_name, lang=lang, base_url=base_url, **kwargs)
  177. class Zhipu4V(GptV4):
  178. _FACTORY_NAME = "ZHIPU-AI"
  179. def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
  180. self.client = ZhipuAI(api_key=key)
  181. self.model_name = model_name
  182. self.lang = lang
  183. Base.__init__(self, **kwargs)
  184. class StepFunCV(GptV4):
  185. _FACTORY_NAME = "StepFun"
  186. def __init__(self, key, model_name="step-1v-8k", lang="Chinese", base_url="https://api.stepfun.com/v1", **kwargs):
  187. if not base_url:
  188. base_url = "https://api.stepfun.com/v1"
  189. self.client = OpenAI(api_key=key, base_url=base_url)
  190. self.model_name = model_name
  191. self.lang = lang
  192. Base.__init__(self, **kwargs)
  193. class LmStudioCV(GptV4):
  194. _FACTORY_NAME = "LM-Studio"
  195. def __init__(self, key, model_name, lang="Chinese", base_url="", **kwargs):
  196. if not base_url:
  197. raise ValueError("Local llm url cannot be None")
  198. base_url = urljoin(base_url, "v1")
  199. self.client = OpenAI(api_key="lm-studio", base_url=base_url)
  200. self.model_name = model_name
  201. self.lang = lang
  202. Base.__init__(self, **kwargs)
  203. class OpenAI_APICV(GptV4):
  204. _FACTORY_NAME = ["VLLM", "OpenAI-API-Compatible"]
  205. def __init__(self, key, model_name, lang="Chinese", base_url="", **kwargs):
  206. if not base_url:
  207. raise ValueError("url cannot be None")
  208. base_url = urljoin(base_url, "v1")
  209. self.client = OpenAI(api_key=key, base_url=base_url)
  210. self.model_name = model_name.split("___")[0]
  211. self.lang = lang
  212. Base.__init__(self, **kwargs)
  213. class TogetherAICV(GptV4):
  214. _FACTORY_NAME = "TogetherAI"
  215. def __init__(self, key, model_name, lang="Chinese", base_url="https://api.together.xyz/v1", **kwargs):
  216. if not base_url:
  217. base_url = "https://api.together.xyz/v1"
  218. super().__init__(key, model_name, lang, base_url, **kwargs)
  219. class YiCV(GptV4):
  220. _FACTORY_NAME = "01.AI"
  221. def __init__(
  222. self,
  223. key,
  224. model_name,
  225. lang="Chinese",
  226. base_url="https://api.lingyiwanwu.com/v1", **kwargs
  227. ):
  228. if not base_url:
  229. base_url = "https://api.lingyiwanwu.com/v1"
  230. super().__init__(key, model_name, lang, base_url, **kwargs)
  231. class SILICONFLOWCV(GptV4):
  232. _FACTORY_NAME = "SILICONFLOW"
  233. def __init__(
  234. self,
  235. key,
  236. model_name,
  237. lang="Chinese",
  238. base_url="https://api.siliconflow.cn/v1", **kwargs
  239. ):
  240. if not base_url:
  241. base_url = "https://api.siliconflow.cn/v1"
  242. super().__init__(key, model_name, lang, base_url, **kwargs)
  243. class OpenRouterCV(GptV4):
  244. _FACTORY_NAME = "OpenRouter"
  245. def __init__(
  246. self,
  247. key,
  248. model_name,
  249. lang="Chinese",
  250. base_url="https://openrouter.ai/api/v1", **kwargs
  251. ):
  252. if not base_url:
  253. base_url = "https://openrouter.ai/api/v1"
  254. self.client = OpenAI(api_key=key, base_url=base_url)
  255. self.model_name = model_name
  256. self.lang = lang
  257. Base.__init__(self, **kwargs)
  258. class LocalAICV(GptV4):
  259. _FACTORY_NAME = "LocalAI"
  260. def __init__(self, key, model_name, base_url, lang="Chinese", **kwargs):
  261. if not base_url:
  262. raise ValueError("Local cv model url cannot be None")
  263. base_url = urljoin(base_url, "v1")
  264. self.client = OpenAI(api_key="empty", base_url=base_url)
  265. self.model_name = model_name.split("___")[0]
  266. self.lang = lang
  267. Base.__init__(self, **kwargs)
  268. class XinferenceCV(GptV4):
  269. _FACTORY_NAME = "Xinference"
  270. def __init__(self, key, model_name="", lang="Chinese", base_url="", **kwargs):
  271. base_url = urljoin(base_url, "v1")
  272. self.client = OpenAI(api_key=key, base_url=base_url)
  273. self.model_name = model_name
  274. self.lang = lang
  275. Base.__init__(self, **kwargs)
  276. class GPUStackCV(GptV4):
  277. _FACTORY_NAME = "GPUStack"
  278. def __init__(self, key, model_name, lang="Chinese", base_url="", **kwargs):
  279. if not base_url:
  280. raise ValueError("Local llm url cannot be None")
  281. base_url = urljoin(base_url, "v1")
  282. self.client = OpenAI(api_key=key, base_url=base_url)
  283. self.model_name = model_name
  284. self.lang = lang
  285. Base.__init__(self, **kwargs)
  286. class LocalCV(Base):
  287. _FACTORY_NAME = "Moonshot"
  288. def __init__(self, key, model_name="glm-4v", lang="Chinese", **kwargs):
  289. pass
  290. def describe(self, image):
  291. return "", 0
  292. class OllamaCV(Base):
  293. _FACTORY_NAME = "Ollama"
  294. def __init__(self, key, model_name, lang="Chinese", **kwargs):
  295. from ollama import Client
  296. self.client = Client(host=kwargs["base_url"])
  297. self.model_name = model_name
  298. self.lang = lang
  299. self.keep_alive = kwargs.get("ollama_keep_alive", int(os.environ.get("OLLAMA_KEEP_ALIVE", -1)))
  300. Base.__init__(self, **kwargs)
  301. def _clean_conf(self, gen_conf):
  302. options = {}
  303. if "temperature" in gen_conf:
  304. options["temperature"] = gen_conf["temperature"]
  305. if "top_p" in gen_conf:
  306. options["top_k"] = gen_conf["top_p"]
  307. if "presence_penalty" in gen_conf:
  308. options["presence_penalty"] = gen_conf["presence_penalty"]
  309. if "frequency_penalty" in gen_conf:
  310. options["frequency_penalty"] = gen_conf["frequency_penalty"]
  311. return options
  312. def _form_history(self, system, history, images=[]):
  313. hist = deepcopy(history)
  314. if system and hist[0]["role"] == "user":
  315. hist.insert(0, {"role": "system", "content": system})
  316. if not images:
  317. return hist
  318. for his in hist:
  319. if his["role"] == "user":
  320. his["images"] = images
  321. break
  322. return hist
  323. def describe(self, image):
  324. prompt = self.prompt("")
  325. try:
  326. response = self.client.generate(
  327. model=self.model_name,
  328. prompt=prompt[0]["content"][0]["text"],
  329. images=[image],
  330. )
  331. ans = response["response"].strip()
  332. return ans, 128
  333. except Exception as e:
  334. return "**ERROR**: " + str(e), 0
  335. def describe_with_prompt(self, image, prompt=None):
  336. vision_prompt = self.vision_llm_prompt("", prompt) if prompt else self.vision_llm_prompt("")
  337. try:
  338. response = self.client.generate(
  339. model=self.model_name,
  340. prompt=vision_prompt[0]["content"][0]["text"],
  341. images=[image],
  342. )
  343. ans = response["response"].strip()
  344. return ans, 128
  345. except Exception as e:
  346. return "**ERROR**: " + str(e), 0
  347. def chat(self, system, history, gen_conf, images=[]):
  348. try:
  349. response = self.client.chat(
  350. model=self.model_name,
  351. messages=self._form_history(system, history, images),
  352. options=self._clean_conf(gen_conf),
  353. keep_alive=self.keep_alive
  354. )
  355. ans = response["message"]["content"].strip()
  356. return ans, response["eval_count"] + response.get("prompt_eval_count", 0)
  357. except Exception as e:
  358. return "**ERROR**: " + str(e), 0
  359. def chat_streamly(self, system, history, gen_conf, images=[]):
  360. ans = ""
  361. try:
  362. response = self.client.chat(
  363. model=self.model_name,
  364. messages=self._form_history(system, history, images),
  365. stream=True,
  366. options=self._clean_conf(gen_conf),
  367. keep_alive=self.keep_alive
  368. )
  369. for resp in response:
  370. if resp["done"]:
  371. yield resp.get("prompt_eval_count", 0) + resp.get("eval_count", 0)
  372. ans = resp["message"]["content"]
  373. yield ans
  374. except Exception as e:
  375. yield ans + "\n**ERROR**: " + str(e)
  376. yield 0
  377. class GeminiCV(Base):
  378. _FACTORY_NAME = "Gemini"
  379. def __init__(self, key, model_name="gemini-1.0-pro-vision-latest", lang="Chinese", **kwargs):
  380. from google.generativeai import GenerativeModel, client
  381. client.configure(api_key=key)
  382. _client = client.get_default_generative_client()
  383. self.model_name = model_name
  384. self.model = GenerativeModel(model_name=self.model_name)
  385. self.model._client = _client
  386. self.lang = lang
  387. Base.__init__(self, **kwargs)
  388. def _form_history(self, system, history, images=[]):
  389. hist = []
  390. if system:
  391. hist.append({"role": "user", "parts": [system, history[0]["content"]]})
  392. for img in images:
  393. hist[0]["parts"].append(("data:image/jpeg;base64," + img) if img[:4]!="data" else img)
  394. for h in history[1:]:
  395. hist.append({"role": "user" if h["role"]=="user" else "model", "parts": [h["content"]]})
  396. return hist
  397. def describe(self, image):
  398. from PIL.Image import open
  399. prompt = (
  400. "请用中文详细描述一下图中的内容,比如时间,地点,人物,事情,人物心情等,如果有数据请提取出数据。"
  401. if self.lang.lower() == "chinese"
  402. else "Please describe the content of this picture, like where, when, who, what happen. If it has number data, please extract them out."
  403. )
  404. b64 = self.image2base64(image)
  405. img = open(BytesIO(base64.b64decode(b64)))
  406. input = [prompt, img]
  407. res = self.model.generate_content(input)
  408. return res.text, res.usage_metadata.total_token_count
  409. def describe_with_prompt(self, image, prompt=None):
  410. from PIL.Image import open
  411. b64 = self.image2base64(image)
  412. vision_prompt = prompt if prompt else vision_llm_describe_prompt()
  413. img = open(BytesIO(base64.b64decode(b64)))
  414. input = [vision_prompt, img]
  415. res = self.model.generate_content(
  416. input,
  417. )
  418. return res.text, res.usage_metadata.total_token_count
  419. def chat(self, system, history, gen_conf, images=[]):
  420. from transformers import GenerationConfig
  421. try:
  422. response = self.model.generate_content(
  423. self._form_history(system, history, images),
  424. generation_config=GenerationConfig(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7)))
  425. ans = response.text
  426. return ans, response.usage_metadata.total_token_count
  427. except Exception as e:
  428. return "**ERROR**: " + str(e), 0
  429. def chat_streamly(self, system, history, gen_conf, images=[]):
  430. from transformers import GenerationConfig
  431. ans = ""
  432. try:
  433. response = self.model.generate_content(
  434. self._form_history(system, history, images),
  435. generation_config=GenerationConfig(temperature=gen_conf.get("temperature", 0.3), top_p=gen_conf.get("top_p", 0.7)),
  436. stream=True,
  437. )
  438. for resp in response:
  439. if not resp.text:
  440. continue
  441. ans = resp.text
  442. yield ans
  443. except Exception as e:
  444. yield ans + "\n**ERROR**: " + str(e)
  445. yield response._chunks[-1].usage_metadata.total_token_count
  446. class NvidiaCV(Base):
  447. _FACTORY_NAME = "NVIDIA"
  448. def __init__(
  449. self,
  450. key,
  451. model_name,
  452. lang="Chinese",
  453. base_url="https://ai.api.nvidia.com/v1/vlm", **kwargs
  454. ):
  455. if not base_url:
  456. base_url = ("https://ai.api.nvidia.com/v1/vlm",)
  457. self.lang = lang
  458. factory, llm_name = model_name.split("/")
  459. if factory != "liuhaotian":
  460. self.base_url = urljoin(base_url, f"{factory}/{llm_name}")
  461. else:
  462. self.base_url = urljoin(f"{base_url}/community", llm_name.replace("-v1.6", "16"))
  463. self.key = key
  464. Base.__init__(self, **kwargs)
  465. def _image_prompt(self, text, images):
  466. if not images:
  467. return text
  468. htmls = ""
  469. for img in images:
  470. htmls += ' <img src="{}"/>'.format(f"data:image/jpeg;base64,{img}" if img[:4] != "data" else img)
  471. return text + htmls
  472. def describe(self, image):
  473. b64 = self.image2base64(image)
  474. response = requests.post(
  475. url=self.base_url,
  476. headers={
  477. "accept": "application/json",
  478. "content-type": "application/json",
  479. "Authorization": f"Bearer {self.key}",
  480. },
  481. json={"messages": self.prompt(b64)},
  482. )
  483. response = response.json()
  484. return (
  485. response["choices"][0]["message"]["content"].strip(),
  486. response["usage"]["total_tokens"],
  487. )
  488. def _request(self, msg, gen_conf={}):
  489. response = requests.post(
  490. url=self.base_url,
  491. headers={
  492. "accept": "application/json",
  493. "content-type": "application/json",
  494. "Authorization": f"Bearer {self.key}",
  495. },
  496. json={
  497. "messages": msg, **gen_conf
  498. },
  499. )
  500. return response.json()
  501. def describe_with_prompt(self, image, prompt=None):
  502. b64 = self.image2base64(image)
  503. vision_prompt = self.vision_llm_prompt(b64, prompt) if prompt else self.vision_llm_prompt(b64)
  504. response = self._request(vision_prompt)
  505. return (
  506. response["choices"][0]["message"]["content"].strip(),
  507. response["usage"]["total_tokens"],
  508. )
  509. def chat(self, system, history, gen_conf, images=[], **kwargs):
  510. try:
  511. response = self._request(self._form_history(system, history, images), gen_conf)
  512. return (
  513. response["choices"][0]["message"]["content"].strip(),
  514. response["usage"]["total_tokens"],
  515. )
  516. except Exception as e:
  517. return "**ERROR**: " + str(e), 0
  518. def chat_streamly(self, system, history, gen_conf, images=[], **kwargs):
  519. try:
  520. response = self._request(self._form_history(system, history, images), gen_conf)
  521. cnt = response["choices"][0]["message"]["content"]
  522. for resp in cnt:
  523. yield resp
  524. except Exception as e:
  525. yield "\n**ERROR**: " + str(e)
  526. yield response["usage"]["total_tokens"]
  527. class AnthropicCV(Base):
  528. _FACTORY_NAME = "Anthropic"
  529. def __init__(self, key, model_name, base_url=None, **kwargs):
  530. import anthropic
  531. self.client = anthropic.Anthropic(api_key=key)
  532. self.model_name = model_name
  533. self.system = ""
  534. self.max_tokens = 8192
  535. if "haiku" in self.model_name or "opus" in self.model_name:
  536. self.max_tokens = 4096
  537. Base.__init__(self, **kwargs)
  538. def _image_prompt(self, text, images):
  539. if not images:
  540. return text
  541. pmpt = [{"type": "text", "text": text}]
  542. for img in images:
  543. pmpt.append({
  544. "type": "image",
  545. "source": {
  546. "type": "base64",
  547. "media_type": "image/jpeg" if img[:4] != "data" else img.split(":")[1].split(";")[0],
  548. "data": img if img[:4] != "data" else img.split(",")[1]
  549. },
  550. }
  551. )
  552. return pmpt
  553. def describe(self, image):
  554. b64 = self.image2base64(image)
  555. response = self.client.messages.create(model=self.model_name, max_tokens=self.max_tokens, messages=self.prompt(b64))
  556. return response["content"][0]["text"].strip(), response["usage"]["input_tokens"] + response["usage"]["output_tokens"]
  557. def describe_with_prompt(self, image, prompt=None):
  558. b64 = self.image2base64(image)
  559. prompt = self.prompt(b64, prompt if prompt else vision_llm_describe_prompt())
  560. response = self.client.messages.create(model=self.model_name, max_tokens=self.max_tokens, messages=prompt)
  561. return response["content"][0]["text"].strip(), response["usage"]["input_tokens"] + response["usage"]["output_tokens"]
  562. def _clean_conf(self, gen_conf):
  563. if "presence_penalty" in gen_conf:
  564. del gen_conf["presence_penalty"]
  565. if "frequency_penalty" in gen_conf:
  566. del gen_conf["frequency_penalty"]
  567. if "max_token" in gen_conf:
  568. gen_conf["max_tokens"] = self.max_tokens
  569. return gen_conf
  570. def chat(self, system, history, gen_conf, images=[]):
  571. gen_conf = self._clean_conf(gen_conf)
  572. ans = ""
  573. try:
  574. response = self.client.messages.create(
  575. model=self.model_name,
  576. messages=self._form_history(system, history, images),
  577. system=system,
  578. stream=False,
  579. **gen_conf,
  580. ).to_dict()
  581. ans = response["content"][0]["text"]
  582. if response["stop_reason"] == "max_tokens":
  583. ans += "...\nFor the content length reason, it stopped, continue?" if is_english([ans]) else "······\n由于长度的原因,回答被截断了,要继续吗?"
  584. return (
  585. ans,
  586. response["usage"]["input_tokens"] + response["usage"]["output_tokens"],
  587. )
  588. except Exception as e:
  589. return ans + "\n**ERROR**: " + str(e), 0
  590. def chat_streamly(self, system, history, gen_conf, images=[]):
  591. gen_conf = self._clean_conf(gen_conf)
  592. total_tokens = 0
  593. try:
  594. response = self.client.messages.create(
  595. model=self.model_name,
  596. messages=self._form_history(system, history, images),
  597. system=system,
  598. stream=True,
  599. **gen_conf,
  600. )
  601. think = False
  602. for res in response:
  603. if res.type == "content_block_delta":
  604. if res.delta.type == "thinking_delta" and res.delta.thinking:
  605. if not think:
  606. yield "<think>"
  607. think = True
  608. yield res.delta.thinking
  609. total_tokens += num_tokens_from_string(res.delta.thinking)
  610. elif think:
  611. yield "</think>"
  612. else:
  613. yield res.delta.text
  614. total_tokens += num_tokens_from_string(res.delta.text)
  615. except Exception as e:
  616. yield "\n**ERROR**: " + str(e)
  617. yield total_tokens
  618. class GoogleCV(AnthropicCV, GeminiCV):
  619. _FACTORY_NAME = "Google Cloud"
  620. def __init__(self, key, model_name, lang="Chinese", base_url=None, **kwargs):
  621. import base64
  622. from google.oauth2 import service_account
  623. key = json.loads(key)
  624. access_token = json.loads(base64.b64decode(key.get("google_service_account_key", "")))
  625. project_id = key.get("google_project_id", "")
  626. region = key.get("google_region", "")
  627. scopes = ["https://www.googleapis.com/auth/cloud-platform"]
  628. self.model_name = model_name
  629. self.lang = lang
  630. if "claude" in self.model_name:
  631. from anthropic import AnthropicVertex
  632. from google.auth.transport.requests import Request
  633. if access_token:
  634. credits = service_account.Credentials.from_service_account_info(access_token, scopes=scopes)
  635. request = Request()
  636. credits.refresh(request)
  637. token = credits.token
  638. self.client = AnthropicVertex(region=region, project_id=project_id, access_token=token)
  639. else:
  640. self.client = AnthropicVertex(region=region, project_id=project_id)
  641. else:
  642. import vertexai.generative_models as glm
  643. from google.cloud import aiplatform
  644. if access_token:
  645. credits = service_account.Credentials.from_service_account_info(access_token)
  646. aiplatform.init(credentials=credits, project=project_id, location=region)
  647. else:
  648. aiplatform.init(project=project_id, location=region)
  649. self.client = glm.GenerativeModel(model_name=self.model_name)
  650. Base.__init__(self, **kwargs)
  651. def describe(self, image):
  652. if "claude" in self.model_name:
  653. return AnthropicCV.describe(self, image)
  654. else:
  655. return GeminiCV.describe(self, image)
  656. def describe_with_prompt(self, image, prompt=None):
  657. if "claude" in self.model_name:
  658. return AnthropicCV.describe_with_prompt(self, image, prompt)
  659. else:
  660. return GeminiCV.describe_with_prompt(self, image, prompt)
  661. def chat(self, system, history, gen_conf, images=[]):
  662. if "claude" in self.model_name:
  663. return AnthropicCV.chat(self, system, history, gen_conf, images)
  664. else:
  665. return GeminiCV.chat(self, system, history, gen_conf, images)
  666. def chat_streamly(self, system, history, gen_conf, images=[]):
  667. if "claude" in self.model_name:
  668. for ans in AnthropicCV.chat_streamly(self, system, history, gen_conf, images):
  669. yield ans
  670. else:
  671. for ans in GeminiCV.chat_streamly(self, system, history, gen_conf, images):
  672. yield ans