#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import os
import re
import threading
from abc import ABC
from collections.abc import Iterable
from urllib.parse import urljoin

import numpy as np
import requests
from huggingface_hub import snapshot_download
from yarl import URL

from api import settings
from api.utils.file_utils import get_home_cache_dir
from api.utils.log_utils import log_exception
from rag.utils import num_tokens_from_string, truncate

class Base(ABC):
    def __init__(self, key, model_name, **kwargs):
        """
        Abstract base class constructor.
        Parameters are not stored; initialization is left to subclasses.
        """
        pass

    def similarity(self, query: str, texts: list):
        raise NotImplementedError("Please implement the similarity method!")

    def total_token_count(self, resp):
        try:
            return resp.usage.total_tokens
        except Exception:
            pass
        try:
            return resp["usage"]["total_tokens"]
        except Exception:
            pass
        return 0
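
# A minimal sketch of the contract concrete rerankers follow: `similarity` returns
# an array of scores aligned with `texts` plus a token count. The class below is
# purely illustrative (it is not registered with any factory) and its scoring rule
# is a stand-in, not a real model:
#
#     class ExampleRerank(Base):
#         def __init__(self, key, model_name, **kwargs):
#             self.model_name = model_name
#
#         def similarity(self, query: str, texts: list):
#             scores = np.array([float(len(set(query.split()) & set(t.split()))) for t in texts])
#             return scores, sum(num_tokens_from_string(t) for t in texts)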


class DefaultRerank(Base):
    _FACTORY_NAME = "BAAI"
    _model = None
    _model_lock = threading.Lock()

    def __init__(self, key, model_name, **kwargs):
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-

        """
        if not settings.LIGHTEN and not DefaultRerank._model:
            import torch
            from FlagEmbedding import FlagReranker

            with DefaultRerank._model_lock:
                if not DefaultRerank._model:
                    try:
                        DefaultRerank._model = FlagReranker(os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), use_fp16=torch.cuda.is_available())
                    except Exception:
                        model_dir = snapshot_download(repo_id=model_name, local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), local_dir_use_symlinks=False)
                        DefaultRerank._model = FlagReranker(model_dir, use_fp16=torch.cuda.is_available())
        self._model = DefaultRerank._model
        self._dynamic_batch_size = 8
        self._min_batch_size = 1

    def torch_empty_cache(self):
        try:
            import torch

            torch.cuda.empty_cache()
        except Exception as e:
            log_exception(e)

    def _process_batch(self, pairs, max_batch_size=None):
        """Template method shared with subclasses: score `pairs` in batches, halving the
        batch size and retrying when a CUDA out-of-memory error occurs."""
        old_dynamic_batch_size = self._dynamic_batch_size
        if max_batch_size is not None:
            self._dynamic_batch_size = max_batch_size
        res = np.array([], dtype=float)
        i = 0
        while i < len(pairs):
            cur_i = i
            current_batch = self._dynamic_batch_size
            max_retries = 5
            retry_count = 0
            while retry_count < max_retries:
                try:
                    # Call the batch scoring implemented by the subclass.
                    batch_scores = self._compute_batch_scores(pairs[i : i + current_batch])
                    res = np.append(res, batch_scores)
                    i += current_batch
                    self._dynamic_batch_size = min(self._dynamic_batch_size * 2, 8)
                    break
                except RuntimeError as e:
                    if "CUDA out of memory" in str(e) and current_batch > self._min_batch_size:
                        current_batch = max(current_batch // 2, self._min_batch_size)
                        self.torch_empty_cache()
                        i = cur_i  # reset i to the start of the current batch
                        retry_count += 1
                    else:
                        raise
            if retry_count >= max_retries:
                raise RuntimeError("Reached the maximum number of retries but still cannot process the batch; please check your GPU memory.")
            self.torch_empty_cache()

        self._dynamic_batch_size = old_dynamic_batch_size
        return np.array(res)
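
    # Illustrative backoff trace (an example, not logged output): with the default
    # _dynamic_batch_size of 8, a CUDA OOM on the first attempt shrinks the batch
    # 8 -> 4 (and so on down to _min_batch_size), while each successful batch grows
    # the size again, capped at 8.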

    def _compute_batch_scores(self, batch_pairs, max_length=None):
        if max_length is None:
            scores = self._model.compute_score(batch_pairs, normalize=True)
        else:
            scores = self._model.compute_score(batch_pairs, max_length=max_length, normalize=True)
        if not isinstance(scores, Iterable):
            scores = [scores]
        return scores

    def similarity(self, query: str, texts: list):
        pairs = [(query, truncate(t, 2048)) for t in texts]
        token_count = 0
        for _, t in pairs:
            token_count += num_tokens_from_string(t)
        batch_size = 4096
        res = self._process_batch(pairs, max_batch_size=batch_size)
        return np.array(res), token_count
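
# Hypothetical local usage sketch (the model name and inputs are examples only):
#
#     reranker = DefaultRerank(key="", model_name="BAAI/bge-reranker-v2-m3")
#     scores, tokens = reranker.similarity("what is RAG?", ["chunk one", "chunk two"])
#     # `scores` is aligned with the input order; higher means more relevant.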


class JinaRerank(Base):
    _FACTORY_NAME = "Jina"

    def __init__(self, key, model_name="jina-reranker-v2-base-multilingual", base_url="https://api.jina.ai/v1/rerank"):
        self.base_url = base_url or "https://api.jina.ai/v1/rerank"
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name

    def similarity(self, query: str, texts: list):
        texts = [truncate(t, 8196) for t in texts]
        data = {"model": self.model_name, "query": query, "documents": texts, "top_n": len(texts)}
        res = requests.post(self.base_url, headers=self.headers, json=data).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, res)
        return rank, self.total_token_count(res)
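
# The Jina-style response consumed above (and by the subclasses at the bottom of this
# file) is expected to look roughly like the following; the shape is inferred from the
# parsing code and the field values are illustrative:
#
#     {"results": [{"index": 0, "relevance_score": 0.93}, ...],
#      "usage": {"total_tokens": 123}}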


class YoudaoRerank(DefaultRerank):
    _FACTORY_NAME = "Youdao"
    _model = None
    _model_lock = threading.Lock()

    def __init__(self, key=None, model_name="maidalun1020/bce-reranker-base_v1", **kwargs):
        if not settings.LIGHTEN and not YoudaoRerank._model:
            from BCEmbedding import RerankerModel

            with YoudaoRerank._model_lock:
                if not YoudaoRerank._model:
                    try:
                        YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)))
                    except Exception:
                        YoudaoRerank._model = RerankerModel(model_name_or_path=model_name.replace("maidalun1020", "InfiniFlow"))

        self._model = YoudaoRerank._model
        self._dynamic_batch_size = 8
        self._min_batch_size = 1

    def similarity(self, query: str, texts: list):
        pairs = [(query, truncate(t, self._model.max_length)) for t in texts]
        token_count = 0
        for _, t in pairs:
            token_count += num_tokens_from_string(t)
        batch_size = 8
        res = self._process_batch(pairs, max_batch_size=batch_size)
        return np.array(res), token_count


class XInferenceRerank(Base):
    _FACTORY_NAME = "Xinference"

    def __init__(self, key="x", model_name="", base_url=""):
        # Normalize whatever was configured so requests go to ".../v1/rerank".
        if base_url.find("/v1") == -1:
            base_url = urljoin(base_url, "/v1/rerank")
        if base_url.find("/rerank") == -1:
            base_url = urljoin(base_url, "/v1/rerank")
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {"Content-Type": "application/json", "accept": "application/json"}
        if key and key != "x":
            self.headers["Authorization"] = f"Bearer {key}"

    def similarity(self, query: str, texts: list):
        if len(texts) == 0:
            return np.array([]), 0
        pairs = [(query, truncate(t, 4096)) for t in texts]
        token_count = 0
        for _, t in pairs:
            token_count += num_tokens_from_string(t)
        data = {"model": self.model_name, "query": query, "return_documents": "true", "return_len": "true", "documents": texts}
        res = requests.post(self.base_url, headers=self.headers, json=data).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, res)
        return rank, token_count


class LocalAIRerank(Base):
    _FACTORY_NAME = "LocalAI"

    def __init__(self, key, model_name, base_url):
        if base_url.find("/rerank") == -1:
            self.base_url = urljoin(base_url, "/rerank")
        else:
            self.base_url = base_url
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name.split("___")[0]

    def similarity(self, query: str, texts: list):
        # The truncation length is not configurable from RAGFlow, so use a fixed setting.
        texts = [truncate(t, 500) for t in texts]
        data = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
        }
        token_count = 0
        for t in texts:
            token_count += num_tokens_from_string(t)
        res = requests.post(self.base_url, headers=self.headers, json=data).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, res)

        # Normalize the rank values to the range 0 to 1.
        min_rank = np.min(rank)
        max_rank = np.max(rank)

        # Avoid division by zero if all ranks are identical.
        if max_rank - min_rank != 0:
            rank = (rank - min_rank) / (max_rank - min_rank)
        else:
            rank = np.zeros_like(rank)

        return rank, token_count


class NvidiaRerank(Base):
    _FACTORY_NAME = "NVIDIA"

    def __init__(self, key, model_name, base_url="https://ai.api.nvidia.com/v1/retrieval/nvidia/"):
        if not base_url:
            base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
        self.model_name = model_name

        if self.model_name == "nvidia/nv-rerankqa-mistral-4b-v3":
            self.base_url = urljoin(base_url, "nv-rerankqa-mistral-4b-v3/reranking")

        if self.model_name == "nvidia/rerank-qa-mistral-4b":
            self.base_url = urljoin(base_url, "reranking")
            self.model_name = "nv-rerank-qa-mistral-4b:1"

        self.headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {key}",
        }

    def similarity(self, query: str, texts: list):
        token_count = num_tokens_from_string(query) + sum([num_tokens_from_string(t) for t in texts])
        data = {
            "model": self.model_name,
            "query": {"text": query},
            "passages": [{"text": text} for text in texts],
            "truncate": "END",
            "top_n": len(texts),
        }
        res = requests.post(self.base_url, headers=self.headers, json=data).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["rankings"]:
                rank[d["index"]] = d["logit"]
        except Exception as _e:
            log_exception(_e, res)
        return rank, token_count


class LmStudioRerank(Base):
    _FACTORY_NAME = "LM-Studio"

    def __init__(self, key, model_name, base_url, **kwargs):
        pass

    def similarity(self, query: str, texts: list):
        raise NotImplementedError("LmStudioRerank has not been implemented yet.")


class OpenAI_APIRerank(Base):
    _FACTORY_NAME = "OpenAI-API-Compatible"

    def __init__(self, key, model_name, base_url):
        if base_url.find("/rerank") == -1:
            self.base_url = urljoin(base_url, "/rerank")
        else:
            self.base_url = base_url
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
        self.model_name = model_name.split("___")[0]

    def similarity(self, query: str, texts: list):
        # The truncation length is not configurable from RAGFlow, so use a fixed setting.
        texts = [truncate(t, 500) for t in texts]
        data = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
        }
        token_count = 0
        for t in texts:
            token_count += num_tokens_from_string(t)
        res = requests.post(self.base_url, headers=self.headers, json=data).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, res)

        # Normalize the rank values to the range 0 to 1.
        min_rank = np.min(rank)
        max_rank = np.max(rank)

        # Avoid division by zero if all ranks are (almost) identical.
        if np.isclose(min_rank, max_rank, atol=1e-3):
            rank = np.zeros_like(rank)
        else:
            rank = (rank - min_rank) / (max_rank - min_rank)

        return rank, token_count


class CoHereRerank(Base):
    _FACTORY_NAME = ["Cohere", "VLLM"]

    def __init__(self, key, model_name, base_url=None):
        from cohere import Client

        self.client = Client(api_key=key, base_url=base_url)
        self.model_name = model_name.split("___")[0]

    def similarity(self, query: str, texts: list):
        token_count = num_tokens_from_string(query) + sum([num_tokens_from_string(t) for t in texts])
        res = self.client.rerank(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=len(texts),
            return_documents=False,
        )
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res.results:
                rank[d.index] = d.relevance_score
        except Exception as _e:
            log_exception(_e, res)
        return rank, token_count


class TogetherAIRerank(Base):
    _FACTORY_NAME = "TogetherAI"

    def __init__(self, key, model_name, base_url, **kwargs):
        pass

    def similarity(self, query: str, texts: list):
        raise NotImplementedError("The TogetherAI rerank API has not been implemented yet.")


class SILICONFLOWRerank(Base):
    _FACTORY_NAME = "SILICONFLOW"

    def __init__(self, key, model_name, base_url="https://api.siliconflow.cn/v1/rerank"):
        if not base_url:
            base_url = "https://api.siliconflow.cn/v1/rerank"
        self.model_name = model_name
        self.base_url = base_url
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {key}",
        }

    def similarity(self, query: str, texts: list):
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
            "return_documents": False,
            "max_chunks_per_doc": 1024,
            "overlap_tokens": 80,
        }
        response = requests.post(self.base_url, json=payload, headers=self.headers).json()
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in response["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, response)
        return (
            rank,
            response["meta"]["tokens"]["input_tokens"] + response["meta"]["tokens"]["output_tokens"],
        )


class BaiduYiyanRerank(Base):
    _FACTORY_NAME = "BaiduYiyan"

    def __init__(self, key, model_name, base_url=None):
        from qianfan.resources import Reranker

        key = json.loads(key)
        ak = key.get("yiyan_ak", "")
        sk = key.get("yiyan_sk", "")
        self.client = Reranker(ak=ak, sk=sk)
        self.model_name = model_name

    def similarity(self, query: str, texts: list):
        res = self.client.do(
            model=self.model_name,
            query=query,
            documents=texts,
            top_n=len(texts),
        ).body
        rank = np.zeros(len(texts), dtype=float)
        try:
            for d in res["results"]:
                rank[d["index"]] = d["relevance_score"]
        except Exception as _e:
            log_exception(_e, res)
        return rank, self.total_token_count(res)


class VoyageRerank(Base):
    _FACTORY_NAME = "Voyage AI"

    def __init__(self, key, model_name, base_url=None):
        import voyageai

        self.client = voyageai.Client(api_key=key)
        self.model_name = model_name

    def similarity(self, query: str, texts: list):
        rank = np.zeros(len(texts), dtype=float)
        if not texts:
            return rank, 0
        res = self.client.rerank(query=query, documents=texts, model=self.model_name, top_k=len(texts))
        try:
            for r in res.results:
                rank[r.index] = r.relevance_score
        except Exception as _e:
            log_exception(_e, res)
        return rank, res.total_tokens


class QWenRerank(Base):
    _FACTORY_NAME = "Tongyi-Qianwen"

    def __init__(self, key, model_name="gte-rerank", base_url=None, **kwargs):
        import dashscope

        self.api_key = key
        self.model_name = dashscope.TextReRank.Models.gte_rerank if model_name is None else model_name

    def similarity(self, query: str, texts: list):
        from http import HTTPStatus

        import dashscope

        resp = dashscope.TextReRank.call(api_key=self.api_key, model=self.model_name, query=query, documents=texts, top_n=len(texts), return_documents=False)
        rank = np.zeros(len(texts), dtype=float)
        if resp.status_code == HTTPStatus.OK:
            try:
                for r in resp.output.results:
                    rank[r.index] = r.relevance_score
            except Exception as _e:
                log_exception(_e, resp)
            return rank, resp.usage.total_tokens
        else:
            raise ValueError(f"Error calling QWenRerank model {self.model_name}: {resp.status_code} - {resp.message}")


class HuggingfaceRerank(DefaultRerank):
    _FACTORY_NAME = "HuggingFace"

    @staticmethod
    def post(query: str, texts: list, url="127.0.0.1"):
        exc = None
        scores = [0 for _ in range(len(texts))]
        batch_size = 8
        # Accept the host with or without an explicit scheme.
        endpoint = url if re.match(r"^https?://", url) else f"http://{url}"
        for i in range(0, len(texts), batch_size):
            try:
                res = requests.post(
                    f"{endpoint}/rerank", headers={"Content-Type": "application/json"}, json={"query": query, "texts": texts[i : i + batch_size], "raw_scores": False, "truncate": True}
                )

                for o in res.json():
                    scores[o["index"] + i] = o["score"]
            except Exception as e:
                exc = e

        if exc:
            raise exc
        return np.array(scores)

    def __init__(self, key, model_name="BAAI/bge-reranker-v2-m3", base_url="http://127.0.0.1"):
        self.model_name = model_name.split("___")[0]
        self.base_url = base_url

    def similarity(self, query: str, texts: list) -> tuple[np.ndarray, int]:
        if not texts:
            return np.array([]), 0
        token_count = 0
        for t in texts:
            token_count += num_tokens_from_string(t)
        return HuggingfaceRerank.post(query, texts, self.base_url), token_count
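
# Hypothetical usage against a local text-embeddings-inference style server exposing
# POST /rerank (the host/port and inputs below are assumptions, not defaults mandated
# by RAGFlow):
#
#     hf = HuggingfaceRerank(key="", model_name="BAAI/bge-reranker-v2-m3", base_url="http://127.0.0.1:8080")
#     scores, tokens = hf.similarity("what is RAG?", ["chunk one", "chunk two"])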


class GPUStackRerank(Base):
    _FACTORY_NAME = "GPUStack"

    def __init__(self, key, model_name, base_url):
        if not base_url:
            raise ValueError("url cannot be None")

        self.model_name = model_name
        self.base_url = str(URL(base_url) / "v1" / "rerank")
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {key}",
        }

    def similarity(self, query: str, texts: list):
        payload = {
            "model": self.model_name,
            "query": query,
            "documents": texts,
            "top_n": len(texts),
        }

        try:
            response = requests.post(self.base_url, json=payload, headers=self.headers)
            response.raise_for_status()
            response_json = response.json()

            rank = np.zeros(len(texts), dtype=float)

            token_count = 0
            for t in texts:
                token_count += num_tokens_from_string(t)
            try:
                for result in response_json["results"]:
                    rank[result["index"]] = result["relevance_score"]
            except Exception as _e:
                log_exception(_e, response)

            return (
                rank,
                token_count,
            )

        except requests.exceptions.HTTPError as e:
            # requests (not httpx) issues the call above, so catch its HTTPError from raise_for_status().
            raise ValueError(f"Error calling GPUStackRerank model {self.model_name}: {e.response.status_code} - {e.response.text}")


class NovitaRerank(JinaRerank):
    _FACTORY_NAME = "NovitaAI"

    def __init__(self, key, model_name, base_url="https://api.novita.ai/v3/openai/rerank"):
        if not base_url:
            base_url = "https://api.novita.ai/v3/openai/rerank"
        super().__init__(key, model_name, base_url)


class GiteeRerank(JinaRerank):
    _FACTORY_NAME = "GiteeAI"

    def __init__(self, key, model_name, base_url="https://ai.gitee.com/v1/rerank"):
        if not base_url:
            base_url = "https://ai.gitee.com/v1/rerank"
        super().__init__(key, model_name, base_url)


class Ai302Rerank(JinaRerank):
    _FACTORY_NAME = "302.AI"

    def __init__(self, key, model_name, base_url="https://api.302.ai/v1/rerank"):
        if not base_url:
            base_url = "https://api.302.ai/v1/rerank"
        super().__init__(key, model_name, base_url)
|