- #
 - #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 - #
 - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - import json
 - import os
 - import re
 - import threading
 - from abc import ABC
 - from collections.abc import Iterable
 - from urllib.parse import urljoin
 - 
 - import httpx
 - import numpy as np
 - import requests
 - from huggingface_hub import snapshot_download
 - from yarl import URL
 - 
 - from api import settings
 - from api.utils.file_utils import get_home_cache_dir
 - from api.utils.log_utils import log_exception
 - from rag.utils import num_tokens_from_string, truncate
 - 
 - class Base(ABC):
 -     def __init__(self, key, model_name, **kwargs):
 -         """
 -         Abstract base class constructor.
 -         Parameters are not stored; initialization is left to subclasses.
 -         """
 -         pass
 - 
 -     def similarity(self, query: str, texts: list):
 -         raise NotImplementedError("Please implement similarity method!")
 - 
 -     def total_token_count(self, resp):
 -         if hasattr(resp, "usage") and hasattr(resp.usage, "total_tokens"):
 -             try:
 -                 return resp.usage.total_tokens
 -             except Exception:
 -                 pass
 - 
 -         try:
 -             if "usage" in resp and "total_tokens" in resp["usage"]:
 -                 return resp["usage"]["total_tokens"]
 -         except Exception:
 -             pass
 -         return 0
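 - 
 -     # total_token_count() accepts either SDK-style responses exposing
 -     # resp.usage.total_tokens or plain dict responses; a hypothetical dict-shaped
 -     # response would be counted like this:
 -     #   self.total_token_count({"usage": {"total_tokens": 42}})  # -> 42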
 - 
 - 
 - class DefaultRerank(Base):
 -     _FACTORY_NAME = "BAAI"
 -     _model = None
 -     _model_lock = threading.Lock()
 - 
 -     def __init__(self, key, model_name, **kwargs):
 -         """
 -         If you have trouble downloading HuggingFace models, -_^ this might help!!
 - 
 -         For Linux:
 -         export HF_ENDPOINT=https://hf-mirror.com
 - 
 -         For Windows:
 -         Good luck
 -         ^_-
 - 
 -         """
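 -         # A sketch of the same fix from Python, before the download below runs
 -         # (huggingface_hub reads the HF_ENDPOINT environment variable):
 -         #   os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")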
 -         if not settings.LIGHTEN and not DefaultRerank._model:
 -             import torch
 -             from FlagEmbedding import FlagReranker
 - 
 -             with DefaultRerank._model_lock:
 -                 if not DefaultRerank._model:
 -                     try:
 -                         DefaultRerank._model = FlagReranker(os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), use_fp16=torch.cuda.is_available())
 -                     except Exception:
 -                         model_dir = snapshot_download(repo_id=model_name, local_dir=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)), local_dir_use_symlinks=False)
 -                         DefaultRerank._model = FlagReranker(model_dir, use_fp16=torch.cuda.is_available())
 -         self._model = DefaultRerank._model
 -         self._dynamic_batch_size = 8
 -         self._min_batch_size = 1
 - 
 -     def torch_empty_cache(self):
 -         try:
 -             import torch
 - 
 -             torch.cuda.empty_cache()
 -         except Exception as e:
 -             log_exception(e)
 - 
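 -     # _process_batch() scores `pairs` with a dynamic batch size: on a CUDA
 -     # out-of-memory error it halves the batch (down to _min_batch_size), empties
 -     # the CUDA cache and retries (up to 5 times per batch); on success the batch
 -     # size grows again, capped at 8.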
 -     def _process_batch(self, pairs, max_batch_size=None):
 -         """Template method for subclasses: score query/text pairs in OOM-aware batches."""
 -         old_dynamic_batch_size = self._dynamic_batch_size
 -         if max_batch_size is not None:
 -             self._dynamic_batch_size = max_batch_size
 -         res = np.zeros(len(pairs), dtype=float)
 -         i = 0
 -         while i < len(pairs):
 -             cur_i = i
 -             current_batch = self._dynamic_batch_size
 -             max_retries = 5
 -             retry_count = 0
 -             while retry_count < max_retries:
 -                 try:
 -                     # call the batch scoring implemented by the subclass
 -                     batch_scores = self._compute_batch_scores(pairs[i : i + current_batch])
 -                     res[i : i + current_batch] = batch_scores
 -                     i += current_batch
 -                     self._dynamic_batch_size = min(self._dynamic_batch_size * 2, 8)
 -                     break
 -                 except RuntimeError as e:
 -                     if "CUDA out of memory" in str(e) and current_batch > self._min_batch_size:
 -                         current_batch = max(current_batch // 2, self._min_batch_size)
 -                         self.torch_empty_cache()
 -                         i = cur_i # reset i to the start of the current batch
 -                         retry_count += 1
 -                     else:
 -                         raise
 -             if retry_count >= max_retries:
 -                 raise RuntimeError("Exceeded max retries but still cannot process the batch; please check your GPU memory")
 - 
 -         self.torch_empty_cache()
 -         self._dynamic_batch_size = old_dynamic_batch_size
 -         return np.array(res)
 - 
 -     def _compute_batch_scores(self, batch_pairs, max_length=None):
 -         if max_length is None:
 -             scores = self._model.compute_score(batch_pairs, normalize=True)
 -         else:
 -             scores = self._model.compute_score(batch_pairs, max_length=max_length, normalize=True)
 -         if not isinstance(scores, Iterable):
 -             scores = [scores]
 -         return scores
 - 
 -     def similarity(self, query: str, texts: list):
 -         pairs = [(query, truncate(t, 2048)) for t in texts]
 -         token_count = 0
 -         for _, t in pairs:
 -             token_count += num_tokens_from_string(t)
 -         batch_size = 4096
 -         res = self._process_batch(pairs, max_batch_size=batch_size)
 -         return np.array(res), token_count
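 - 
 -     # Usage sketch (illustrative model name; assumes the reranker weights are in the
 -     # local cache dir or can be downloaded from HuggingFace):
 -     #   reranker = DefaultRerank(key="", model_name="BAAI/bge-reranker-v2-m3")
 -     #   scores, n_tokens = reranker.similarity("what is RAG?", ["chunk one", "chunk two"])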
 - 
 - 
 - class JinaRerank(Base):
 -     _FACTORY_NAME = "Jina"
 - 
 -     def __init__(self, key, model_name="jina-reranker-v2-base-multilingual", base_url="https://api.jina.ai/v1/rerank"):
 -         self.base_url = base_url or "https://api.jina.ai/v1/rerank"
 -         self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
 -         self.model_name = model_name
 - 
 -     def similarity(self, query: str, texts: list):
 -         texts = [truncate(t, 8196) for t in texts]
 -         data = {"model": self.model_name, "query": query, "documents": texts, "top_n": len(texts)}
 -         res = requests.post(self.base_url, headers=self.headers, json=data).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, self.total_token_count(res)
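 - 
 -     # Usage sketch (assumes a valid Jina API key, e.g. read from a JINA_API_KEY env var;
 -     # the same call pattern applies to the other HTTP-based rerankers below):
 -     #   reranker = JinaRerank(key=os.environ["JINA_API_KEY"])
 -     #   scores, n_tokens = reranker.similarity("query", ["passage a", "passage b"])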
 - 
 - 
 - class YoudaoRerank(DefaultRerank):
 -     _FACTORY_NAME = "Youdao"
 -     _model = None
 -     _model_lock = threading.Lock()
 - 
 -     def __init__(self, key=None, model_name="maidalun1020/bce-reranker-base_v1", **kwargs):
 -         if not settings.LIGHTEN and not YoudaoRerank._model:
 -             from BCEmbedding import RerankerModel
 - 
 -             with YoudaoRerank._model_lock:
 -                 if not YoudaoRerank._model:
 -                     try:
 -                         YoudaoRerank._model = RerankerModel(model_name_or_path=os.path.join(get_home_cache_dir(), re.sub(r"^[a-zA-Z0-9]+/", "", model_name)))
 -                     except Exception:
 -                         YoudaoRerank._model = RerankerModel(model_name_or_path=model_name.replace("maidalun1020", "InfiniFlow"))
 - 
 -         self._model = YoudaoRerank._model
 -         self._dynamic_batch_size = 8
 -         self._min_batch_size = 1
 - 
 -     def similarity(self, query: str, texts: list):
 -         pairs = [(query, truncate(t, self._model.max_length)) for t in texts]
 -         token_count = 0
 -         for _, t in pairs:
 -             token_count += num_tokens_from_string(t)
 -         batch_size = 8
 -         res = self._process_batch(pairs, max_batch_size=batch_size)
 -         return np.array(res), token_count
 - 
 - 
 - class XInferenceRerank(Base):
 -     _FACTORY_NAME = "Xinference"
 - 
 -     def __init__(self, key="x", model_name="", base_url=""):
 -         # Normalize the configured URL so requests always hit the /v1/rerank endpoint.
 -         if "/v1" not in base_url or "/rerank" not in base_url:
 -             base_url = urljoin(base_url, "/v1/rerank")
 -         self.model_name = model_name
 -         self.base_url = base_url
 -         self.headers = {"Content-Type": "application/json", "accept": "application/json"}
 -         if key and key != "x":
 -             self.headers["Authorization"] = f"Bearer {key}"
 - 
 -     def similarity(self, query: str, texts: list):
 -         if len(texts) == 0:
 -             return np.array([]), 0
 -         pairs = [(query, truncate(t, 4096)) for t in texts]
 -         token_count = 0
 -         for _, t in pairs:
 -             token_count += num_tokens_from_string(t)
 -         data = {"model": self.model_name, "query": query, "return_documents": "true", "return_len": "true", "documents": texts}
 -         res = requests.post(self.base_url, headers=self.headers, json=data).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, token_count
 - 
 - 
 - class LocalAIRerank(Base):
 -     _FACTORY_NAME = "LocalAI"
 - 
 -     def __init__(self, key, model_name, base_url):
 -         if base_url.find("/rerank") == -1:
 -             self.base_url = urljoin(base_url, "/rerank")
 -         else:
 -             self.base_url = base_url
 -         self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
 -         self.model_name = model_name.split("___")[0]
 - 
 -     def similarity(self, query: str, texts: list):
 -         # No way to configure this from RAGFlow, so use a fixed truncation length.
 -         texts = [truncate(t, 500) for t in texts]
 -         data = {
 -             "model": self.model_name,
 -             "query": query,
 -             "documents": texts,
 -             "top_n": len(texts),
 -         }
 -         token_count = 0
 -         for t in texts:
 -             token_count += num_tokens_from_string(t)
 -         res = requests.post(self.base_url, headers=self.headers, json=data).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 - 
 -         # Normalize the rank values to the range 0 to 1
 -         min_rank = np.min(rank)
 -         max_rank = np.max(rank)
 - 
 -         # Avoid division by zero if all ranks are identical
 -         if not np.isclose(min_rank, max_rank, atol=1e-3):
 -             rank = (rank - min_rank) / (max_rank - min_rank)
 -         else:
 -             rank = np.zeros_like(rank)
 - 
 -         return rank, token_count
 - 
 - 
 - class NvidiaRerank(Base):
 -     _FACTORY_NAME = "NVIDIA"
 - 
 -     def __init__(self, key, model_name, base_url="https://ai.api.nvidia.com/v1/retrieval/nvidia/"):
 -         if not base_url:
 -             base_url = "https://ai.api.nvidia.com/v1/retrieval/nvidia/"
 -         self.model_name = model_name
 - 
 -         if self.model_name == "nvidia/nv-rerankqa-mistral-4b-v3":
 -             self.base_url = urljoin(base_url, "nv-rerankqa-mistral-4b-v3/reranking")
 - 
 -         if self.model_name == "nvidia/rerank-qa-mistral-4b":
 -             self.base_url = urljoin(base_url, "reranking")
 -             self.model_name = "nv-rerank-qa-mistral-4b:1"
 - 
 -         self.headers = {
 -             "accept": "application/json",
 -             "Content-Type": "application/json",
 -             "Authorization": f"Bearer {key}",
 -         }
 - 
 -     def similarity(self, query: str, texts: list):
 -         token_count = num_tokens_from_string(query) + sum([num_tokens_from_string(t) for t in texts])
 -         data = {
 -             "model": self.model_name,
 -             "query": {"text": query},
 -             "passages": [{"text": text} for text in texts],
 -             "truncate": "END",
 -             "top_n": len(texts),
 -         }
 -         res = requests.post(self.base_url, headers=self.headers, json=data).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["rankings"]:
 -                 rank[d["index"]] = d["logit"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, token_count
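 - 
 -     # Note: this endpoint returns a raw "logit" per passage rather than a normalized
 -     # relevance_score, so the values are not constrained to the [0, 1] range.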
 - 
 - 
 - class LmStudioRerank(Base):
 -     _FACTORY_NAME = "LM-Studio"
 - 
 -     def __init__(self, key, model_name, base_url, **kwargs):
 -         pass
 - 
 -     def similarity(self, query: str, texts: list):
 -         raise NotImplementedError("LmStudioRerank has not been implemented yet!")
 - 
 - 
 - class OpenAI_APIRerank(Base):
 -     _FACTORY_NAME = "OpenAI-API-Compatible"
 - 
 -     def __init__(self, key, model_name, base_url):
 -         if base_url.find("/rerank") == -1:
 -             self.base_url = urljoin(base_url, "/rerank")
 -         else:
 -             self.base_url = base_url
 -         self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
 -         self.model_name = model_name.split("___")[0]
 - 
 -     def similarity(self, query: str, texts: list):
 -         # No way to configure this from RAGFlow, so use a fixed truncation length.
 -         texts = [truncate(t, 500) for t in texts]
 -         data = {
 -             "model": self.model_name,
 -             "query": query,
 -             "documents": texts,
 -             "top_n": len(texts),
 -         }
 -         token_count = 0
 -         for t in texts:
 -             token_count += num_tokens_from_string(t)
 -         res = requests.post(self.base_url, headers=self.headers, json=data).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 - 
 -         # Normalize the rank values to the range 0 to 1
 -         min_rank = np.min(rank)
 -         max_rank = np.max(rank)
 - 
 -         # Avoid division by zero if all ranks are identical
 -         if not np.isclose(min_rank, max_rank, atol=1e-3):
 -             rank = (rank - min_rank) / (max_rank - min_rank)
 -         else:
 -             rank = np.zeros_like(rank)
 - 
 -         return rank, token_count
 - 
 - 
 - class CoHereRerank(Base):
 -     _FACTORY_NAME = ["Cohere", "VLLM"]
 - 
 -     def __init__(self, key, model_name, base_url=None):
 -         from cohere import Client
 - 
 -         self.client = Client(api_key=key, base_url=base_url)
 -         self.model_name = model_name.split("___")[0]
 - 
 -     def similarity(self, query: str, texts: list):
 -         token_count = num_tokens_from_string(query) + sum([num_tokens_from_string(t) for t in texts])
 -         res = self.client.rerank(
 -             model=self.model_name,
 -             query=query,
 -             documents=texts,
 -             top_n=len(texts),
 -             return_documents=False,
 -         )
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res.results:
 -                 rank[d.index] = d.relevance_score
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, token_count
 - 
 - 
 - class TogetherAIRerank(Base):
 -     _FACTORY_NAME = "TogetherAI"
 - 
 -     def __init__(self, key, model_name, base_url, **kwargs):
 -         pass
 - 
 -     def similarity(self, query: str, texts: list):
 -         raise NotImplementedError("The TogetherAI rerank API has not been implemented yet!")
 - 
 - 
 - class SILICONFLOWRerank(Base):
 -     _FACTORY_NAME = "SILICONFLOW"
 - 
 -     def __init__(self, key, model_name, base_url="https://api.siliconflow.cn/v1/rerank"):
 -         if not base_url:
 -             base_url = "https://api.siliconflow.cn/v1/rerank"
 -         self.model_name = model_name
 -         self.base_url = base_url
 -         self.headers = {
 -             "accept": "application/json",
 -             "content-type": "application/json",
 -             "authorization": f"Bearer {key}",
 -         }
 - 
 -     def similarity(self, query: str, texts: list):
 -         payload = {
 -             "model": self.model_name,
 -             "query": query,
 -             "documents": texts,
 -             "top_n": len(texts),
 -             "return_documents": False,
 -             "max_chunks_per_doc": 1024,
 -             "overlap_tokens": 80,
 -         }
 -         response = requests.post(self.base_url, json=payload, headers=self.headers).json()
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in response["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, response)
 -         return (
 -             rank,
 -             response["meta"]["tokens"]["input_tokens"] + response["meta"]["tokens"]["output_tokens"],
 -         )
 - 
 - 
 - class BaiduYiyanRerank(Base):
 -     _FACTORY_NAME = "BaiduYiyan"
 - 
 -     def __init__(self, key, model_name, base_url=None):
 -         from qianfan.resources import Reranker
 - 
 -         key = json.loads(key)
 -         ak = key.get("yiyan_ak", "")
 -         sk = key.get("yiyan_sk", "")
 -         self.client = Reranker(ak=ak, sk=sk)
 -         self.model_name = model_name
 - 
 -     def similarity(self, query: str, texts: list):
 -         res = self.client.do(
 -             model=self.model_name,
 -             query=query,
 -             documents=texts,
 -             top_n=len(texts),
 -         ).body
 -         rank = np.zeros(len(texts), dtype=float)
 -         try:
 -             for d in res["results"]:
 -                 rank[d["index"]] = d["relevance_score"]
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, self.total_token_count(res)
 - 
 - 
 - class VoyageRerank(Base):
 -     _FACTORY_NAME = "Voyage AI"
 - 
 -     def __init__(self, key, model_name, base_url=None):
 -         import voyageai
 - 
 -         self.client = voyageai.Client(api_key=key)
 -         self.model_name = model_name
 - 
 -     def similarity(self, query: str, texts: list):
 -         if not texts:
 -             return np.array([]), 0
 -         rank = np.zeros(len(texts), dtype=float)
 - 
 -         res = self.client.rerank(query=query, documents=texts, model=self.model_name, top_k=len(texts))
 -         try:
 -             for r in res.results:
 -                 rank[r.index] = r.relevance_score
 -         except Exception as _e:
 -             log_exception(_e, res)
 -         return rank, res.total_tokens
 - 
 - 
 - class QWenRerank(Base):
 -     _FACTORY_NAME = "Tongyi-Qianwen"
 - 
 -     def __init__(self, key, model_name="gte-rerank", base_url=None, **kwargs):
 -         import dashscope
 - 
 -         self.api_key = key
 -         self.model_name = dashscope.TextReRank.Models.gte_rerank if model_name is None else model_name
 - 
 -     def similarity(self, query: str, texts: list):
 -         from http import HTTPStatus
 - 
 -         import dashscope
 - 
 -         resp = dashscope.TextReRank.call(api_key=self.api_key, model=self.model_name, query=query, documents=texts, top_n=len(texts), return_documents=False)
 -         rank = np.zeros(len(texts), dtype=float)
 -         if resp.status_code == HTTPStatus.OK:
 -             try:
 -                 for r in resp.output.results:
 -                     rank[r.index] = r.relevance_score
 -             except Exception as _e:
 -                 log_exception(_e, resp)
 -             return rank, resp.usage.total_tokens
 -         else:
 -             raise ValueError(f"Error calling QWenRerank model {self.model_name}: {resp.status_code} - {resp.message}")
 - 
 - 
 - class HuggingfaceRerank(DefaultRerank):
 -     _FACTORY_NAME = "HuggingFace"
 - 
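 -     # The payload shape below (query / texts / raw_scores / truncate posted to /rerank)
 -     # matches a HuggingFace text-embeddings-inference (TEI) style reranker endpoint;
 -     # this association is inferred from the request fields, not stated in the code.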
 -     @staticmethod
 -     def post(query: str, texts: list, url="127.0.0.1"):
 -         exc = None
 -         scores = [0 for _ in range(len(texts))]
 -         batch_size = 8
 -         # `url` may arrive with a scheme (the default base_url is "http://127.0.0.1");
 -         # strip it so the f-string below does not produce "http://http://...".
 -         if "://" in url:
 -             url = url.split("://", 1)[1]
 -         for i in range(0, len(texts), batch_size):
 -             try:
 -                 res = requests.post(
 -                     f"http://{url}/rerank", headers={"Content-Type": "application/json"}, json={"query": query, "texts": texts[i : i + batch_size], "raw_scores": False, "truncate": True}
 -                 )
 - 
 -                 for o in res.json():
 -                     scores[o["index"] + i] = o["score"]
 -             except Exception as e:
 -                 exc = e
 - 
 -         if exc:
 -             raise exc
 -         return np.array(scores)
 - 
 -     def __init__(self, key, model_name="BAAI/bge-reranker-v2-m3", base_url="http://127.0.0.1"):
 -         self.model_name = model_name.split("___")[0]
 -         self.base_url = base_url
 - 
 -     def similarity(self, query: str, texts: list) -> tuple[np.ndarray, int]:
 -         if not texts:
 -             return np.array([]), 0
 -         token_count = 0
 -         for t in texts:
 -             token_count += num_tokens_from_string(t)
 -         return HuggingfaceRerank.post(query, texts, self.base_url), token_count
 - 
 - 
 - class GPUStackRerank(Base):
 -     _FACTORY_NAME = "GPUStack"
 - 
 -     def __init__(self, key, model_name, base_url):
 -         if not base_url:
 -             raise ValueError("url cannot be None")
 - 
 -         self.model_name = model_name
 -         self.base_url = str(URL(base_url) / "v1" / "rerank")
 -         self.headers = {
 -             "accept": "application/json",
 -             "content-type": "application/json",
 -             "authorization": f"Bearer {key}",
 -         }
 - 
 -     def similarity(self, query: str, texts: list):
 -         payload = {
 -             "model": self.model_name,
 -             "query": query,
 -             "documents": texts,
 -             "top_n": len(texts),
 -         }
 - 
 -         try:
 -             response = requests.post(self.base_url, json=payload, headers=self.headers)
 -             response.raise_for_status()
 -             response_json = response.json()
 - 
 -             rank = np.zeros(len(texts), dtype=float)
 - 
 -             token_count = 0
 -             for t in texts:
 -                 token_count += num_tokens_from_string(t)
 -             try:
 -                 for result in response_json["results"]:
 -                     rank[result["index"]] = result["relevance_score"]
 -             except Exception as _e:
 -                 log_exception(_e, response)
 - 
 -             return (
 -                 rank,
 -                 token_count,
 -             )
 - 
 -         except (httpx.HTTPStatusError, requests.exceptions.HTTPError) as e:
 -             raise ValueError(f"Error calling GPUStackRerank model {self.model_name}: {e.response.status_code} - {e.response.text}")
 - 
 - 
 - class NovitaRerank(JinaRerank):
 -     _FACTORY_NAME = "NovitaAI"
 - 
 -     def __init__(self, key, model_name, base_url="https://api.novita.ai/v3/openai/rerank"):
 -         if not base_url:
 -             base_url = "https://api.novita.ai/v3/openai/rerank"
 -         super().__init__(key, model_name, base_url)
 - 
 - 
 - class GiteeRerank(JinaRerank):
 -     _FACTORY_NAME = "GiteeAI"
 - 
 -     def __init__(self, key, model_name, base_url="https://ai.gitee.com/v1/rerank"):
 -         if not base_url:
 -             base_url = "https://ai.gitee.com/v1/rerank"
 -         super().__init__(key, model_name, base_url)
 - 
 - 
 - class Ai302Rerank(JinaRerank):
 -     _FACTORY_NAME = "302.AI"
 - 
 -     def __init__(self, key, model_name, base_url="https://api.302.ai/v1/rerank"):
 -         if not base_url:
 -             base_url = "https://api.302.ai/v1/rerank"
 -         super().__init__(key, model_name, base_url)
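 - 
 - # NovitaRerank, GiteeRerank and Ai302Rerank above reuse JinaRerank's request and
 - # response handling and only swap in their own default endpoints.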
 
 