Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

external_knowledge_service.py 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. import json
  2. from copy import deepcopy
  3. from datetime import UTC, datetime
  4. from typing import Any, Optional, Union, cast
  5. from urllib.parse import urlparse
  6. import httpx
  7. from constants import HIDDEN_VALUE
  8. from core.helper import ssrf_proxy
  9. from core.rag.entities.metadata_entities import MetadataCondition
  10. from extensions.ext_database import db
  11. from models.dataset import (
  12. Dataset,
  13. ExternalKnowledgeApis,
  14. ExternalKnowledgeBindings,
  15. )
  16. from services.entities.external_knowledge_entities.external_knowledge_entities import (
  17. Authorization,
  18. ExternalKnowledgeApiSetting,
  19. )
  20. from services.errors.dataset import DatasetNameDuplicateError
  21. class ExternalDatasetService:
  22. @staticmethod
  23. def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
  24. query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
  25. ExternalKnowledgeApis.created_at.desc()
  26. )
  27. if search:
  28. query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))
  29. external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
  30. return external_knowledge_apis.items, external_knowledge_apis.total
  31. @classmethod
  32. def validate_api_list(cls, api_settings: dict):
  33. if not api_settings:
  34. raise ValueError("api list is empty")
  35. if "endpoint" not in api_settings and not api_settings["endpoint"]:
  36. raise ValueError("endpoint is required")
  37. if "api_key" not in api_settings and not api_settings["api_key"]:
  38. raise ValueError("api_key is required")
  39. @staticmethod
  40. def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
  41. settings = args.get("settings")
  42. if settings is None:
  43. raise ValueError("settings is required")
  44. ExternalDatasetService.check_endpoint_and_api_key(settings)
  45. external_knowledge_api = ExternalKnowledgeApis(
  46. tenant_id=tenant_id,
  47. created_by=user_id,
  48. updated_by=user_id,
  49. name=args.get("name"),
  50. description=args.get("description", ""),
  51. settings=json.dumps(args.get("settings"), ensure_ascii=False),
  52. )
  53. db.session.add(external_knowledge_api)
  54. db.session.commit()
  55. return external_knowledge_api
  56. @staticmethod
  57. def check_endpoint_and_api_key(settings: dict):
  58. if "endpoint" not in settings or not settings["endpoint"]:
  59. raise ValueError("endpoint is required")
  60. if "api_key" not in settings or not settings["api_key"]:
  61. raise ValueError("api_key is required")
  62. endpoint = f"{settings['endpoint']}/retrieval"
  63. api_key = settings["api_key"]
  64. parsed_url = urlparse(endpoint)
  65. if not all([parsed_url.scheme, parsed_url.netloc]):
  66. if not endpoint.startswith("http://") and not endpoint.startswith("https://"):
  67. raise ValueError(f"invalid endpoint: {endpoint} must start with http:// or https://")
  68. else:
  69. raise ValueError(f"invalid endpoint: {endpoint}")
  70. try:
  71. response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
  72. except Exception as e:
  73. raise ValueError(f"failed to connect to the endpoint: {endpoint}")
  74. if response.status_code == 502:
  75. raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
  76. if response.status_code == 404:
  77. raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
  78. if response.status_code == 403:
  79. raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")
  80. @staticmethod
  81. def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
  82. external_knowledge_api: Optional[ExternalKnowledgeApis] = ExternalKnowledgeApis.query.filter_by(
  83. id=external_knowledge_api_id
  84. ).first()
  85. if external_knowledge_api is None:
  86. raise ValueError("api template not found")
  87. return external_knowledge_api
  88. @staticmethod
  89. def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
  90. external_knowledge_api: Optional[ExternalKnowledgeApis] = ExternalKnowledgeApis.query.filter_by(
  91. id=external_knowledge_api_id, tenant_id=tenant_id
  92. ).first()
  93. if external_knowledge_api is None:
  94. raise ValueError("api template not found")
  95. if args.get("settings") and args.get("settings").get("api_key") == HIDDEN_VALUE:
  96. args.get("settings")["api_key"] = external_knowledge_api.settings_dict.get("api_key")
  97. external_knowledge_api.name = args.get("name")
  98. external_knowledge_api.description = args.get("description", "")
  99. external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
  100. external_knowledge_api.updated_by = user_id
  101. external_knowledge_api.updated_at = datetime.now(UTC).replace(tzinfo=None)
  102. db.session.commit()
  103. return external_knowledge_api
  104. @staticmethod
  105. def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
  106. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  107. id=external_knowledge_api_id, tenant_id=tenant_id
  108. ).first()
  109. if external_knowledge_api is None:
  110. raise ValueError("api template not found")
  111. db.session.delete(external_knowledge_api)
  112. db.session.commit()
  113. @staticmethod
  114. def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
  115. count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
  116. if count > 0:
  117. return True, count
  118. return False, 0
  119. @staticmethod
  120. def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
  121. external_knowledge_binding: Optional[ExternalKnowledgeBindings] = ExternalKnowledgeBindings.query.filter_by(
  122. dataset_id=dataset_id, tenant_id=tenant_id
  123. ).first()
  124. if not external_knowledge_binding:
  125. raise ValueError("external knowledge binding not found")
  126. return external_knowledge_binding
  127. @staticmethod
  128. def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
  129. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  130. id=external_knowledge_api_id, tenant_id=tenant_id
  131. ).first()
  132. if external_knowledge_api is None:
  133. raise ValueError("api template not found")
  134. settings = json.loads(external_knowledge_api.settings)
  135. for setting in settings:
  136. custom_parameters = setting.get("document_process_setting")
  137. if custom_parameters:
  138. for parameter in custom_parameters:
  139. if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
  140. raise ValueError(f"{parameter.get('name')} is required")
  141. @staticmethod
  142. def process_external_api(
  143. settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
  144. ) -> httpx.Response:
  145. """
  146. do http request depending on api bundle
  147. """
  148. kwargs = {
  149. "url": settings.url,
  150. "headers": settings.headers,
  151. "follow_redirects": True,
  152. }
  153. response: httpx.Response = getattr(ssrf_proxy, settings.request_method)(
  154. data=json.dumps(settings.params), files=files, **kwargs
  155. )
  156. return response
  157. @staticmethod
  158. def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
  159. authorization = deepcopy(authorization)
  160. if headers:
  161. headers = deepcopy(headers)
  162. else:
  163. headers = {}
  164. if authorization.type == "api-key":
  165. if authorization.config is None:
  166. raise ValueError("authorization config is required")
  167. if authorization.config.api_key is None:
  168. raise ValueError("api_key is required")
  169. if not authorization.config.header:
  170. authorization.config.header = "Authorization"
  171. if authorization.config.type == "bearer":
  172. headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
  173. elif authorization.config.type == "basic":
  174. headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
  175. elif authorization.config.type == "custom":
  176. headers[authorization.config.header] = authorization.config.api_key
  177. return headers
  178. @staticmethod
  179. def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
  180. return ExternalKnowledgeApiSetting.parse_obj(settings)
  181. @staticmethod
  182. def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
  183. # check if dataset name already exists
  184. if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
  185. raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
  186. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  187. id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
  188. ).first()
  189. if external_knowledge_api is None:
  190. raise ValueError("api template not found")
  191. dataset = Dataset(
  192. tenant_id=tenant_id,
  193. name=args.get("name"),
  194. description=args.get("description", ""),
  195. provider="external",
  196. retrieval_model=args.get("external_retrieval_model"),
  197. created_by=user_id,
  198. )
  199. db.session.add(dataset)
  200. db.session.flush()
  201. external_knowledge_binding = ExternalKnowledgeBindings(
  202. tenant_id=tenant_id,
  203. dataset_id=dataset.id,
  204. external_knowledge_api_id=args.get("external_knowledge_api_id"),
  205. external_knowledge_id=args.get("external_knowledge_id"),
  206. created_by=user_id,
  207. )
  208. db.session.add(external_knowledge_binding)
  209. db.session.commit()
  210. return dataset
  211. @staticmethod
  212. def fetch_external_knowledge_retrieval(
  213. tenant_id: str,
  214. dataset_id: str,
  215. query: str,
  216. external_retrieval_parameters: dict,
  217. metadata_condition: Optional[MetadataCondition] = None,
  218. ) -> list:
  219. external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
  220. dataset_id=dataset_id, tenant_id=tenant_id
  221. ).first()
  222. if not external_knowledge_binding:
  223. raise ValueError("external knowledge binding not found")
  224. external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
  225. id=external_knowledge_binding.external_knowledge_api_id
  226. ).first()
  227. if not external_knowledge_api:
  228. raise ValueError("external api template not found")
  229. settings = json.loads(external_knowledge_api.settings)
  230. headers = {"Content-Type": "application/json"}
  231. if settings.get("api_key"):
  232. headers["Authorization"] = f"Bearer {settings.get('api_key')}"
  233. score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False
  234. score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0
  235. request_params = {
  236. "retrieval_setting": {
  237. "top_k": external_retrieval_parameters.get("top_k"),
  238. "score_threshold": score_threshold,
  239. },
  240. "query": query,
  241. "knowledge_id": external_knowledge_binding.external_knowledge_id,
  242. "metadata_condition": metadata_condition.model_dump() if metadata_condition else None,
  243. }
  244. response = ExternalDatasetService.process_external_api(
  245. ExternalKnowledgeApiSetting(
  246. url=f"{settings.get('endpoint')}/retrieval",
  247. request_method="post",
  248. headers=headers,
  249. params=request_params,
  250. ),
  251. None,
  252. )
  253. if response.status_code == 200:
  254. return cast(list[Any], response.json().get("records", []))
  255. return []