website_service.py

import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        """Validate the payload used to create documents from a website crawl."""
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")
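
    # Example of an `args` payload that passes the validation above and that
    # crawl_url() below consumes (values are illustrative only):
    #   {
    #       "provider": "firecrawl",
    #       "url": "https://example.com",
    #       "options": {"limit": 5, "crawl_sub_pages": True, "only_main_content": True},
    #   }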

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        """Start a crawl job for the given URL with the configured provider."""
        provider = args.get("provider", "")
        url = args.get("url")
        options = args.get("options", "")
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                params = {
                    "crawlerOptions": {
                        "includes": [],
                        "excludes": [],
                        "generateImgAltText": True,
                        "limit": 1,
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "crawlerOptions": {
                        "includes": includes,
                        "excludes": excludes,
                        "generateImgAltText": True,
                        "limit": options.get("limit", 1),
                        "returnOnlyUrls": False,
                        "pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False},
                    }
                }
                if options.get("max_depth"):
                    params["crawlerOptions"]["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
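
    # Return shapes produced by crawl_url() above:
    #   firecrawl                   -> {"status": "active", "job_id": <Firecrawl job id>}
    #   jinareader, single page     -> {"status": "active", "data": <page payload>}
    #   jinareader, sub-page crawl  -> {"status": "active", "job_id": <Jina Reader task id>}
    # The firecrawl branch also stores the start timestamp in Redis under
    # f"website_crawl_{job_id}" (1 hour TTL) so get_crawl_status() can report
    # how long the crawl took.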

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        """Return the progress of a crawl job for the configured provider."""
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
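
    # get_crawl_status() always returns status, job_id, total, current and
    # data; "time_consuming" is added once the job has completed (derived from
    # the Redis timestamp for firecrawl, reported by the API for jinareader).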

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        """Return the crawled content of a single URL from a finished crawl job."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME: data is redefined too many times here; use Any to ease the type checking, fix it later
        data: Any
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")
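
    # get_crawl_url_data() returns the stored record whose URL matches `url`,
    # or None when the crawl produced no entry for it: firecrawl items are
    # matched on "source_url", jinareader items on the "url" field of the
    # processed item's data.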

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        """Scrape a single URL immediately, without creating a crawl job."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"pageOptions": {"onlyMainContent": only_main_content, "includeHtml": False}}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
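

# ---------------------------------------------------------------------------
# Usage sketch (not part of the service): a minimal end-to-end flow, assuming
# Firecrawl credentials are already stored for the tenant via
# ApiKeyAuthService and that the calls run inside a request context where
# flask_login.current_user is available. The helper name, polling interval
# and option values are illustrative, not part of the original module.
# ---------------------------------------------------------------------------
def _example_firecrawl_flow(url: str, tenant_id: str) -> dict[Any, Any] | None:
    """Start a crawl, poll until it finishes, then fetch the data for `url`."""
    import time  # local import so the sketch stays self-contained

    args = {
        "provider": "firecrawl",
        "url": url,
        "options": {"limit": 5, "crawl_sub_pages": True, "only_main_content": True},
    }
    WebsiteService.document_create_args_validate(args)
    job = WebsiteService.crawl_url(args)

    # Poll the job until Firecrawl reports it as completed, then look up the
    # crawled content for the original URL.
    for _ in range(60):
        status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")
        if status["status"] == "completed":
            return WebsiteService.get_crawl_url_data(job["job_id"], "firecrawl", url, tenant_id)
        time.sleep(5)
    raise TimeoutError("crawl did not complete within the polling window")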