import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        """Validate a document-creation payload: url, options and options.limit are required."""
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        """Start a crawl job for the given url with the configured provider (firecrawl or jinareader)."""
        provider = args.get("provider", "")
        url = args.get("url")
        options = args.get("options", "")
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                params = {
                    "includePaths": [],
                    "excludePaths": [],
                    "limit": 1,
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "includePaths": includes,
                    "excludePaths": excludes,
                    "limit": options.get("limit", 1),
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
                if options.get("max_depth"):
                    params["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            # remember when the job started so get_crawl_status can report elapsed time
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        """Return the current status of a crawl job, including the crawled pages once it completes."""
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                # report elapsed time based on the start timestamp cached by crawl_url
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        """Fetch the crawled data for a single url from a finished crawl job, or None if it is not found."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME: data is redefined too many times here, use Any to ease the type checking, fix it later
        data: Any
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        """Scrape a single url synchronously via Firecrawl and return the scrape result."""
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
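
For orientation, a minimal usage sketch follows. The args payload, the polling loop, and the sleep interval are illustrative assumptions, not part of this file; the sketch only shows how the class methods fit together, and it assumes a Flask request context with an authenticated current_user whose tenant already has Firecrawl credentials stored via ApiKeyAuthService.

import time

# Hypothetical payload for a Firecrawl crawl limited to 5 sub-pages.
args = {
    "provider": "firecrawl",
    "url": "https://example.com",
    "options": {"crawl_sub_pages": True, "limit": 5, "only_main_content": True},
}

# Validate the payload, start the crawl, then poll until the provider reports completion.
WebsiteService.document_create_args_validate(args)
job = WebsiteService.crawl_url(args)  # returns {"status": "active", "job_id": "..."}

status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")
while status["status"] != "completed":
    time.sleep(2)
    status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")

# Each entry in status["data"] describes one crawled page.
for page in status["data"]:
    print(page.get("source_url"), page.get("title"))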