import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService
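

# Thin service layer over the supported website crawling/scraping providers
# (Firecrawl, WaterCrawl, Jina Reader). Credentials are loaded per tenant via
# ApiKeyAuthService and decrypted before each provider call.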
class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")
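
    # Starts a crawl with the selected provider using the tenant's decrypted API key.
    # Firecrawl records the start time in Redis and returns a job_id; WaterCrawl
    # delegates to its provider class; Jina Reader either reads the single URL
    # directly or launches an adaptive crawl job.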
    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        provider = args.get("provider", "")
        url = args.get("url")
        options = args.get("options", "")
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                params = {
                    "includePaths": [],
                    "excludePaths": [],
                    "limit": 1,
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "includePaths": includes,
                    "excludePaths": excludes,
                    "limit": options.get("limit", 1),
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
                if options.get("max_depth"):
                    params["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "watercrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
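
    # Reports the progress of a crawl job. For Firecrawl, the elapsed time is
    # derived from the start timestamp cached in Redis; for Jina Reader, completed
    # results are fetched and normalized to title/source_url/description/markdown.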
    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "watercrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_status_data = WaterCrawlProvider(
                api_key, credentials.get("config").get("base_url", None)
            ).get_crawl_status(job_id)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
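
    # Returns the crawled content for a single URL: Firecrawl results come from
    # storage or a completed job, WaterCrawl is delegated to its provider class,
    # and Jina Reader either reads the URL directly (no job_id) or matches it
    # against the job's processed results.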
    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME data is redefine too many times here, use Any to ease the type checking, fix it later
        data: Any
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "watercrawl":
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data(
                job_id, url
            )
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")
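
    # Scrapes one URL synchronously; only the Firecrawl and WaterCrawl providers
    # support this path.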
    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        elif provider == "watercrawl":
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url)
        else:
            raise ValueError("Invalid provider")
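

# A minimal usage sketch (not part of the original module), assuming a Flask
# request context with an authenticated current_user and Firecrawl credentials
# already saved for the tenant via ApiKeyAuthService:
#
#     args = {
#         "provider": "firecrawl",
#         "url": "https://example.com",
#         "options": {
#             "crawl_sub_pages": True,
#             "limit": 10,
#             "only_main_content": True,
#             "includes": "docs/*,blog/*",
#             "excludes": "",
#             "max_depth": 2,
#         },
#     }
#     WebsiteService.document_create_args_validate(args)
#     job = WebsiteService.crawl_url(args)  # {"status": "active", "job_id": ...}
#     status = WebsiteService.get_crawl_status(job["job_id"], "firecrawl")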