Co-authored-by: crazywoola <427733928@qq.com>
@@ -14,7 +14,12 @@ class WebsiteCrawlApi(Resource):
     def post(self):
         parser = reqparse.RequestParser()
         parser.add_argument(
-            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+            "provider",
+            type=str,
+            choices=["firecrawl", "watercrawl", "jinareader"],
+            required=True,
+            nullable=True,
+            location="json",
         )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
@@ -34,7 +39,9 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], required=True, location="args"
+        )
        args = parser.parse_args()
        # get crawl status
        try:
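For reference, a minimal sketch of what a crawl request against this controller now looks like. The host, route prefix, auth header, and option keys are illustrative assumptions; only `provider`, `url`, and `options` are required by the parsers above.

```python
import requests

BASE = "http://localhost:5001/console/api"  # hypothetical deployment host

resp = requests.post(
    f"{BASE}/website/crawl",
    headers={"Authorization": "Bearer <console-token>"},  # placeholder token
    json={
        "provider": "watercrawl",  # now accepted alongside "firecrawl" and "jinareader"
        "url": "https://example.com",
        "options": {"crawl_sub_pages": True, "limit": 10},
    },
)
resp.raise_for_status()
job_id = resp.json()["job_id"]

# Poll the status endpoint, passing the provider as a query argument.
status = requests.get(
    f"{BASE}/website/crawl/status/{job_id}",
    params={"provider": "watercrawl"},
    headers={"Authorization": "Bearer <console-token>"},
).json()
```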
@@ -26,6 +26,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import Unstructu
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
 from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
+from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
 from core.rag.extractor.word_extractor import WordExtractor
 from core.rag.models.document import Document
 from extensions.ext_storage import storage
@@ -183,6 +184,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "watercrawl":
+                extractor = WaterCrawlWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             elif extract_setting.website_info.provider == "jinareader":
                 extractor = JinaReaderWebExtractor(
                     url=extract_setting.website_info.url,
@@ -0,0 +1,161 @@
+import json
+from collections.abc import Generator
+from typing import Union
+from urllib.parse import urljoin
+
+import requests
+from requests import Response
+
+
+class BaseAPIClient:
+    def __init__(self, api_key, base_url):
+        self.api_key = api_key
+        self.base_url = base_url
+        self.session = self.init_session()
+
+    def init_session(self):
+        session = requests.Session()
+        session.headers.update({"X-API-Key": self.api_key})
+        session.headers.update({"Content-Type": "application/json"})
+        session.headers.update({"Accept": "application/json"})
+        session.headers.update({"User-Agent": "WaterCrawl-Plugin"})
+        session.headers.update({"Accept-Language": "en-US"})
+        return session
+
+    def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
+        return self.session.get(urljoin(self.base_url, endpoint), params=query_params, **kwargs)
+
+    def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
+        return self.session.post(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)
+
+    def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
+        return self.session.put(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)
+
+    def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
+        return self.session.delete(urljoin(self.base_url, endpoint), params=query_params, **kwargs)
+
+    def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
+        return self.session.patch(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)
+
+
+class WaterCrawlAPIClient(BaseAPIClient):
+    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
+        # fall back to the default when base_url is passed explicitly as None
+        super().__init__(api_key, base_url or "https://app.watercrawl.dev/")
+
+    def process_eventstream(self, response: Response, download: bool = False) -> Generator:
+        for line in response.iter_lines():
+            line = line.decode("utf-8")
+            if line.startswith("data:"):
+                line = line[5:].strip()
+                data = json.loads(line)
+                if data["type"] == "result" and download:
+                    data["data"] = self.download_result(data["data"])
+                yield data
+
+    def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
+        response.raise_for_status()
+        if response.status_code == 204:
+            return None
+        if response.headers.get("Content-Type") == "application/json":
+            return response.json() or {}
+        if response.headers.get("Content-Type") == "application/octet-stream":
+            return response.content
+        if response.headers.get("Content-Type") == "text/event-stream":
+            return self.process_eventstream(response)
+        raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")
+
+    def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
+        query_params = {"page": page or 1, "page_size": page_size or 10}
+        return self.process_response(
+            self._get(
+                "/api/v1/core/crawl-requests/",
+                query_params=query_params,
+            )
+        )
+
+    def get_crawl_request(self, item_id: str):
+        return self.process_response(
+            self._get(
+                f"/api/v1/core/crawl-requests/{item_id}/",
+            )
+        )
+
+    def create_crawl_request(
+        self,
+        url: Union[list, str] | None = None,
+        spider_options: dict | None = None,
+        page_options: dict | None = None,
+        plugin_options: dict | None = None,
+    ):
+        data = {
+            # 'urls': url if isinstance(url, list) else [url],
+            "url": url,
+            "options": {
+                "spider_options": spider_options or {},
+                "page_options": page_options or {},
+                "plugin_options": plugin_options or {},
+            },
+        }
+        return self.process_response(
+            self._post(
+                "/api/v1/core/crawl-requests/",
+                data=data,
+            )
+        )
+
+    def stop_crawl_request(self, item_id: str):
+        return self.process_response(
+            self._delete(
+                f"/api/v1/core/crawl-requests/{item_id}/",
+            )
+        )
+
+    def download_crawl_request(self, item_id: str):
+        return self.process_response(
+            self._get(
+                f"/api/v1/core/crawl-requests/{item_id}/download/",
+            )
+        )
+
+    def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
+        query_params = {"prefetched": str(prefetched).lower()}
+        generator = self.process_response(
+            self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
+        )
+        if not isinstance(generator, Generator):
+            raise ValueError("Generator expected")
+        yield from generator
+
+    def get_crawl_request_results(
+        self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
+    ):
+        query_params = query_params or {}
+        query_params.update({"page": page or 1, "page_size": page_size or 25})
+        return self.process_response(
+            self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
+        )
+
+    def scrape_url(
+        self,
+        url: str,
+        page_options: dict | None = None,
+        plugin_options: dict | None = None,
+        sync: bool = True,
+        prefetched: bool = True,
+    ):
+        response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
+        if not sync:
+            return response_result
+
+        for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
+            if event_data["type"] == "result":
+                return event_data["data"]
+
+    def download_result(self, result_object: dict):
+        response = requests.get(result_object["result"])
+        response.raise_for_status()
+        result_object["result"] = response.json()
+        return result_object
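The client can also be driven directly; a sketch (the API key is a placeholder, the methods and endpoints are exactly those defined above):

```python
from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient

client = WaterCrawlAPIClient(api_key="wc-...", base_url="https://app.watercrawl.dev/")

# Asynchronous style: create a crawl request, then watch its event stream.
request = client.create_crawl_request(
    url="https://example.com",
    spider_options={"max_depth": 1, "page_limit": 5},
    page_options={"only_main_content": True},
)
for event in client.monitor_crawl_request(request["uuid"], prefetched=True):
    if event["type"] == "result":
        print(event["data"]["url"])

# Synchronous style: block until the single-page result arrives.
result = client.scrape_url("https://example.com", sync=True, prefetched=True)
```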
@@ -0,0 +1,57 @@
+from core.rag.extractor.extractor_base import BaseExtractor
+from core.rag.models.document import Document
+from services.website_service import WebsiteService
+
+
+class WaterCrawlWebExtractor(BaseExtractor):
+    """
+    Crawl and scrape websites and return content in clean, LLM-ready markdown.
+
+    Args:
+        url: The URL to scrape.
+        job_id: The crawl job ID, used to look up crawled data in 'crawl' mode.
+        tenant_id: The tenant the WaterCrawl credentials belong to.
+        mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
+        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
+    """
+
+    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
+        """Initialize with url, job_id, tenant_id and mode."""
+        self._url = url
+        self.job_id = job_id
+        self.tenant_id = tenant_id
+        self.mode = mode
+        self.only_main_content = only_main_content
+
+    def extract(self) -> list[Document]:
+        """Extract content from the URL."""
+        documents = []
+        if self.mode == "crawl":
+            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
+            if crawl_data is None:
+                return []
+            document = Document(
+                page_content=crawl_data.get("markdown", ""),
+                metadata={
+                    "source_url": crawl_data.get("source_url"),
+                    "description": crawl_data.get("description"),
+                    "title": crawl_data.get("title"),
+                },
+            )
+            documents.append(document)
+        elif self.mode == "scrape":
+            scrape_data = WebsiteService.get_scrape_url_data(
+                "watercrawl", self._url, self.tenant_id, self.only_main_content
+            )
+            document = Document(
+                page_content=scrape_data.get("markdown", ""),
+                metadata={
+                    "source_url": scrape_data.get("source_url"),
+                    "description": scrape_data.get("description"),
+                    "title": scrape_data.get("title"),
+                },
+            )
+            documents.append(document)
+        return documents
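This is what the `watercrawl` branch in `ExtractProcessor` above boils down to, reduced to a sketch (the IDs are placeholders):

```python
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor

extractor = WaterCrawlWebExtractor(
    url="https://example.com",
    job_id="<job-uuid>",      # the uuid returned by WaterCrawlProvider.crawl_url
    tenant_id="<tenant-id>",
    mode="crawl",
    only_main_content=True,
)
docs = extractor.extract()
# Each Document carries the page markdown plus the metadata mapped above:
# docs[0].page_content            -> markdown body
# docs[0].metadata["source_url"]  -> crawled page URL
# docs[0].metadata["title"]       -> page title from the crawl result
```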
@@ -0,0 +1,117 @@
+from collections.abc import Generator
+from datetime import datetime
+
+from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient
+
+
+class WaterCrawlProvider:
+    def __init__(self, api_key, base_url: str | None = None):
+        self.client = WaterCrawlAPIClient(api_key, base_url)
+
+    def crawl_url(self, url, options: dict | None = None) -> dict:
+        options = options or {}
+        spider_options = {
+            "max_depth": 1,
+            "page_limit": 1,
+            "allowed_domains": [],
+            "exclude_paths": [],
+            "include_paths": [],
+        }
+        if options.get("crawl_sub_pages", True):
+            spider_options["page_limit"] = options.get("limit", 1)
+            spider_options["max_depth"] = options.get("depth", 1)
+            spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
+            spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []
+
+        wait_time = options.get("wait_time", 1000)
+        page_options = {
+            "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
+            "include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
+            "wait_time": max(1000, wait_time),  # minimum wait time is 1 second
+            "include_html": False,
+            "only_main_content": options.get("only_main_content", True),
+            "include_links": False,
+            "timeout": 15000,
+            "accept_cookies_selector": "#cookies-accept",
+            "locale": "en-US",
+            "actions": [],
+        }
+        result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)
+        return {"status": "active", "job_id": result.get("uuid")}
+
+    def get_crawl_status(self, crawl_request_id) -> dict:
+        response = self.client.get_crawl_request(crawl_request_id)
+        data = []
+        if response["status"] in ["new", "running"]:
+            status = "active"
+        else:
+            status = "completed"
+            data = list(self._get_results(crawl_request_id))
+
+        time_str = response.get("duration")
+        time_consuming: float = 0
+        if time_str:
+            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
+            time_consuming = (
+                time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
+            )
+
+        return {
+            "status": status,
+            "job_id": response.get("uuid"),
+            "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
+            "current": response.get("number_of_documents", 0),
+            "data": data,
+            "time_consuming": time_consuming,
+        }
+
+    def get_crawl_url_data(self, job_id, url) -> dict | None:
+        if not job_id:
+            return self.scrape_url(url)
+
+        for result in self._get_results(
+            job_id,
+            {
+                # filter by url
+                "url": url
+            },
+        ):
+            return result
+
+        return None
+
+    def scrape_url(self, url: str) -> dict:
+        response = self.client.scrape_url(url=url, sync=True, prefetched=True)
+        return self._structure_data(response)
+
+    def _structure_data(self, result_object: dict) -> dict:
+        if isinstance(result_object.get("result", {}), str):
+            raise ValueError("Invalid result object. Expected a dictionary.")
+
+        metadata = result_object.get("result", {}).get("metadata", {})
+        return {
+            "title": metadata.get("og:title") or metadata.get("title"),
+            "description": metadata.get("description"),
+            "source_url": result_object.get("url"),
+            "markdown": result_object.get("result", {}).get("markdown"),
+        }
+
+    def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
+        page = 0
+        page_size = 100
+        query_params = query_params or {}
+        query_params.update({"prefetched": "true"})
+        while True:
+            page += 1
+            response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
+            if not response["results"]:
+                break
+
+            for result in response["results"]:
+                yield self._structure_data(result)
+
+            if response["next"] is None:
+                break
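End to end, the provider wraps the client into the three calls `WebsiteService` needs; a sketch with a placeholder key:

```python
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider

provider = WaterCrawlProvider(api_key="wc-...", base_url="https://app.watercrawl.dev")

job = provider.crawl_url("https://example.com", {"crawl_sub_pages": True, "limit": 5, "depth": 1})
# -> {"status": "active", "job_id": "<uuid>"}

status = provider.get_crawl_status(job["job_id"])
# -> {"status": "active" | "completed", "job_id": ..., "total": ...,
#     "current": ..., "data": [structured results], "time_consuming": ...}

page = provider.get_crawl_url_data(job["job_id"], "https://example.com")
# -> {"title": ..., "description": ..., "source_url": ..., "markdown": ...} or None
```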
@@ -17,6 +17,10 @@ class ApiKeyAuthFactory:
                 from services.auth.firecrawl.firecrawl import FirecrawlAuth
 
                 return FirecrawlAuth
+            case AuthType.WATERCRAWL:
+                from services.auth.watercrawl.watercrawl import WatercrawlAuth
+
+                return WatercrawlAuth
             case AuthType.JINA:
                 from services.auth.jina.jina import JinaAuth
@@ -3,4 +3,5 @@ from enum import StrEnum
 
 class AuthType(StrEnum):
     FIRECRAWL = "firecrawl"
+    WATERCRAWL = "watercrawl"
     JINA = "jinareader"
@@ -0,0 +1,44 @@
+import json
+from urllib.parse import urljoin
+
+import requests
+
+from services.auth.api_key_auth_base import ApiKeyAuthBase
+
+
+class WatercrawlAuth(ApiKeyAuthBase):
+    def __init__(self, credentials: dict):
+        super().__init__(credentials)
+        auth_type = credentials.get("auth_type")
+        if auth_type != "x-api-key":
+            raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key")
+        self.api_key = credentials.get("config", {}).get("api_key", None)
+        self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev")
+
+        if not self.api_key:
+            raise ValueError("No API key provided")
+
+    def validate_credentials(self):
+        headers = self._prepare_headers()
+        url = urljoin(self.base_url, "/api/v1/core/crawl-requests/")
+        response = self._get_request(url, headers)
+        if response.status_code == 200:
+            return True
+        else:
+            self._handle_error(response)
+
+    def _prepare_headers(self):
+        return {"Content-Type": "application/json", "X-API-KEY": self.api_key}
+
+    def _get_request(self, url, headers):
+        return requests.get(url, headers=headers)
+
+    def _handle_error(self, response):
+        if response.status_code in {402, 409, 500}:
+            error_message = response.json().get("error", "Unknown error occurred")
+            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+        else:
+            if response.text:
+                error_message = json.loads(response.text).get("error", "Unknown error occurred")
+                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
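Credential validation runs once when the data source is bound; a sketch of the flow (the key is a placeholder):

```python
from services.auth.watercrawl.watercrawl import WatercrawlAuth

auth = WatercrawlAuth({
    "auth_type": "x-api-key",
    "config": {
        "api_key": "wc-...",  # placeholder
        "base_url": "https://app.watercrawl.dev",
    },
})
# Issues GET /api/v1/core/crawl-requests/ with the X-API-KEY header;
# returns True on HTTP 200, raises on anything else.
assert auth.validate_credentials()
```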
@@ -7,6 +7,7 @@ from flask_login import current_user  # type: ignore
 
 from core.helper import encrypter
 from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
+from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
 from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
 from services.auth.api_key_auth_service import ApiKeyAuthService
@@ -59,6 +60,13 @@ class WebsiteService:
             time = str(datetime.datetime.now().timestamp())
             redis_client.setex(website_crawl_time_cache_key, 3600, time)
             return {"status": "active", "job_id": job_id}
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options)
+
         elif provider == "jinareader":
             api_key = encrypter.decrypt_token(
                 tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -116,6 +124,14 @@ class WebsiteService:
                 time_consuming = abs(end_time - float(start_time))
                 crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                 redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_status_data = WaterCrawlProvider(
+                api_key, credentials.get("config").get("base_url", None)
+            ).get_crawl_status(job_id)
         elif provider == "jinareader":
             api_key = encrypter.decrypt_token(
                 tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -180,6 +196,11 @@ class WebsiteService:
                 if item.get("source_url") == url:
                     return dict(item)
             return None
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data(
+                job_id, url
+            )
         elif provider == "jinareader":
             if not job_id:
                 response = requests.get(
@@ -223,5 +244,8 @@ class WebsiteService:
             params = {"onlyMainContent": only_main_content}
             result = firecrawl_app.scrape_url(url, params)
             return result
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url)
         else:
             raise ValueError("Invalid provider")
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 500 500">
+  <path style="fill: rgb(0, 23, 87); stroke: rgb(13, 14, 52);" d="M 247.794 213.903 L 246.81 76.976 L 254.345 76.963 L 254.592 213.989 L 247.794 213.903 Z"/>
+  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.025" cy="43.859" rx="33.966" ry="33.906"/>
+  <path style="fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 282.472 260.389 L 414.181 330.253 L 410.563 336.234 L 279.38 265.739 L 282.472 260.389 Z"/>
+  <path style="fill: rgb(15, 17, 57); stroke: rgb(13, 14, 52);" d="M 255.105 281.394 L 254.485 417.656 L 246.156 417.691 L 246.688 280.51 L 255.105 281.394 Z"/>
+  <path style="paint-order: fill; fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 279.486 229.517 L 410.351 160.07 L 413.923 167.04 L 283.727 235.998 L 279.486 229.517 Z"/>
+  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 88.545 164.884 L 219.797 236.07 L 222.867 229.568 L 90.887 159.47 L 88.545 164.884 Z"/>
+  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 224.76 266.9 L 95.55 334.829 L 92.878 328.37 L 219.955 261.275 L 224.76 266.9 Z"/>
+  <ellipse style="paint-order: fill; fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="251.242" cy="247.466" rx="33.966" ry="33.906"/>
+  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 279.502 433.617 L 408.666 359.443 C 408.666 359.443 412.398 366.965 412.398 366.916 C 412.398 366.867 281.544 440.217 281.544 440.217 L 279.502 433.617 Z"/>
+  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 223.119 431.408 L 96.643 361.068 L 93.265 368.047 L 218.895 438.099 L 223.119 431.408 Z"/>
+  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.504" cy="451.168" rx="33.966" ry="33.906"/>
+  <path style="fill: rgb(90, 191, 187); stroke: rgb(90, 191, 187);" d="M 435.665 180.895 L 435.859 316.869 L 443.103 315.579 L 442.56 180.697 L 435.665 180.895 Z"/>
+  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="441.06" cy="349.665" rx="33.966" ry="33.906"/>
+  <ellipse style="fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="441.512" cy="147.767" rx="33.966" ry="33.906"/>
+  <path style="fill: rgb(84, 187, 181); stroke: rgb(84, 187, 181);" d="M 64.755 314.523 L 57.928 315.006 L 58.307 182.961 L 65.169 182.865 L 64.755 314.523 Z"/>
+  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="58.177" cy="149.757" rx="33.966" ry="33.906"/>
+  <ellipse style="fill: rgb(61, 224, 203); stroke: rgb(61, 224, 203);" cx="65.909" cy="348.17" rx="33.966" ry="33.906"/>
+</svg>
@@ -4,3 +4,10 @@
   background-image: url(../assets/jina.png);
   background-size: 16px;
 }
+
+.watercrawlLogo {
+  @apply w-5 h-5 bg-center bg-no-repeat inline-block;
+  /*background-color: #F5FAFF;*/
+  background-image: url(../assets/watercrawl.svg);
+  background-size: 16px;
+}
@@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next'
 import s from './index.module.css'
 import NoData from './no-data'
 import Firecrawl from './firecrawl'
+import Watercrawl from './watercrawl'
 import JinaReader from './jina-reader'
 import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'
@@ -47,7 +48,11 @@ const Website: FC<Props> = ({
     // If users have configured one of the providers, select it.
     const availableProviders = res.sources.filter((item: DataSourceItem) =>
-      [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
+      [
+        DataSourceProvider.jinaReader,
+        DataSourceProvider.fireCrawl,
+        DataSourceProvider.waterCrawl,
+      ].includes(item.provider),
     )
 
     if (availableProviders.length > 0)
@@ -70,6 +75,8 @@ const Website: FC<Props> = ({
   if (!isLoaded)
     return null
 
+  const source = sources.find(source => source.provider === selectedProvider)
+
   return (
     <div>
       <div className="mb-4">
@@ -86,7 +93,7 @@ const Website: FC<Props> = ({
           )}
           onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
         >
-          <span className={cn(s.jinaLogo, 'mr-2')} />
+          <span className={cn(s.jinaLogo, 'mr-2')}/>
           <span>Jina Reader</span>
         </button>
         <button
@@ -100,40 +107,53 @@ const Website: FC<Props> = ({
         >
          🔥 Firecrawl
         </button>
+        <button
+          className={cn('flex items-center justify-center rounded-lg px-4 py-2',
+            selectedProvider === DataSourceProvider.waterCrawl
+              ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
+              : `system-sm-regular border border-components-option-card-option-border bg-components-option-card-option-bg text-text-secondary
+              hover:border-components-option-card-option-border-hover hover:bg-components-option-card-option-bg-hover hover:shadow-xs hover:shadow-shadow-shadow-3`,
+          )}
+          onClick={() => setSelectedProvider(DataSourceProvider.waterCrawl)}
+        >
+          <span className={cn(s.watercrawlLogo, 'mr-2')}/>
+          <span>WaterCrawl</span>
+        </button>
       </div>
     </div>
-    {
-      selectedProvider === DataSourceProvider.fireCrawl
-        ? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
-          ? (
-            <Firecrawl
-              onPreview={onPreview}
-              checkedCrawlResult={checkedCrawlResult}
-              onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-              onJobIdChange={onJobIdChange}
-              crawlOptions={crawlOptions}
-              onCrawlOptionsChange={onCrawlOptionsChange}
-            />
-          )
-          : (
-            <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-          )
-        : sources.find(source => source.provider === DataSourceProvider.jinaReader)
-          ? (
-            <JinaReader
-              onPreview={onPreview}
-              checkedCrawlResult={checkedCrawlResult}
-              onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-              onJobIdChange={onJobIdChange}
-              crawlOptions={crawlOptions}
-              onCrawlOptionsChange={onCrawlOptionsChange}
-            />
-          )
-          : (
-            <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-          )
-    }
+    {source && selectedProvider === DataSourceProvider.fireCrawl && (
+      <Firecrawl
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {source && selectedProvider === DataSourceProvider.waterCrawl && (
+      <Watercrawl
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {source && selectedProvider === DataSourceProvider.jinaReader && (
+      <JinaReader
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {!source && (
+      <NoData onConfig={handleOnConfig} provider={selectedProvider}/>
+    )}
     </div>
   )
 }
@@ -31,6 +31,11 @@ const NoData: FC<Props> = ({
       title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
       description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
     },
+    [DataSourceProvider.waterCrawl]: {
+      emoji: <span className={s.watercrawlLogo} />,
+      title: t(`${I18N_PREFIX}.watercrawlNotConfigured`),
+      description: t(`${I18N_PREFIX}.watercrawlNotConfiguredDescription`),
+    },
   }
 
   const currentProvider = providerConfig[provider]
@@ -0,0 +1,43 @@
+'use client'
+import type { FC } from 'react'
+import React from 'react'
+import { useTranslation } from 'react-i18next'
+import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
+import Button from '@/app/components/base/button'
+
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+type Props = {
+  onSetting: () => void
+}
+
+const Header: FC<Props> = ({
+  onSetting,
+}) => {
+  const { t } = useTranslation()
+
+  return (
+    <div className='flex h-6 items-center justify-between'>
+      <div className='flex items-center'>
+        <div className='text-base font-medium text-text-secondary'>{t(`${I18N_PREFIX}.watercrawlTitle`)}</div>
+        <div className='ml-2 mr-2 w-px h-3.5 bg-divider-regular' />
+        <Button className='flex items-center gap-x-[1px] h-6 px-1.5' onClick={onSetting}>
+          <RiEqualizer2Line className='w-3.5 h-3.5 text-components-button-secondary-text' />
+          <span className='text-components-button-secondary-text text-xs font-medium px-[3px]'>
+            {t(`${I18N_PREFIX}.configureWatercrawl`)}
+          </span>
+        </Button>
+      </div>
+      <a
+        href='https://docs.watercrawl.dev/'
+        target='_blank'
+        rel='noopener noreferrer'
+        className='inline-flex items-center gap-x-1 text-xs font-medium text-text-accent'
+      >
+        <RiBookOpenLine className='w-3.5 h-3.5 text-text-accent' />
+        <span>{t(`${I18N_PREFIX}.watercrawlDoc`)}</span>
+      </a>
+    </div>
+  )
+}
+
+export default React.memo(Header)
@@ -0,0 +1,217 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback, useEffect, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import UrlInput from '../base/url-input'
+import OptionsWrap from '../base/options-wrap'
+import CrawledResult from '../base/crawled-result'
+import Crawling from '../base/crawling'
+import ErrorMessage from '../base/error-message'
+import Header from './header'
+import Options from './options'
+import { useModalContext } from '@/context/modal-context'
+import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
+import Toast from '@/app/components/base/toast'
+import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
+import { sleep } from '@/utils'
+
+const ERROR_I18N_PREFIX = 'common.errorMsg'
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+type Props = {
+  onPreview: (payload: CrawlResultItem) => void
+  checkedCrawlResult: CrawlResultItem[]
+  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
+  onJobIdChange: (jobId: string) => void
+  crawlOptions: CrawlOptions
+  onCrawlOptionsChange: (payload: CrawlOptions) => void
+}
+
+enum Step {
+  init = 'init',
+  running = 'running',
+  finished = 'finished',
+}
+
+const WaterCrawl: FC<Props> = ({
+  onPreview,
+  checkedCrawlResult,
+  onCheckedCrawlResultChange,
+  onJobIdChange,
+  crawlOptions,
+  onCrawlOptionsChange,
+}) => {
+  const { t } = useTranslation()
+  const [step, setStep] = useState<Step>(Step.init)
+  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
+  useEffect(() => {
+    if (step !== Step.init)
+      setControlFoldOptions(Date.now())
+  }, [step])
+  const { setShowAccountSettingModal } = useModalContext()
+  const handleSetting = useCallback(() => {
+    setShowAccountSettingModal({
+      payload: 'data-source',
+    })
+  }, [setShowAccountSettingModal])
+
+  const checkValid = useCallback((url: string) => {
+    let errorMsg = ''
+    if (!url) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: 'url',
+      })
+    }
+    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
+      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
+    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
+      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
+        field: t(`${I18N_PREFIX}.limit`),
+      })
+    }
+    return {
+      isValid: !errorMsg,
+      errorMsg,
+    }
+  }, [crawlOptions, t])
+
+  const isInit = step === Step.init
+  const isCrawlFinished = step === Step.finished
+  const isRunning = step === Step.running
+  const [crawlResult, setCrawlResult] = useState<{
+    current: number
+    total: number
+    data: CrawlResultItem[]
+    time_consuming: number | string
+  } | undefined>(undefined)
+  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
+  const showError = isCrawlFinished && crawlErrorMessage
+
+  const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
+    try {
+      const res = await checkWatercrawlTaskStatus(jobId) as any
+      if (res.status === 'completed') {
+        return {
+          isError: false,
+          data: {
+            ...res,
+            total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
+          },
+        }
+      }
+      if (res.status === 'error' || !res.status) {
+        // can't get the error message from the watercrawl api
+        return {
+          isError: true,
+          errorMessage: res.message,
+          data: {
+            data: [],
+          },
+        }
+      }
+      // update the progress
+      setCrawlResult({
+        ...res,
+        total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
+      })
+      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
+      await sleep(2500)
+      return await waitForCrawlFinished(jobId)
+    }
+    catch (e: any) {
+      const errorBody = await e.json()
+      return {
+        isError: true,
+        errorMessage: errorBody.message,
+        data: {
+          data: [],
+        },
+      }
+    }
+  }, [crawlOptions.limit, onCheckedCrawlResultChange])
+
+  const handleRun = useCallback(async (url: string) => {
+    const { isValid, errorMsg } = checkValid(url)
+    if (!isValid) {
+      Toast.notify({
+        message: errorMsg!,
+        type: 'error',
+      })
+      return
+    }
+    setStep(Step.running)
+    try {
+      const passToServerCrawlOptions: any = {
+        ...crawlOptions,
+      }
+      if (crawlOptions.max_depth === '')
+        delete passToServerCrawlOptions.max_depth
+      const res = await createWatercrawlTask({
+        url,
+        options: passToServerCrawlOptions,
+      }) as any
+      const jobId = res.job_id
+      onJobIdChange(jobId)
+      const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
+      if (isError) {
+        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
+      }
+      else {
+        setCrawlResult(data)
+        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
+        setCrawlErrorMessage('')
+      }
+    }
+    catch (e) {
+      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
+      console.log(e)
+    }
+    finally {
+      setStep(Step.finished)
+    }
+  }, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
+
+  return (
+    <div>
+      <Header onSetting={handleSetting} />
+      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle p-4 pb-0'>
+        <UrlInput onRun={handleRun} isRunning={isRunning} />
+        <OptionsWrap
+          className='mt-4'
+          controlFoldOptions={controlFoldOptions}
+        >
+          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
+        </OptionsWrap>
+        {!isInit && (
+          <div className='relative left-[-16px] mt-3 w-[calc(100%_+_32px)] rounded-b-xl'>
+            {isRunning
+              && <Crawling
+                className='mt-2'
+                crawledNum={crawlResult?.current || 0}
+                totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
+              />}
+            {showError && (
+              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
+            )}
+            {isCrawlFinished && !showError
+              && <CrawledResult
+                className='mb-2'
+                list={crawlResult?.data || []}
+                checkedList={checkedCrawlResult}
+                onSelectedChange={onCheckedCrawlResultChange}
+                onPreview={onPreview}
+                usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
+              />
+            }
+          </div>
+        )}
+      </div>
+    </div>
+  )
+}
+
+export default React.memo(WaterCrawl)
@@ -0,0 +1,85 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback } from 'react'
+import { useTranslation } from 'react-i18next'
+import CheckboxWithLabel from '../base/checkbox-with-label'
+import Field from '../base/field'
+import cn from '@/utils/classnames'
+import type { CrawlOptions } from '@/models/datasets'
+
+const I18N_PREFIX = 'datasetCreation.stepOne.website'
+
+type Props = {
+  className?: string
+  payload: CrawlOptions
+  onChange: (payload: CrawlOptions) => void
+}
+
+const Options: FC<Props> = ({
+  className = '',
+  payload,
+  onChange,
+}) => {
+  const { t } = useTranslation()
+
+  const handleChange = useCallback((key: keyof CrawlOptions) => {
+    return (value: any) => {
+      onChange({
+        ...payload,
+        [key]: value,
+      })
+    }
+  }, [payload, onChange])
+
+  return (
+    <div className={cn(className, 'space-y-2')}>
+      <CheckboxWithLabel
+        label={t(`${I18N_PREFIX}.crawlSubPage`)}
+        isChecked={payload.crawl_sub_pages}
+        onChange={handleChange('crawl_sub_pages')}
+        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
+      />
+      <div className='flex justify-between space-x-4'>
+        <Field
+          className='shrink-0 grow'
+          label={t(`${I18N_PREFIX}.limit`)}
+          value={payload.limit}
+          onChange={handleChange('limit')}
+          isNumber
+          isRequired
+        />
+        <Field
+          className='shrink-0 grow'
+          label={t(`${I18N_PREFIX}.maxDepth`)}
+          value={payload.max_depth}
+          onChange={handleChange('max_depth')}
+          isNumber
+          tooltip={t(`${I18N_PREFIX}.maxDepthTooltip`)!}
+        />
+      </div>
+      <div className='flex justify-between space-x-4'>
+        <Field
+          className='shrink-0 grow'
+          label={t(`${I18N_PREFIX}.excludePaths`)}
+          value={payload.excludes}
+          onChange={handleChange('excludes')}
+          placeholder='blog/*, /about/*'
+        />
+        <Field
+          className='shrink-0 grow'
+          label={t(`${I18N_PREFIX}.includeOnlyPaths`)}
+          value={payload.includes}
+          onChange={handleChange('includes')}
+          placeholder='articles/*'
+        />
+      </div>
+      <CheckboxWithLabel
+        label={t(`${I18N_PREFIX}.extractOnlyMainContent`)}
+        isChecked={payload.only_main_content}
+        onChange={handleChange('only_main_content')}
+        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
+      />
+    </div>
+  )
+}
+
+export default React.memo(Options)
@@ -0,0 +1,161 @@
+'use client'
+import type { FC } from 'react'
+import React, { useCallback, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import {
+  PortalToFollowElem,
+  PortalToFollowElemContent,
+} from '@/app/components/base/portal-to-follow-elem'
+import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
+import Button from '@/app/components/base/button'
+import type { WatercrawlConfig } from '@/models/common'
+import Field from '@/app/components/datasets/create/website/base/field'
+import Toast from '@/app/components/base/toast'
+import { createDataSourceApiKeyBinding } from '@/service/datasets'
+import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
+
+type Props = {
+  onCancel: () => void
+  onSaved: () => void
+}
+
+const I18N_PREFIX = 'datasetCreation.watercrawl'
+
+const DEFAULT_BASE_URL = 'https://app.watercrawl.dev'
+
+const ConfigWatercrawlModal: FC<Props> = ({
+  onCancel,
+  onSaved,
+}) => {
+  const { t } = useTranslation()
+  const [isSaving, setIsSaving] = useState(false)
+  const [config, setConfig] = useState<WatercrawlConfig>({
+    api_key: '',
+    base_url: '',
+  })
+
+  const handleConfigChange = useCallback((key: string) => {
+    return (value: string | number) => {
+      setConfig(prev => ({ ...prev, [key]: value as string }))
+    }
+  }, [])
+
+  const handleSave = useCallback(async () => {
+    if (isSaving)
+      return
+    let errorMsg = ''
+    if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://'))))
+      errorMsg = t('common.errorMsg.urlError')
+    if (!errorMsg) {
+      if (!config.api_key) {
+        errorMsg = t('common.errorMsg.fieldRequired', {
+          field: 'API Key',
+        })
+      }
+    }
+
+    if (errorMsg) {
+      Toast.notify({
+        type: 'error',
+        message: errorMsg,
+      })
+      return
+    }
+    const postData = {
+      category: 'website',
+      provider: 'watercrawl',
+      credentials: {
+        auth_type: 'x-api-key',
+        config: {
+          api_key: config.api_key,
+          base_url: config.base_url || DEFAULT_BASE_URL,
+        },
+      },
+    }
+    try {
+      setIsSaving(true)
+      await createDataSourceApiKeyBinding(postData)
+      Toast.notify({
+        type: 'success',
+        message: t('common.api.success'),
+      })
+    }
+    finally {
+      setIsSaving(false)
+    }
+    onSaved()
+  }, [config.api_key, config.base_url, onSaved, t, isSaving])
+
+  return (
+    <PortalToFollowElem open>
+      <PortalToFollowElemContent className='w-full h-full z-[60]'>
+        <div className='fixed inset-0 flex items-center justify-center bg-background-overlay'>
+          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-components-panel-bg shadow-xl rounded-2xl overflow-y-auto'>
+            <div className='px-8 pt-8'>
+              <div className='flex justify-between items-center mb-4'>
+                <div className='system-xl-semibold text-text-primary'>{t(`${I18N_PREFIX}.configWatercrawl`)}</div>
+              </div>
+
+              <div className='space-y-4'>
+                <Field
+                  label='API Key'
+                  labelClassName='!text-sm'
+                  isRequired
+                  value={config.api_key}
+                  onChange={handleConfigChange('api_key')}
+                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
+                />
+                <Field
+                  label='Base URL'
+                  labelClassName='!text-sm'
+                  value={config.base_url}
+                  onChange={handleConfigChange('base_url')}
+                  placeholder={DEFAULT_BASE_URL}
+                />
+              </div>
+              <div className='my-8 flex justify-between items-center h-8'>
+                <a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-text-accent' target='_blank' rel='noopener noreferrer' href='https://app.watercrawl.dev/'>
+                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
+                  <LinkExternal02 className='w-3 h-3' />
+                </a>
+                <div className='flex'>
+                  <Button
+                    size='large'
+                    className='mr-2'
+                    onClick={onCancel}
+                  >
+                    {t('common.operation.cancel')}
+                  </Button>
+                  <Button
+                    variant='primary'
+                    size='large'
+                    onClick={handleSave}
+                    loading={isSaving}
+                  >
+                    {t('common.operation.save')}
+                  </Button>
+                </div>
+              </div>
+            </div>
+            <div className='border-t-[0.5px] border-t-divider-regular'>
+              <div className='flex justify-center items-center py-3 bg-background-section-burn text-xs text-text-tertiary'>
+                <Lock01 className='mr-1 w-3 h-3 text-text-tertiary' />
+                {t('common.modelProvider.encrypted.front')}
+                <a
+                  className='text-text-accent mx-1'
+                  target='_blank' rel='noopener noreferrer'
+                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
+                >
+                  PKCS1_OAEP
+                </a>
+                {t('common.modelProvider.encrypted.back')}
+              </div>
+            </div>
+          </div>
+        </div>
+      </PortalToFollowElemContent>
+    </PortalToFollowElem>
+  )
+}
+
+export default React.memo(ConfigWatercrawlModal)
@@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next'
 import Panel from '../panel'
 import { DataSourceType } from '../panel/types'
 import ConfigFirecrawlModal from './config-firecrawl-modal'
+import ConfigWatercrawlModal from './config-watercrawl-modal'
 import ConfigJinaReaderModal from './config-jina-reader-modal'
 import cn from '@/utils/classnames'
 import s from '@/app/components/datasets/create/website/index.module.css'
 import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
-import type {
-  DataSourceItem,
-} from '@/models/common'
+import type { DataSourceItem } from '@/models/common'
+import { DataSourceProvider } from '@/models/common'
 import { useAppContext } from '@/context/app-context'
-import {
-  DataSourceProvider,
-} from '@/models/common'
 import Toast from '@/app/components/base/toast'
 
 type Props = {
@@ -58,6 +54,16 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
     return source?.id
   }
 
+  const getProviderName = (provider: DataSourceProvider): string => {
+    if (provider === DataSourceProvider.fireCrawl)
+      return 'Firecrawl'
+    if (provider === DataSourceProvider.waterCrawl)
+      return 'WaterCrawl'
+    return 'Jina Reader'
+  }
+
   const handleRemove = useCallback((provider: DataSourceProvider) => {
     return async () => {
       const dataSourceId = getIdByProvider(provider)
@@ -82,27 +88,42 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
         readOnly={!isCurrentWorkspaceManager}
         configuredList={sources.filter(item => item.provider === provider).map(item => ({
           id: item.id,
-          logo: ({ className }: { className: string }) => (
-            item.provider === DataSourceProvider.fireCrawl
-              ? (
-                <div className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>🔥</div>
-              )
-              : (
-                <div className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
-                  <span className={s.jinaLogo} />
-                </div>
-              )
-          ),
-          name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
+          logo: ({ className }: { className: string }) => {
+            if (item.provider === DataSourceProvider.fireCrawl) {
+              return (
+                <div
+                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>🔥</div>
+              )
+            }
+            if (item.provider === DataSourceProvider.waterCrawl) {
+              return (
+                <div
+                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
+                  <span className={s.watercrawlLogo}/>
+                </div>
+              )
+            }
+            return (
+              <div
+                className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
+                <span className={s.jinaLogo}/>
+              </div>
+            )
+          },
+          name: getProviderName(item.provider),
           isActive: true,
         }))}
         onRemove={handleRemove(provider)}
       />
       {configTarget === DataSourceProvider.fireCrawl && (
-        <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
+        <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
       )}
+      {configTarget === DataSourceProvider.waterCrawl && (
+        <ConfigWatercrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
+      )}
       {configTarget === DataSourceProvider.jinaReader && (
-        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
+        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig}/>
       )}
     </>
@@ -15,6 +15,7 @@ export default function DataSourcePage() {
       <DataSourceNotion workspaces={notionWorkspaces} />
       <DataSourceWebsite provider={DataSourceProvider.jinaReader} />
       <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
+      <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
     </div>
   )
 }
@@ -41,6 +41,12 @@ const Panel: FC<Props> = ({
   const isNotion = type === DataSourceType.notion
   const isWebsite = type === DataSourceType.website
 
+  const getProviderName = (): string => {
+    if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl'
+    if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl'
+    return 'Jina Reader'
+  }
+
   return (
     <div className='mb-2 rounded-xl bg-background-section-burn'>
       <div className='flex items-center px-3 py-[9px]'>
@@ -50,7 +56,7 @@ const Panel: FC<Props> = ({
           <div className='text-sm font-medium text-text-primary'>{t(`common.dataSource.${type}.title`)}</div>
           {isWebsite && (
             <div className='ml-1 rounded-md bg-components-badge-white-to-dark px-1.5 text-xs font-medium leading-[18px] text-text-secondary'>
-              <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
+              <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> {getProviderName()}
             </div>
           )}
         </div>
@@ -15,6 +15,11 @@ const translation = {
       apiKeyPlaceholder: 'API key from firecrawl.dev',
       getApiKeyLinkText: 'Get your API key from firecrawl.dev',
     },
+    watercrawl: {
+      configWatercrawl: 'Configure WaterCrawl',
+      apiKeyPlaceholder: 'API key from watercrawl.dev',
+      getApiKeyLinkText: 'Get your API key from watercrawl.dev',
+    },
     jinaReader: {
       configJinaReader: 'Configure Jina Reader',
       apiKeyPlaceholder: 'API key from jina.ai',
@@ -64,15 +69,21 @@ const translation = {
       chooseProvider: 'Select a provider',
       fireCrawlNotConfigured: 'Firecrawl is not configured',
       fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
+      watercrawlNotConfigured: 'WaterCrawl is not configured',
+      watercrawlNotConfiguredDescription: 'Configure WaterCrawl with API key to use it.',
       jinaReaderNotConfigured: 'Jina Reader is not configured',
       jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
       configure: 'Configure',
       configureFirecrawl: 'Configure Firecrawl',
+      configureWatercrawl: 'Configure WaterCrawl',
       configureJinaReader: 'Configure Jina Reader',
       run: 'Run',
       firecrawlTitle: 'Extract web content with 🔥Firecrawl',
       firecrawlDoc: 'Firecrawl docs',
       firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
+      watercrawlTitle: 'Extract web content with WaterCrawl',
+      watercrawlDoc: 'WaterCrawl docs',
+      watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
       jinaReaderTitle: 'Convert the entire site to Markdown',
       jinaReaderDoc: 'Learn more about Jina Reader',
       jinaReaderDocLink: 'https://jina.ai/reader',
@@ -178,6 +178,7 @@ export enum DataSourceCategory {
 export enum DataSourceProvider {
   fireCrawl = 'firecrawl',
   jinaReader = 'jinareader',
+  waterCrawl = 'watercrawl',
 }
 
 export type FirecrawlConfig = {
@@ -185,6 +186,11 @@ export type FirecrawlConfig = {
   base_url: string
 }
 
+export type WatercrawlConfig = {
+  api_key: string
+  base_url: string
+}
+
 export type DataSourceItem = {
   id: string
   category: DataSourceCategory
@@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId
   })
 }
 
+export const createWatercrawlTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
+  return post<CommonResponse>('website/crawl', {
+    body: {
+      ...body,
+      provider: DataSourceProvider.waterCrawl,
+    },
+  })
+}
+
+export const checkWatercrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
+  return get<CommonResponse>(`website/crawl/status/${jobId}`, {
+    params: {
+      provider: DataSourceProvider.waterCrawl,
+    },
+  }, {
+    silent: true,
+  })
+}
+
 type FileTypesRes = {
   allowed_extensions: string[]
 }