Co-authored-by: crazywoola <427733928@qq.com>
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "provider",
            type=str,
            choices=["firecrawl", "watercrawl", "jinareader"],
            required=True,
            nullable=True,
            location="json",
        )
        parser.add_argument("url", type=str, required=True, nullable=True, location="json")
        parser.add_argument("options", type=dict, required=True, nullable=True, location="json")

    @account_initialization_required
    def get(self, job_id: str):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], required=True, location="args"
        )
        args = parser.parse_args()
        # get crawl status
        try:
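
For reference, a minimal sketch of the request the parsers above accept; the console route prefix and auth header are assumptions for illustration only, while the provider/url/options shape comes straight from the parser definitions:

import requests

# Hypothetical call against the crawl endpoint defined above; the route and
# auth scheme are assumed, the body fields are the ones the parser declares.
payload = {
    "provider": "watercrawl",  # one of: firecrawl, watercrawl, jinareader
    "url": "https://example.com",
    "options": {"limit": 5, "crawl_sub_pages": True},
}
resp = requests.post(
    "http://localhost:5001/console/api/website/crawl",  # assumed route
    json=payload,
    headers={"Authorization": "Bearer <console-token>"},  # assumed auth
)
print(resp.json())  # e.g. {"status": "active", "job_id": "..."}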
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage

                    only_main_content=extract_setting.website_info.only_main_content,
                )
                return extractor.extract()
            elif extract_setting.website_info.provider == "watercrawl":
                extractor = WaterCrawlWebExtractor(
                    url=extract_setting.website_info.url,
                    job_id=extract_setting.website_info.job_id,
                    tenant_id=extract_setting.website_info.tenant_id,
                    mode=extract_setting.website_info.mode,
                    only_main_content=extract_setting.website_info.only_main_content,
                )
                return extractor.extract()
            elif extract_setting.website_info.provider == "jinareader":
                extractor = JinaReaderWebExtractor(
                    url=extract_setting.website_info.url,
import json
from collections.abc import Generator
from typing import Union
from urllib.parse import urljoin

import requests
from requests import Response


class BaseAPIClient:
    def __init__(self, api_key, base_url):
        self.api_key = api_key
        self.base_url = base_url
        self.session = self.init_session()

    def init_session(self):
        session = requests.Session()
        session.headers.update({"X-API-Key": self.api_key})
        session.headers.update({"Content-Type": "application/json"})
        session.headers.update({"Accept": "application/json"})
        session.headers.update({"User-Agent": "WaterCrawl-Plugin"})
        session.headers.update({"Accept-Language": "en-US"})
        return session

    def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self.session.get(urljoin(self.base_url, endpoint), params=query_params, **kwargs)

    def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.post(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)

    def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.put(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)

    def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self.session.delete(urljoin(self.base_url, endpoint), params=query_params, **kwargs)

    def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.patch(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)


class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
        super().__init__(api_key, base_url)

    def process_eventstream(self, response: Response, download: bool = False) -> Generator:
        for line in response.iter_lines():
            line = line.decode("utf-8")
            if line.startswith("data:"):
                line = line[5:].strip()
                data = json.loads(line)
                if data["type"] == "result" and download:
                    data["data"] = self.download_result(data["data"])
                yield data

    def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
        response.raise_for_status()
        if response.status_code == 204:
            return None
        if response.headers.get("Content-Type") == "application/json":
            return response.json() or {}
        if response.headers.get("Content-Type") == "application/octet-stream":
            return response.content
        if response.headers.get("Content-Type") == "text/event-stream":
            return self.process_eventstream(response)
        raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")

    def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
        query_params = {"page": page or 1, "page_size": page_size or 10}
        return self.process_response(
            self._get(
                "/api/v1/core/crawl-requests/",
                query_params=query_params,
            )
        )

    def get_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def create_crawl_request(
        self,
        url: Union[list, str] | None = None,
        spider_options: dict | None = None,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
    ):
        data = {
            # 'urls': url if isinstance(url, list) else [url],
            "url": url,
            "options": {
                "spider_options": spider_options or {},
                "page_options": page_options or {},
                "plugin_options": plugin_options or {},
            },
        }
        return self.process_response(
            self._post(
                "/api/v1/core/crawl-requests/",
                data=data,
            )
        )

    def stop_crawl_request(self, item_id: str):
        return self.process_response(
            self._delete(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def download_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/download/",
            )
        )

    def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
        query_params = {"prefetched": str(prefetched).lower()}
        generator = self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
        )
        if not isinstance(generator, Generator):
            raise ValueError("Generator expected")
        yield from generator

    def get_crawl_request_results(
        self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
    ):
        query_params = query_params or {}
        query_params.update({"page": page or 1, "page_size": page_size or 25})
        return self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
        )

    def scrape_url(
        self,
        url: str,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
        sync: bool = True,
        prefetched: bool = True,
    ):
        response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
        if not sync:
            return response_result

        for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
            if event_data["type"] == "result":
                return event_data["data"]

    def download_result(self, result_object: dict):
        response = requests.get(result_object["result"])
        response.raise_for_status()
        result_object["result"] = response.json()
        return result_object
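
A short usage sketch of the client above. The API key value is a placeholder; the method names, endpoint paths, and the {"type": "result", "data": ...} event shape all come from the code itself:

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient

client = WaterCrawlAPIClient(api_key="wc-...")  # placeholder key; base_url defaults to https://app.watercrawl.dev/

# Synchronous one-page scrape: creates a crawl request, then follows the
# status event stream until a "result" event arrives (prefetched=True asks
# the server to inline the page data rather than return a download URL).
result = client.scrape_url("https://example.com", sync=True, prefetched=True)
if result:
    # Per _structure_data in provider.py, the inlined result dict carries
    # "markdown" and "metadata" keys.
    print(result["result"]["markdown"])

# Asynchronous variant: create the request, page through results later by uuid.
request = client.create_crawl_request(url="https://example.com")
pages = client.get_crawl_request_results(request["uuid"], page=1, page_size=25)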
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class WaterCrawlWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean, LLM-ready markdown.

    Args:
        url: The URL to scrape.
        job_id: The crawl job ID previously returned by WaterCrawl.
        tenant_id: The tenant whose stored WaterCrawl credentials should be used.
        mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
        """Initialize with url, job_id, tenant_id and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("markdown", ""),
                metadata={
                    "source_url": crawl_data.get("source_url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        elif self.mode == "scrape":
            scrape_data = WebsiteService.get_scrape_url_data(
                "watercrawl", self._url, self.tenant_id, self.only_main_content
            )
            document = Document(
                page_content=scrape_data.get("markdown", ""),
                metadata={
                    "source_url": scrape_data.get("source_url"),
                    "description": scrape_data.get("description"),
                    "title": scrape_data.get("title"),
                },
            )
            documents.append(document)
        return documents
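
A minimal sketch of driving the extractor, assuming a crawl job already created through WaterCrawl and an initialized Dify application context (extract() delegates to WebsiteService, which reads the tenant's stored credentials); the id values are placeholders:

from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor

extractor = WaterCrawlWebExtractor(
    url="https://example.com/docs/intro",
    job_id="3f2a-...-uuid",   # placeholder: uuid returned when the crawl was created
    tenant_id="tenant-uuid",  # placeholder
    mode="crawl",
    only_main_content=True,
)
# Returns a list of Document objects with markdown page_content and
# source_url/description/title metadata, per extract() above.
for doc in extractor.extract():
    print(doc.metadata["title"], len(doc.page_content))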
from collections.abc import Generator
from datetime import datetime

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient


class WaterCrawlProvider:
    def __init__(self, api_key, base_url: str | None = None):
        self.client = WaterCrawlAPIClient(api_key, base_url)

    def crawl_url(self, url, options: dict | None = None) -> dict:
        options = options or {}
        spider_options = {
            "max_depth": 1,
            "page_limit": 1,
            "allowed_domains": [],
            "exclude_paths": [],
            "include_paths": [],
        }
        if options.get("crawl_sub_pages", True):
            spider_options["page_limit"] = options.get("limit", 1)
            spider_options["max_depth"] = options.get("depth", 1)
            spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
            spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []

        wait_time = options.get("wait_time", 1000)
        page_options = {
            "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
            "include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
            "wait_time": max(1000, wait_time),  # minimum wait time is 1 second
            "include_html": False,
            "only_main_content": options.get("only_main_content", True),
            "include_links": False,
            "timeout": 15000,
            "accept_cookies_selector": "#cookies-accept",
            "locale": "en-US",
            "actions": [],
        }
        result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)
        return {"status": "active", "job_id": result.get("uuid")}

    def get_crawl_status(self, crawl_request_id) -> dict:
        response = self.client.get_crawl_request(crawl_request_id)
        data = []
        if response["status"] in ["new", "running"]:
            status = "active"
        else:
            status = "completed"
            data = list(self._get_results(crawl_request_id))

        time_str = response.get("duration")
        time_consuming: float = 0
        if time_str:
            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
            time_consuming = (
                time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
            )

        return {
            "status": status,
            "job_id": response.get("uuid"),
            "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
            "current": response.get("number_of_documents", 0),
            "data": data,
            "time_consuming": time_consuming,
        }

    def get_crawl_url_data(self, job_id, url) -> dict | None:
        if not job_id:
            return self.scrape_url(url)

        for result in self._get_results(
            job_id,
            {
                # filter by url
                "url": url
            },
        ):
            return result

        return None

    def scrape_url(self, url: str) -> dict:
        response = self.client.scrape_url(url=url, sync=True, prefetched=True)
        return self._structure_data(response)

    def _structure_data(self, result_object: dict) -> dict:
        if isinstance(result_object.get("result", {}), str):
            raise ValueError("Invalid result object. Expected a dictionary.")

        metadata = result_object.get("result", {}).get("metadata", {})
        return {
            "title": metadata.get("og:title") or metadata.get("title"),
            "description": metadata.get("description"),
            "source_url": result_object.get("url"),
            "markdown": result_object.get("result", {}).get("markdown"),
        }

    def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
        page = 0
        page_size = 100
        query_params = query_params or {}
        query_params.update({"prefetched": "true"})
        while True:
            page += 1
            response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
            if not response["results"]:
                break

            for result in response["results"]:
                yield self._structure_data(result)

            if response["next"] is None:
                break
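
A polling sketch around WaterCrawlProvider, mirroring how the service layer uses it further below; the API key is a placeholder and the option keys are the ones crawl_url reads above:

import time

from core.rag.extractor.watercrawl.provider import WaterCrawlProvider

provider = WaterCrawlProvider(api_key="wc-...")  # placeholder key
job = provider.crawl_url(
    "https://example.com",
    options={"crawl_sub_pages": True, "limit": 10, "depth": 2, "only_main_content": True},
)

# get_crawl_status reports "active" until WaterCrawl leaves the new/running
# states, then flips to "completed" and inlines the structured pages.
status = provider.get_crawl_status(job["job_id"])
while status["status"] == "active":
    time.sleep(2)
    status = provider.get_crawl_status(job["job_id"])

for page in status["data"]:  # title/description/source_url/markdown dicts
    print(page["source_url"], page["title"])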
                from services.auth.firecrawl.firecrawl import FirecrawlAuth

                return FirecrawlAuth
            case AuthType.WATERCRAWL:
                from services.auth.watercrawl.watercrawl import WatercrawlAuth

                return WatercrawlAuth
            case AuthType.JINA:
                from services.auth.jina.jina import JinaAuth


class AuthType(StrEnum):
    FIRECRAWL = "firecrawl"
    WATERCRAWL = "watercrawl"
    JINA = "jinareader"
import json
from urllib.parse import urljoin

import requests

from services.auth.api_key_auth_base import ApiKeyAuthBase


class WatercrawlAuth(ApiKeyAuthBase):
    def __init__(self, credentials: dict):
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "x-api-key":
            raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key")
        self.api_key = credentials.get("config", {}).get("api_key", None)
        self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev")

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        headers = self._prepare_headers()
        url = urljoin(self.base_url, "/api/v1/core/crawl-requests/")
        response = self._get_request(url, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        return {"Content-Type": "application/json", "X-API-KEY": self.api_key}

    def _get_request(self, url, headers):
        return requests.get(url, headers=headers)

    def _handle_error(self, response):
        if response.status_code in {402, 409, 500}:
            error_message = response.json().get("error", "Unknown error occurred")
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        else:
            if response.text:
                error_message = json.loads(response.text).get("error", "Unknown error occurred")
                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService

            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "watercrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )

                time_consuming = abs(end_time - float(start_time))
                crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                redis_client.delete(website_crawl_time_cache_key)
        elif provider == "watercrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_status_data = WaterCrawlProvider(
                api_key, credentials.get("config").get("base_url", None)
            ).get_crawl_status(job_id)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )

                if item.get("source_url") == url:
                    return dict(item)
            return None
        elif provider == "watercrawl":
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data(
                job_id, url
            )
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(

            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        elif provider == "watercrawl":
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url)
        else:
            raise ValueError("Invalid provider")
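
For orientation, a sketch of the options dict the web UI passes through WebsiteService into WaterCrawlProvider.crawl_url; the keys mirror what crawl_url reads and what the WaterCrawl options panel below collects, and the values are illustrative only:

# Illustrative options payload as assembled by the frontend options panel.
options = {
    "crawl_sub_pages": True,        # follow links beyond the seed URL
    "limit": 10,                    # mapped to spider_options["page_limit"]
    "max_depth": 2,                 # UI field; note crawl_url above reads "depth" for spider max depth
    "includes": "articles/*",       # comma-separated -> spider_options["include_paths"]
    "excludes": "blog/*,/about/*",  # comma-separated -> spider_options["exclude_paths"]
    "only_main_content": True,      # page_options["only_main_content"]
}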
<?xml version="1.0" encoding="utf-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 500 500">
  <path style="fill: rgb(0, 23, 87); stroke: rgb(13, 14, 52);" d="M 247.794 213.903 L 246.81 76.976 L 254.345 76.963 L 254.592 213.989 L 247.794 213.903 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.025" cy="43.859" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 282.472 260.389 L 414.181 330.253 L 410.563 336.234 L 279.38 265.739 L 282.472 260.389 Z"/>
  <path style="fill: rgb(15, 17, 57); stroke: rgb(13, 14, 52);" d="M 255.105 281.394 L 254.485 417.656 L 246.156 417.691 L 246.688 280.51 L 255.105 281.394 Z"/>
  <path style="paint-order: fill; fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 279.486 229.517 L 410.351 160.07 L 413.923 167.04 L 283.727 235.998 L 279.486 229.517 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 88.545 164.884 L 219.797 236.07 L 222.867 229.568 L 90.887 159.47 L 88.545 164.884 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 224.76 266.9 L 95.55 334.829 L 92.878 328.37 L 219.955 261.275 L 224.76 266.9 Z"/>
  <ellipse style="paint-order: fill; fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="251.242" cy="247.466" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 279.502 433.617 L 408.666 359.443 C 408.666 359.443 412.398 366.965 412.398 366.916 C 412.398 366.867 281.544 440.217 281.544 440.217 L 279.502 433.617 Z"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 223.119 431.408 L 96.643 361.068 L 93.265 368.047 L 218.895 438.099 L 223.119 431.408 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.504" cy="451.168" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(90, 191, 187); stroke: rgb(90, 191, 187);" d="M 435.665 180.895 L 435.859 316.869 L 443.103 315.579 L 442.56 180.697 L 435.665 180.895 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="441.06" cy="349.665" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="441.512" cy="147.767" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(84, 187, 181); stroke: rgb(84, 187, 181);" d="M 64.755 314.523 L 57.928 315.006 L 58.307 182.961 L 65.169 182.865 L 64.755 314.523 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="58.177" cy="149.757" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(61, 224, 203); stroke: rgb(61, 224, 203);" cx="65.909" cy="348.17" rx="33.966" ry="33.906"/>
</svg>
  background-image: url(../assets/jina.png);
  background-size: 16px;
}

.watercrawlLogo {
  @apply w-5 h-5 bg-center bg-no-repeat inline-block;
  /*background-color: #F5FAFF;*/
  background-image: url(../assets/watercrawl.svg);
  background-size: 16px;
}
import s from './index.module.css'
import NoData from './no-data'
import Firecrawl from './firecrawl'
import Watercrawl from './watercrawl'
import JinaReader from './jina-reader'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'

    // If users have configured one of the providers, select it.
    const availableProviders = res.sources.filter((item: DataSourceItem) =>
      [
        DataSourceProvider.jinaReader,
        DataSourceProvider.fireCrawl,
        DataSourceProvider.waterCrawl,
      ].includes(item.provider),
    )

    if (availableProviders.length > 0)

  if (!isLoaded)
    return null

  const source = sources.find(source => source.provider === selectedProvider)

  return (
    <div>
      <div className="mb-4">

          )}
          onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
        >
          <span className={cn(s.jinaLogo, 'mr-2')}/>
          <span>Jina Reader</span>
        </button>
        <button

        >
          🔥 Firecrawl
        </button>
        <button
          className={cn('flex items-center justify-center rounded-lg px-4 py-2',
            selectedProvider === DataSourceProvider.waterCrawl
              ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
              : `system-sm-regular border border-components-option-card-option-border bg-components-option-card-option-bg text-text-secondary
              hover:border-components-option-card-option-border-hover hover:bg-components-option-card-option-bg-hover hover:shadow-xs hover:shadow-shadow-shadow-3`,
          )}
          onClick={() => setSelectedProvider(DataSourceProvider.waterCrawl)}
        >
          <span className={cn(s.watercrawlLogo, 'mr-2')}/>
          <span>WaterCrawl</span>
        </button>
        </div>
      </div>
      {source && selectedProvider === DataSourceProvider.fireCrawl && (
        <Firecrawl
          onPreview={onPreview}
          checkedCrawlResult={checkedCrawlResult}
          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
          onJobIdChange={onJobIdChange}
          crawlOptions={crawlOptions}
          onCrawlOptionsChange={onCrawlOptionsChange}
        />
      )}
      {source && selectedProvider === DataSourceProvider.waterCrawl && (
        <Watercrawl
          onPreview={onPreview}
          checkedCrawlResult={checkedCrawlResult}
          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
          onJobIdChange={onJobIdChange}
          crawlOptions={crawlOptions}
          onCrawlOptionsChange={onCrawlOptionsChange}
        />
      )}
      {source && selectedProvider === DataSourceProvider.jinaReader && (
        <JinaReader
          onPreview={onPreview}
          checkedCrawlResult={checkedCrawlResult}
          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
          onJobIdChange={onJobIdChange}
          crawlOptions={crawlOptions}
          onCrawlOptionsChange={onCrawlOptionsChange}
        />
      )}
      {!source && (
        <NoData onConfig={handleOnConfig} provider={selectedProvider}/>
      )}
    </div>
  )
}
      title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
      description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
    },
    [DataSourceProvider.waterCrawl]: {
      emoji: <span className={s.watercrawlLogo} />,
      title: t(`${I18N_PREFIX}.watercrawlNotConfigured`),
      description: t(`${I18N_PREFIX}.watercrawlNotConfiguredDescription`),
    },
  }

  const currentProvider = providerConfig[provider]
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
import Button from '@/app/components/base/button'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onSetting: () => void
}

const Header: FC<Props> = ({
  onSetting,
}) => {
  const { t } = useTranslation()

  return (
    <div className='flex h-6 items-center justify-between'>
      <div className='flex items-center'>
        <div className='text-base font-medium text-text-secondary'>{t(`${I18N_PREFIX}.watercrawlTitle`)}</div>
        <div className='ml-2 mr-2 w-px h-3.5 bg-divider-regular' />
        <Button className='flex items-center gap-x-[1px] h-6 px-1.5' onClick={onSetting}>
          <RiEqualizer2Line className='w-3.5 h-3.5 text-components-button-secondary-text' />
          <span className='text-components-button-secondary-text text-xs font-medium px-[3px]'>
            {t(`${I18N_PREFIX}.configureWatercrawl`)}
          </span>
        </Button>
      </div>
      <a
        href='https://docs.watercrawl.dev/'
        target='_blank'
        rel='noopener noreferrer'
        className='inline-flex items-center gap-x-1 text-xs font-medium text-text-accent'
      >
        <RiBookOpenLine className='w-3.5 h-3.5 text-text-accent' />
        <span>{t(`${I18N_PREFIX}.watercrawlDoc`)}</span>
      </a>
    </div>
  )
}
export default React.memo(Header)
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Toast from '@/app/components/base/toast'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'

const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onPreview: (payload: CrawlResultItem) => void
  checkedCrawlResult: CrawlResultItem[]
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  onJobIdChange: (jobId: string) => void
  crawlOptions: CrawlOptions
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}

enum Step {
  init = 'init',
  running = 'running',
  finished = 'finished',
}

const WaterCrawl: FC<Props> = ({
  onPreview,
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])
  const { setShowAccountSettingModal } = useModalContext()
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const checkValid = useCallback((url: string) => {
    let errorMsg = ''
    if (!url) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: 'url',
      })
    }
    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: t(`${I18N_PREFIX}.limit`),
      })
    }
    return {
      isValid: !errorMsg,
      errorMsg,
    }
  }, [crawlOptions, t])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
    try {
      const res = await checkWatercrawlTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
          },
        }
      }
      if (res.status === 'error' || !res.status) {
        // can't get the error message from the watercrawl api
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // update the progress
      setCrawlResult({
        ...res,
        total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId)
    }
    catch (e: any) {
      const errorBody = await e.json()
      return {
        isError: true,
        errorMessage: errorBody.message,
        data: {
          data: [],
        },
      }
    }
  }, [crawlOptions.limit, onCheckedCrawlResultChange])

  const handleRun = useCallback(async (url: string) => {
    const { isValid, errorMsg } = checkValid(url)
    if (!isValid) {
      Toast.notify({
        message: errorMsg!,
        type: 'error',
      })
      return
    }
    setStep(Step.running)
    try {
      const passToServerCrawlOptions: any = {
        ...crawlOptions,
      }
      if (crawlOptions.max_depth === '')
        delete passToServerCrawlOptions.max_depth

      const res = await createWatercrawlTask({
        url,
        options: passToServerCrawlOptions,
      }) as any
      const jobId = res.job_id
      onJobIdChange(jobId)
      const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
      if (isError) {
        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
      }
      else {
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
        setCrawlErrorMessage('')
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.log(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header onSetting={handleSetting} />
      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle p-4 pb-0'>
        <UrlInput onRun={handleRun} isRunning={isRunning} />
        <OptionsWrap
          className='mt-4'
          controlFoldOptions={controlFoldOptions}
        >
          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
        </OptionsWrap>

        {!isInit && (
          <div className='relative left-[-16px] mt-3 w-[calc(100%_+_32px)] rounded-b-xl'>
            {isRunning
              && <Crawling
                className='mt-2'
                crawledNum={crawlResult?.current || 0}
                totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
              />}
            {showError && (
              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
            )}
            {isCrawlFinished && !showError
              && <CrawledResult
                className='mb-2'
                list={crawlResult?.data || []}
                checkedList={checkedCrawlResult}
                onSelectedChange={onCheckedCrawlResultChange}
                onPreview={onPreview}
                usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
              />
            }
          </div>
        )}
      </div>
    </div>
  )
}
export default React.memo(WaterCrawl)
'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  className?: string
  payload: CrawlOptions
  onChange: (payload: CrawlOptions) => void
}

const Options: FC<Props> = ({
  className = '',
  payload,
  onChange,
}) => {
  const { t } = useTranslation()

  const handleChange = useCallback((key: keyof CrawlOptions) => {
    return (value: any) => {
      onChange({
        ...payload,
        [key]: value,
      })
    }
  }, [payload, onChange])
  return (
    <div className={cn(className, ' space-y-2')}>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.crawlSubPage`)}
        isChecked={payload.crawl_sub_pages}
        onChange={handleChange('crawl_sub_pages')}
        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
      />
      <div className='flex justify-between space-x-4'>
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.maxDepth`)}
          value={payload.max_depth}
          onChange={handleChange('max_depth')}
          isNumber
          tooltip={t(`${I18N_PREFIX}.maxDepthTooltip`)!}
        />
      </div>
      <div className='flex justify-between space-x-4'>
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.excludePaths`)}
          value={payload.excludes}
          onChange={handleChange('excludes')}
          placeholder='blog/*, /about/*'
        />
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.includeOnlyPaths`)}
          value={payload.includes}
          onChange={handleChange('includes')}
          placeholder='articles/*'
        />
      </div>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.extractOnlyMainContent`)}
        isChecked={payload.only_main_content}
        onChange={handleChange('only_main_content')}
        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
      />
    </div>
  )
}
export default React.memo(Options)
'use client'
import type { FC } from 'react'
import React, { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import {
  PortalToFollowElem,
  PortalToFollowElemContent,
} from '@/app/components/base/portal-to-follow-elem'
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
import Button from '@/app/components/base/button'
import type { WatercrawlConfig } from '@/models/common'
import Field from '@/app/components/datasets/create/website/base/field'
import Toast from '@/app/components/base/toast'
import { createDataSourceApiKeyBinding } from '@/service/datasets'
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'

type Props = {
  onCancel: () => void
  onSaved: () => void
}

const I18N_PREFIX = 'datasetCreation.watercrawl'
const DEFAULT_BASE_URL = 'https://app.watercrawl.dev'

const ConfigWatercrawlModal: FC<Props> = ({
  onCancel,
  onSaved,
}) => {
  const { t } = useTranslation()
  const [isSaving, setIsSaving] = useState(false)
  const [config, setConfig] = useState<WatercrawlConfig>({
    api_key: '',
    base_url: '',
  })

  const handleConfigChange = useCallback((key: string) => {
    return (value: string | number) => {
      setConfig(prev => ({ ...prev, [key]: value as string }))
    }
  }, [])

  const handleSave = useCallback(async () => {
    if (isSaving)
      return
    let errorMsg = ''
    if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://'))))
      errorMsg = t('common.errorMsg.urlError')
    if (!errorMsg) {
      if (!config.api_key) {
        errorMsg = t('common.errorMsg.fieldRequired', {
          field: 'API Key',
        })
      }
    }

    if (errorMsg) {
      Toast.notify({
        type: 'error',
        message: errorMsg,
      })
      return
    }
    const postData = {
      category: 'website',
      provider: 'watercrawl',
      credentials: {
        auth_type: 'x-api-key',
        config: {
          api_key: config.api_key,
          base_url: config.base_url || DEFAULT_BASE_URL,
        },
      },
    }
    try {
      setIsSaving(true)
      await createDataSourceApiKeyBinding(postData)
      Toast.notify({
        type: 'success',
        message: t('common.api.success'),
      })
    }
    finally {
      setIsSaving(false)
    }

    onSaved()
  }, [config.api_key, config.base_url, onSaved, t, isSaving])

  return (
    <PortalToFollowElem open>
      <PortalToFollowElemContent className='w-full h-full z-[60]'>
        <div className='fixed inset-0 flex items-center justify-center bg-background-overlay'>
          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-components-panel-bg shadow-xl rounded-2xl overflow-y-auto'>
            <div className='px-8 pt-8'>
              <div className='flex justify-between items-center mb-4'>
                <div className='system-xl-semibold text-text-primary'>{t(`${I18N_PREFIX}.configWatercrawl`)}</div>
              </div>

              <div className='space-y-4'>
                <Field
                  label='API Key'
                  labelClassName='!text-sm'
                  isRequired
                  value={config.api_key}
                  onChange={handleConfigChange('api_key')}
                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
                />
                <Field
                  label='Base URL'
                  labelClassName='!text-sm'
                  value={config.base_url}
                  onChange={handleConfigChange('base_url')}
                  placeholder={DEFAULT_BASE_URL}
                />
              </div>
              <div className='my-8 flex justify-between items-center h-8'>
                <a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-text-accent' target='_blank' rel='noopener noreferrer' href='https://app.watercrawl.dev/'>
                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
                  <LinkExternal02 className='w-3 h-3' />
                </a>
                <div className='flex'>
                  <Button
                    size='large'
                    className='mr-2'
                    onClick={onCancel}
                  >
                    {t('common.operation.cancel')}
                  </Button>
                  <Button
                    variant='primary'
                    size='large'
                    onClick={handleSave}
                    loading={isSaving}
                  >
                    {t('common.operation.save')}
                  </Button>
                </div>
              </div>
            </div>
            <div className='border-t-[0.5px] border-t-divider-regular'>
              <div className='flex justify-center items-center py-3 bg-background-section-burn text-xs text-text-tertiary'>
                <Lock01 className='mr-1 w-3 h-3 text-text-tertiary' />
                {t('common.modelProvider.encrypted.front')}
                <a
                  className='text-text-accent mx-1'
                  target='_blank' rel='noopener noreferrer'
                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
                >
                  PKCS1_OAEP
                </a>
                {t('common.modelProvider.encrypted.back')}
              </div>
            </div>
          </div>
        </div>
      </PortalToFollowElemContent>
    </PortalToFollowElem>
  )
}
export default React.memo(ConfigWatercrawlModal)
import Panel from '../panel'
import { DataSourceType } from '../panel/types'
import ConfigFirecrawlModal from './config-firecrawl-modal'
import ConfigWatercrawlModal from './config-watercrawl-modal'
import ConfigJinaReaderModal from './config-jina-reader-modal'
import cn from '@/utils/classnames'
import s from '@/app/components/datasets/create/website/index.module.css'
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
import type { DataSourceItem } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { useAppContext } from '@/context/app-context'
import Toast from '@/app/components/base/toast'

type Props = {

    return source?.id
  }

  const getProviderName = (provider: DataSourceProvider): string => {
    if (provider === DataSourceProvider.fireCrawl)
      return 'Firecrawl'
    if (provider === DataSourceProvider.waterCrawl)
      return 'WaterCrawl'
    return 'Jina Reader'
  }

  const handleRemove = useCallback((provider: DataSourceProvider) => {
    return async () => {
      const dataSourceId = getIdByProvider(provider)

        readOnly={!isCurrentWorkspaceManager}
        configuredList={sources.filter(item => item.provider === provider).map(item => ({
          id: item.id,
          logo: ({ className }: { className: string }) => {
            if (item.provider === DataSourceProvider.fireCrawl) {
              return (
                <div
                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>🔥</div>
              )
            }
            if (item.provider === DataSourceProvider.waterCrawl) {
              return (
                <div
                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
                  <span className={s.watercrawlLogo}/>
                </div>
              )
            }
            return (
              <div
                className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
                <span className={s.jinaLogo}/>
              </div>
            )
          },
          name: getProviderName(item.provider),
          isActive: true,
        }))}
        onRemove={handleRemove(provider)}
      />
      {configTarget === DataSourceProvider.fireCrawl && (
        <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
      )}
      {configTarget === DataSourceProvider.waterCrawl && (
        <ConfigWatercrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
      )}
      {configTarget === DataSourceProvider.jinaReader && (
        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig}/>
      )}
    </>

      <DataSourceNotion workspaces={notionWorkspaces} />
      <DataSourceWebsite provider={DataSourceProvider.jinaReader} />
      <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
      <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
    </div>
  )
}
  const isNotion = type === DataSourceType.notion
  const isWebsite = type === DataSourceType.website

  const getProviderName = (): string => {
    if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl'
    if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl'
    return 'Jina Reader'
  }

  return (
    <div className='mb-2 rounded-xl bg-background-section-burn'>
      <div className='flex items-center px-3 py-[9px]'>
        <div className='text-sm font-medium text-text-primary'>{t(`common.dataSource.${type}.title`)}</div>
        {isWebsite && (
          <div className='ml-1 rounded-md bg-components-badge-white-to-dark px-1.5 text-xs font-medium leading-[18px] text-text-secondary'>
            <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> {getProviderName()}
          </div>
        )}
      </div>
    apiKeyPlaceholder: 'API key from firecrawl.dev',
    getApiKeyLinkText: 'Get your API key from firecrawl.dev',
  },
  watercrawl: {
    configWatercrawl: 'Configure WaterCrawl',
    apiKeyPlaceholder: 'API key from watercrawl.dev',
    getApiKeyLinkText: 'Get your API key from watercrawl.dev',
  },
  jinaReader: {
    configJinaReader: 'Configure Jina Reader',
    apiKeyPlaceholder: 'API key from jina.ai',

    chooseProvider: 'Select a provider',
    fireCrawlNotConfigured: 'Firecrawl is not configured',
    fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
    watercrawlNotConfigured: 'WaterCrawl is not configured',
    watercrawlNotConfiguredDescription: 'Configure WaterCrawl with API key to use it.',
    jinaReaderNotConfigured: 'Jina Reader is not configured',
    jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
    configure: 'Configure',
    configureFirecrawl: 'Configure Firecrawl',
    configureWatercrawl: 'Configure WaterCrawl',
    configureJinaReader: 'Configure Jina Reader',
    run: 'Run',
    firecrawlTitle: 'Extract web content with 🔥Firecrawl',
    firecrawlDoc: 'Firecrawl docs',
    firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
    watercrawlTitle: 'Extract web content with WaterCrawl',
    watercrawlDoc: 'WaterCrawl docs',
    watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
    jinaReaderTitle: 'Convert the entire site to Markdown',
    jinaReaderDoc: 'Learn more about Jina Reader',
    jinaReaderDocLink: 'https://jina.ai/reader',
export enum DataSourceProvider {
  fireCrawl = 'firecrawl',
  jinaReader = 'jinareader',
  waterCrawl = 'watercrawl',
}

export type FirecrawlConfig = {
  base_url: string
}

export type WatercrawlConfig = {
  api_key: string
  base_url: string
}

export type DataSourceItem = {
  id: string
  category: DataSourceCategory
  })
}

export const createWatercrawlTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
  return post<CommonResponse>('website/crawl', {
    body: {
      ...body,
      provider: DataSourceProvider.waterCrawl,
    },
  })
}

export const checkWatercrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
  return get<CommonResponse>(`website/crawl/status/${jobId}`, {
    params: {
      provider: DataSourceProvider.waterCrawl,
    },
  }, {
    silent: true,
  })
}

type FileTypesRes = {
  allowed_extensions: string[]
}