@@ -1,98 +1,93 @@
 import time
 
+from collections.abc import Mapping
+from typing import Any
+
 import requests
+from requests.exceptions import HTTPError
 
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")
 
-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
+
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None
 
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response
 
-    def check_crawl_status(self, job_id) -> dict:
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response
 
-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id
 
-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response
 
-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
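
For reference, a minimal usage sketch of the refactored client. The `firecrawl` import path and the placeholder API key are assumptions for illustration, not part of this diff:

    # Hypothetical usage; import path and API key are placeholders.
    from firecrawl import FirecrawlApp

    app = FirecrawlApp(api_key='YOUR_API_KEY')

    # Scrape a single page; extra keyword arguments are merged into the JSON body.
    page = app.scrape_url('https://example.com')

    # Start a crawl and poll for it manually via the returned job id...
    job_id = app.crawl_url('https://example.com')
    status = app.check_crawl_status(job_id)

    # ...or block until the job completes, checking every 10 seconds.
    result = app.crawl_url('https://example.com', wait=True, poll_interval=10)

With the default retries=3 and backoff_factor=0.3, a failing request in _request is retried after 0.3 s and again after 0.6 s; a failure on the final attempt re-raises the underlying requests exception to the caller.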