| @@ -13,9 +13,10 @@ class FirecrawlWebExtractor(BaseExtractor): | |||
| api_key: The API key for Firecrawl. | |||
| base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'. | |||
| mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. | |||
| only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. | |||
| """ | |||
| def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False): | |||
| def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True): | |||
| """Initialize with url, job_id, tenant_id, mode and only_main_content.""" | |||
| self._url = url | |||
| self.job_id = job_id | |||
| @@ -21,8 +21,8 @@ class FirecrawlAuth(ApiKeyAuthBase): | |||
| headers = self._prepare_headers() | |||
| options = { | |||
| "url": "https://example.com", | |||
| "excludes": [], | |||
| "includes": [], | |||
| "includePaths": [], | |||
| "excludePaths": [], | |||
| "limit": 1, | |||
| "scrapeOptions": {"onlyMainContent": True}, | |||
| } | |||
| @@ -38,9 +38,8 @@ class WebsiteService: | |||
| only_main_content = options.get("only_main_content", False) | |||
| if not crawl_sub_pages: | |||
| params = { | |||
| "includes": [], | |||
| "excludes": [], | |||
| "generateImgAltText": True, | |||
| "includePaths": [], | |||
| "excludePaths": [], | |||
| "limit": 1, | |||
| "scrapeOptions": {"onlyMainContent": only_main_content}, | |||
| } | |||
| @@ -48,9 +47,8 @@ class WebsiteService: | |||
| includes = options.get("includes").split(",") if options.get("includes") else [] | |||
| excludes = options.get("excludes").split(",") if options.get("excludes") else [] | |||
| params = { | |||
| "includes": includes, | |||
| "excludes": excludes, | |||
| "generateImgAltText": True, | |||
| "includePaths": includes, | |||
| "excludePaths": excludes, | |||
| "limit": options.get("limit", 1), | |||
| "scrapeOptions": {"onlyMainContent": only_main_content}, | |||
| } | |||
| @@ -10,9 +10,8 @@ def test_firecrawl_web_extractor_crawl_mode(mocker): | |||
| base_url = "https://api.firecrawl.dev" | |||
| firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url) | |||
| params = { | |||
| "includes": [], | |||
| "excludes": [], | |||
| "generateImgAltText": True, | |||
| "includePaths": [], | |||
| "excludePaths": [], | |||
| "maxDepth": 1, | |||
| "limit": 1, | |||
| } | |||