| @@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource): | |||
| @account_initialization_required | |||
| def post(self): | |||
| parser = reqparse.RequestParser() | |||
| parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json") | |||
| parser.add_argument( | |||
| "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" | |||
| ) | |||
| parser.add_argument("url", type=str, required=True, nullable=True, location="json") | |||
| parser.add_argument("options", type=dict, required=True, nullable=True, location="json") | |||
| args = parser.parse_args() | |||
| @@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource): | |||
| @account_initialization_required | |||
| def get(self, job_id: str): | |||
| parser = reqparse.RequestParser() | |||
| parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args") | |||
| parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") | |||
| args = parser.parse_args() | |||
| # get crawl status | |||
| try: | |||
| @@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting | |||
| from core.rag.extractor.excel_extractor import ExcelExtractor | |||
| from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor | |||
| from core.rag.extractor.html_extractor import HtmlExtractor | |||
| from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor | |||
| from core.rag.extractor.markdown_extractor import MarkdownExtractor | |||
| from core.rag.extractor.notion_extractor import NotionExtractor | |||
| from core.rag.extractor.pdf_extractor import PdfExtractor | |||
| @@ -171,6 +172,15 @@ class ExtractProcessor: | |||
| only_main_content=extract_setting.website_info.only_main_content, | |||
| ) | |||
| return extractor.extract() | |||
| elif extract_setting.website_info.provider == "jinareader": | |||
| extractor = JinaReaderWebExtractor( | |||
| url=extract_setting.website_info.url, | |||
| job_id=extract_setting.website_info.job_id, | |||
| tenant_id=extract_setting.website_info.tenant_id, | |||
| mode=extract_setting.website_info.mode, | |||
| only_main_content=extract_setting.website_info.only_main_content, | |||
| ) | |||
| return extractor.extract() | |||
| else: | |||
| raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}") | |||
| else: | |||
| @@ -0,0 +1,35 @@ | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| from services.website_service import WebsiteService | |||
class JinaReaderWebExtractor(BaseExtractor):
    """Extractor that turns a page crawled via Jina Reader into a ``Document``.

    The page payload is looked up through
    ``WebsiteService.get_crawl_url_data`` using the ``"jinareader"`` provider
    key; this class only reshapes that payload.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """Store the lookup parameters for a later :meth:`extract` call.

        Args:
            url: Page URL whose crawled content is wanted.
            job_id: Crawl job identifier forwarded to ``WebsiteService``.
            tenant_id: Tenant identifier forwarded to ``WebsiteService``.
            mode: Extraction mode; only ``"crawl"`` yields any documents.
            only_main_content: Kept on the instance; not read by this class.
        """
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Return the crawled page as a single-item list of ``Document``.

        Returns:
            ``[]`` when ``mode`` is not ``"crawl"`` or when the service has no
            data for the URL; otherwise one ``Document`` whose content is the
            payload's ``content`` field (``""`` if absent) and whose metadata
            carries ``source_url``, ``description`` and ``title``.
        """
        # Guard clause: every mode other than "crawl" produces nothing.
        if self.mode != "crawl":
            return []

        page = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
        if page is None:
            return []

        content = page.get("content", "")
        metadata = {
            "source_url": page.get("url"),
            "description": page.get("description"),
            "title": page.get("title"),
        }
        return [Document(page_content=content, metadata=metadata)]
| @@ -1,10 +1,13 @@ | |||
| from services.auth.firecrawl import FirecrawlAuth | |||
| from services.auth.jina import JinaAuth | |||
| class ApiKeyAuthFactory: | |||
def __init__(self, provider: str, credentials: dict):
    """Build the auth helper that matches ``provider``.

    Args:
        provider: ``"firecrawl"`` or ``"jinareader"``.
        credentials: Credential mapping handed unchanged to the selected
            auth class.

    Raises:
        ValueError: If ``provider`` is neither of the supported names.
    """
    # Reject unknown providers first so the happy path below stays flat.
    if provider not in ("firecrawl", "jinareader"):
        raise ValueError("Invalid provider")

    auth_cls = FirecrawlAuth if provider == "firecrawl" else JinaAuth
    self.auth = auth_cls(credentials)
| @@ -0,0 +1,44 @@ | |||
| import json | |||
| import requests | |||
| from services.auth.api_key_auth_base import ApiKeyAuthBase | |||
class JinaAuth(ApiKeyAuthBase):
    """Bearer-token credential check for the Jina Reader provider.

    Expects ``credentials`` shaped like
    ``{"auth_type": "bearer", "config": {"api_key": "..."}}``.
    """

    def __init__(self, credentials: dict):
        """Validate the credential shape and keep the API key.

        Args:
            credentials: Mapping with ``auth_type`` and a ``config`` mapping
                that holds ``api_key``.

        Raises:
            ValueError: If ``auth_type`` is not ``"bearer"`` or no API key is
                present (including when ``config`` is missing or null).
        """
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "bearer":
            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
        # A missing or null "config" must end in the ValueError below rather
        # than an AttributeError from calling .get() on None.
        config = credentials.get("config") or {}
        self.api_key = config.get("api_key")
        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        """Send one probe request to ``https://r.jina.ai`` with the API key.

        Returns:
            ``True`` when the service answers with HTTP 200.

        Raises:
            Exception: For any other status code (see :meth:`_handle_error`).
        """
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
        }
        response = self._post_request("https://r.jina.ai", options, headers)
        if response.status_code == 200:
            return True
        self._handle_error(response)

    def _prepare_headers(self):
        """Return the JSON content-type header plus the bearer authorization."""
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers):
        """POST ``data`` as JSON to ``url`` and return the response object."""
        # NOTE(review): no timeout is passed, so a stalled connection blocks
        # the caller; consider adding one once an acceptable value is agreed.
        return requests.post(url, headers=headers, json=data)

    def _handle_error(self, response):
        """Raise an ``Exception`` describing a non-200 response.

        Status codes 402, 409 and 500, and any other status that carries a
        body, produce a "Failed to authorize" message with the body's
        ``error`` field when one can be read. A body-less response with any
        other status produces the "Unexpected error" message.
        """
        status = response.status_code
        if status not in {402, 409, 500} and not response.text:
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {status}")
        error_message = self._extract_error_message(response)
        raise Exception(f"Failed to authorize. Status code: {status}. Error: {error_message}")

    @staticmethod
    def _extract_error_message(response):
        """Read the ``error`` field from a JSON body, tolerating bad bodies."""
        fallback = "Unknown error occurred"
        try:
            payload = json.loads(response.text)
        except ValueError:
            # Empty or non-JSON body (e.g. an HTML gateway page): report the
            # HTTP status with a generic message instead of a decode error.
            return fallback
        if not isinstance(payload, dict):
            return fallback
        return payload.get("error", fallback)
| @@ -1,6 +1,7 @@ | |||
| import datetime | |||
| import json | |||
| import requests | |||
| from flask_login import current_user | |||
| from core.helper import encrypter | |||
| @@ -65,6 +66,35 @@ class WebsiteService: | |||
| time = str(datetime.datetime.now().timestamp()) | |||
| redis_client.setex(website_crawl_time_cache_key, 3600, time) | |||
| return {"status": "active", "job_id": job_id} | |||
| elif provider == "jinareader": | |||
| api_key = encrypter.decrypt_token( | |||
| tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") | |||
| ) | |||
| crawl_sub_pages = options.get("crawl_sub_pages", False) | |||
| if not crawl_sub_pages: | |||
| response = requests.get( | |||
| f"https://r.jina.ai/{url}", | |||
| headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| ) | |||
| if response.json().get("code") != 200: | |||
| raise ValueError("Failed to crawl") | |||
| return {"status": "active", "data": response.json().get("data")} | |||
| else: | |||
| response = requests.post( | |||
| "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app", | |||
| json={ | |||
| "url": url, | |||
| "maxPages": options.get("limit", 1), | |||
| "useSitemap": options.get("use_sitemap", True), | |||
| }, | |||
| headers={ | |||
| "Content-Type": "application/json", | |||
| "Authorization": f"Bearer {api_key}", | |||
| }, | |||
| ) | |||
| if response.json().get("code") != 200: | |||
| raise ValueError("Failed to crawl") | |||
| return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")} | |||
| else: | |||
| raise ValueError("Invalid provider") | |||
| @@ -93,6 +123,42 @@ class WebsiteService: | |||
| time_consuming = abs(end_time - float(start_time)) | |||
| crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" | |||
| redis_client.delete(website_crawl_time_cache_key) | |||
| elif provider == "jinareader": | |||
| api_key = encrypter.decrypt_token( | |||
| tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") | |||
| ) | |||
| response = requests.post( | |||
| "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", | |||
| headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| json={"taskId": job_id}, | |||
| ) | |||
| data = response.json().get("data", {}) | |||
| crawl_status_data = { | |||
| "status": data.get("status", "active"), | |||
| "job_id": job_id, | |||
| "total": len(data.get("urls", [])), | |||
| "current": len(data.get("processed", [])) + len(data.get("failed", [])), | |||
| "data": [], | |||
| "time_consuming": data.get("duration", 0) / 1000, | |||
| } | |||
| if crawl_status_data["status"] == "completed": | |||
| response = requests.post( | |||
| "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", | |||
| headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, | |||
| ) | |||
| data = response.json().get("data", {}) | |||
| formatted_data = [ | |||
| { | |||
| "title": item.get("data", {}).get("title"), | |||
| "source_url": item.get("data", {}).get("url"), | |||
| "description": item.get("data", {}).get("description"), | |||
| "markdown": item.get("data", {}).get("content"), | |||
| } | |||
| for item in data.get("processed", {}).values() | |||
| ] | |||
| crawl_status_data["data"] = formatted_data | |||
| else: | |||
| raise ValueError("Invalid provider") | |||
| return crawl_status_data | |||
| @@ -119,6 +185,40 @@ class WebsiteService: | |||
| if item.get("source_url") == url: | |||
| return item | |||
| return None | |||
| elif provider == "jinareader": | |||
| file_key = "website_files/" + job_id + ".txt" | |||
| if storage.exists(file_key): | |||
| data = storage.load_once(file_key) | |||
| if data: | |||
| data = json.loads(data.decode("utf-8")) | |||
| elif not job_id: | |||
| response = requests.get( | |||
| f"https://r.jina.ai/{url}", | |||
| headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| ) | |||
| if response.json().get("code") != 200: | |||
| raise ValueError("Failed to crawl") | |||
| return response.json().get("data") | |||
| else: | |||
| api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) | |||
| response = requests.post( | |||
| "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", | |||
| headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| json={"taskId": job_id}, | |||
| ) | |||
| data = response.json().get("data", {}) | |||
| if data.get("status") != "completed": | |||
| raise ValueError("Crawl job is not completed") | |||
| response = requests.post( | |||
| "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app", | |||
| headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}, | |||
| json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())}, | |||
| ) | |||
| data = response.json().get("data", {}) | |||
| for item in data.get("processed", {}).values(): | |||
| if item.get("data", {}).get("url") == url: | |||
| return item.get("data", {}) | |||
| else: | |||
| raise ValueError("Invalid provider") | |||
| @@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets' | |||
| import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets' | |||
| import { fetchDataSource } from '@/service/common' | |||
| import { fetchDatasetDetail } from '@/service/datasets' | |||
| import type { NotionPage } from '@/models/common' | |||
| import { DataSourceProvider, type NotionPage } from '@/models/common' | |||
| import { useModalContext } from '@/context/modal-context' | |||
| import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks' | |||
| @@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = { | |||
| excludes: '', | |||
| limit: 10, | |||
| max_depth: '', | |||
| use_sitemap: true, | |||
| } | |||
| const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { | |||
| @@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { | |||
| const updateFileList = (preparedFiles: FileItem[]) => { | |||
| setFiles(preparedFiles) | |||
| } | |||
| const [fireCrawlJobId, setFireCrawlJobId] = useState('') | |||
| const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl) | |||
| const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('') | |||
| const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => { | |||
| const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID) | |||
| @@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { | |||
| onStepChange={nextStep} | |||
| websitePages={websitePages} | |||
| updateWebsitePages={setWebsitePages} | |||
| onFireCrawlJobIdChange={setFireCrawlJobId} | |||
| onWebsiteCrawlProviderChange={setWebsiteCrawlProvider} | |||
| onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId} | |||
| crawlOptions={crawlOptions} | |||
| onCrawlOptionsChange={setCrawlOptions} | |||
| /> | |||
| @@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => { | |||
| files={fileList.map(file => file.file)} | |||
| notionPages={notionPages} | |||
| websitePages={websitePages} | |||
| fireCrawlJobId={fireCrawlJobId} | |||
| websiteCrawlProvider={websiteCrawlProvider} | |||
| websiteCrawlJobId={websiteCrawlJobId} | |||
| onStepChange={changeStep} | |||
| updateIndexingTypeCache={updateIndexingTypeCache} | |||
| updateResultCache={updateResultCache} | |||
| @@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview' | |||
| import s from './index.module.css' | |||
| import cn from '@/utils/classnames' | |||
| import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets' | |||
| import type { NotionPage } from '@/models/common' | |||
| import type { DataSourceProvider, NotionPage } from '@/models/common' | |||
| import { DataSourceType } from '@/models/datasets' | |||
| import Button from '@/app/components/base/button' | |||
| import { NotionPageSelector } from '@/app/components/base/notion-page-selector' | |||
| @@ -33,7 +33,8 @@ type IStepOneProps = { | |||
| changeType: (type: DataSourceType) => void | |||
| websitePages?: CrawlResultItem[] | |||
| updateWebsitePages: (value: CrawlResultItem[]) => void | |||
| onFireCrawlJobIdChange: (jobId: string) => void | |||
| onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void | |||
| onWebsiteCrawlJobIdChange: (jobId: string) => void | |||
| crawlOptions: CrawlOptions | |||
| onCrawlOptionsChange: (payload: CrawlOptions) => void | |||
| } | |||
| @@ -69,7 +70,8 @@ const StepOne = ({ | |||
| updateNotionPages, | |||
| websitePages = [], | |||
| updateWebsitePages, | |||
| onFireCrawlJobIdChange, | |||
| onWebsiteCrawlProviderChange, | |||
| onWebsiteCrawlJobIdChange, | |||
| crawlOptions, | |||
| onCrawlOptionsChange, | |||
| }: IStepOneProps) => { | |||
| @@ -229,7 +231,8 @@ const StepOne = ({ | |||
| onPreview={setCurrentWebsite} | |||
| checkedCrawlResult={websitePages} | |||
| onCheckedCrawlResultChange={updateWebsitePages} | |||
| onJobIdChange={onFireCrawlJobIdChange} | |||
| onCrawlProviderChange={onWebsiteCrawlProviderChange} | |||
| onJobIdChange={onWebsiteCrawlJobIdChange} | |||
| crawlOptions={crawlOptions} | |||
| onCrawlOptionsChange={onCrawlOptionsChange} | |||
| /> | |||
| @@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen | |||
| import Toast from '@/app/components/base/toast' | |||
| import { formatNumber } from '@/utils/format' | |||
| import type { NotionPage } from '@/models/common' | |||
| import { DataSourceProvider } from '@/models/common' | |||
| import { DataSourceType, DocForm } from '@/models/datasets' | |||
| import NotionIcon from '@/app/components/base/notion-icon' | |||
| import Switch from '@/app/components/base/switch' | |||
| @@ -63,7 +64,8 @@ type StepTwoProps = { | |||
| notionPages?: NotionPage[] | |||
| websitePages?: CrawlResultItem[] | |||
| crawlOptions?: CrawlOptions | |||
| fireCrawlJobId?: string | |||
| websiteCrawlProvider?: DataSourceProvider | |||
| websiteCrawlJobId?: string | |||
| onStepChange?: (delta: number) => void | |||
| updateIndexingTypeCache?: (type: string) => void | |||
| updateResultCache?: (res: createDocumentResponse) => void | |||
| @@ -94,7 +96,8 @@ const StepTwo = ({ | |||
| notionPages = [], | |||
| websitePages = [], | |||
| crawlOptions, | |||
| fireCrawlJobId = '', | |||
| websiteCrawlProvider = DataSourceProvider.fireCrawl, | |||
| websiteCrawlJobId = '', | |||
| onStepChange, | |||
| updateIndexingTypeCache, | |||
| updateResultCache, | |||
| @@ -260,8 +263,8 @@ const StepTwo = ({ | |||
| const getWebsiteInfo = () => { | |||
| return { | |||
| provider: 'firecrawl', | |||
| job_id: fireCrawlJobId, | |||
| provider: websiteCrawlProvider, | |||
| job_id: websiteCrawlJobId, | |||
| urls: websitePages.map(page => page.source_url), | |||
| only_main_content: crawlOptions?.only_main_content, | |||
| } | |||
| @@ -3,6 +3,7 @@ import type { FC } from 'react' | |||
| import React from 'react' | |||
| import cn from '@/utils/classnames' | |||
| import Checkbox from '@/app/components/base/checkbox' | |||
| import Tooltip from '@/app/components/base/tooltip' | |||
| type Props = { | |||
| className?: string | |||
| @@ -10,6 +11,7 @@ type Props = { | |||
| onChange: (isChecked: boolean) => void | |||
| label: string | |||
| labelClassName?: string | |||
| tooltip?: string | |||
| } | |||
| const CheckboxWithLabel: FC<Props> = ({ | |||
| @@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({ | |||
| onChange, | |||
| label, | |||
| labelClassName, | |||
| tooltip, | |||
| }) => { | |||
| return ( | |||
| <label className={cn(className, 'flex items-center h-7 space-x-2')}> | |||
| <Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} /> | |||
| <div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div> | |||
| {tooltip && ( | |||
| <Tooltip | |||
| popupContent={ | |||
| <div className='w-[200px]'>{tooltip}</div> | |||
| } | |||
| triggerClassName='ml-0.5 w-4 h-4' | |||
| /> | |||
| )} | |||
| </label> | |||
| ) | |||
| } | |||
| @@ -2,7 +2,7 @@ | |||
| import type { FC } from 'react' | |||
| import React, { useCallback } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import CheckboxWithLabel from './base/checkbox-with-label' | |||
| import CheckboxWithLabel from './checkbox-with-label' | |||
| import CrawledResultItem from './crawled-result-item' | |||
| import cn from '@/utils/classnames' | |||
| import type { CrawlResultItem } from '@/models/datasets' | |||
| @@ -2,13 +2,13 @@ | |||
| import type { FC } from 'react' | |||
| import React, { useCallback, useEffect, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import UrlInput from '../base/url-input' | |||
| import OptionsWrap from '../base/options-wrap' | |||
| import CrawledResult from '../base/crawled-result' | |||
| import Crawling from '../base/crawling' | |||
| import ErrorMessage from '../base/error-message' | |||
| import Header from './header' | |||
| import UrlInput from './base/url-input' | |||
| import OptionsWrap from './base/options-wrap' | |||
| import Options from './options' | |||
| import CrawledResult from './crawled-result' | |||
| import Crawling from './crawling' | |||
| import ErrorMessage from './base/error-message' | |||
| import cn from '@/utils/classnames' | |||
| import { useModalContext } from '@/context/modal-context' | |||
| import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' | |||
| @@ -2,8 +2,8 @@ | |||
| import type { FC } from 'react' | |||
| import React, { useCallback } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import CheckboxWithLabel from './base/checkbox-with-label' | |||
| import Field from './base/field' | |||
| import CheckboxWithLabel from '../base/checkbox-with-label' | |||
| import Field from '../base/field' | |||
| import cn from '@/utils/classnames' | |||
| import type { CrawlOptions } from '@/models/datasets' | |||
| @@ -0,0 +1,6 @@ | |||
/* Inline Jina logo tile: the PNG is centred, not repeated, and drawn at 16px
   over a light background colour. */
.jinaLogo {
  @apply inline-block w-4 h-4 bg-no-repeat bg-center;
  background-image: url(../assets/jina.png);
  background-size: 16px;
  background-color: #F5FAFF;
}
| @@ -1,8 +1,12 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import React, { useCallback, useEffect, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import s from './index.module.css' | |||
| import NoData from './no-data' | |||
| import Firecrawl from './firecrawl' | |||
| import JinaReader from './jina-reader' | |||
| import cn from '@/utils/classnames' | |||
| import { useModalContext } from '@/context/modal-context' | |||
| import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' | |||
| import { fetchDataSources } from '@/service/datasets' | |||
| @@ -12,6 +16,7 @@ type Props = { | |||
| onPreview: (payload: CrawlResultItem) => void | |||
| checkedCrawlResult: CrawlResultItem[] | |||
| onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void | |||
| onCrawlProviderChange: (provider: DataSourceProvider) => void | |||
| onJobIdChange: (jobId: string) => void | |||
| crawlOptions: CrawlOptions | |||
| onCrawlOptionsChange: (payload: CrawlOptions) => void | |||
| @@ -21,17 +26,32 @@ const Website: FC<Props> = ({ | |||
| onPreview, | |||
| checkedCrawlResult, | |||
| onCheckedCrawlResultChange, | |||
| onCrawlProviderChange, | |||
| onJobIdChange, | |||
| crawlOptions, | |||
| onCrawlOptionsChange, | |||
| }) => { | |||
| const { t } = useTranslation() | |||
| const { setShowAccountSettingModal } = useModalContext() | |||
| const [isLoaded, setIsLoaded] = useState(false) | |||
| const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false) | |||
| const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader) | |||
| const [sources, setSources] = useState<DataSourceItem[]>([]) | |||
| useEffect(() => { | |||
| onCrawlProviderChange(selectedProvider) | |||
| }, [selectedProvider, onCrawlProviderChange]) | |||
| const checkSetApiKey = useCallback(async () => { | |||
| const res = await fetchDataSources() as any | |||
| const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl) | |||
| setIsSetFirecrawlApiKey(isFirecrawlSet) | |||
| setSources(res.sources) | |||
| // If users have configured one of the providers, select it. | |||
| const availableProviders = res.sources.filter((item: DataSourceItem) => | |||
| [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider), | |||
| ) | |||
| if (availableProviders.length > 0) | |||
| setSelectedProvider(availableProviders[0].provider) | |||
| }, []) | |||
| useEffect(() => { | |||
| @@ -52,20 +72,66 @@ const Website: FC<Props> = ({ | |||
| return ( | |||
| <div> | |||
| {isSetFirecrawlApiKey | |||
| ? ( | |||
| <Firecrawl | |||
| onPreview={onPreview} | |||
| checkedCrawlResult={checkedCrawlResult} | |||
| onCheckedCrawlResultChange={onCheckedCrawlResultChange} | |||
| onJobIdChange={onJobIdChange} | |||
| crawlOptions={crawlOptions} | |||
| onCrawlOptionsChange={onCrawlOptionsChange} | |||
| /> | |||
| ) | |||
| : ( | |||
| <NoData onConfig={handleOnConfig} /> | |||
| )} | |||
| <div className="mb-4"> | |||
| <div className="font-medium text-gray-700 mb-2 h-6"> | |||
| {t('datasetCreation.stepOne.website.chooseProvider')} | |||
| </div> | |||
| <div className="flex space-x-2"> | |||
| <button | |||
| className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${ | |||
| selectedProvider === DataSourceProvider.jinaReader | |||
| ? 'bg-primary-50 text-primary-600' | |||
| : 'bg-gray-100 text-gray-600 hover:bg-gray-200' | |||
| }`} | |||
| onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)} | |||
| > | |||
| <span className={cn(s.jinaLogo, 'mr-2')} /> | |||
| <span>Jina Reader</span> | |||
| </button> | |||
| <button | |||
| className={`px-4 py-2 text-sm font-medium rounded-md ${ | |||
| selectedProvider === DataSourceProvider.fireCrawl | |||
| ? 'bg-primary-50 text-primary-600' | |||
| : 'bg-gray-100 text-gray-600 hover:bg-gray-200' | |||
| }`} | |||
| onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)} | |||
| > | |||
| 🔥 Firecrawl | |||
| </button> | |||
| </div> | |||
| </div> | |||
| { | |||
| selectedProvider === DataSourceProvider.fireCrawl | |||
| ? sources.find(source => source.provider === DataSourceProvider.fireCrawl) | |||
| ? ( | |||
| <Firecrawl | |||
| onPreview={onPreview} | |||
| checkedCrawlResult={checkedCrawlResult} | |||
| onCheckedCrawlResultChange={onCheckedCrawlResultChange} | |||
| onJobIdChange={onJobIdChange} | |||
| crawlOptions={crawlOptions} | |||
| onCrawlOptionsChange={onCrawlOptionsChange} | |||
| /> | |||
| ) | |||
| : ( | |||
| <NoData onConfig={handleOnConfig} provider={selectedProvider} /> | |||
| ) | |||
| : sources.find(source => source.provider === DataSourceProvider.jinaReader) | |||
| ? ( | |||
| <JinaReader | |||
| onPreview={onPreview} | |||
| checkedCrawlResult={checkedCrawlResult} | |||
| onCheckedCrawlResultChange={onCheckedCrawlResultChange} | |||
| onJobIdChange={onJobIdChange} | |||
| crawlOptions={crawlOptions} | |||
| onCrawlOptionsChange={onCrawlOptionsChange} | |||
| /> | |||
| ) | |||
| : ( | |||
| <NoData onConfig={handleOnConfig} provider={selectedProvider} /> | |||
| ) | |||
| } | |||
| </div> | |||
| ) | |||
| } | |||
| @@ -0,0 +1,42 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import React from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import { Settings01 } from '@/app/components/base/icons/src/vender/line/general' | |||
| import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education' | |||
| const I18N_PREFIX = 'datasetCreation.stepOne.website' | |||
| type Props = { | |||
| onSetting: () => void | |||
| } | |||
/**
 * Header row for the Jina Reader panel.
 *
 * Left side: the translated panel title followed by a settings icon that
 * calls `onSetting` when clicked. Right side: a link to the Jina Reader
 * documentation that opens in a new tab.
 */
const Header: FC<Props> = ({ onSetting }) => {
  const { t } = useTranslation()
  // Resolve the translated labels up front so the markup below stays compact.
  const titleText = t(`${I18N_PREFIX}.jinaReaderTitle`)
  const docLinkText = t(`${I18N_PREFIX}.jinaReaderDoc`)

  return (
    <div className='flex h-6 items-center justify-between'>
      <div className='flex items-center'>
        <div className='text-base font-medium text-gray-700'>{titleText}</div>
        {/* thin vertical divider between the title and the settings icon */}
        <div className='ml-2 mr-1 w-px h-3.5 bg-gray-200' />
        <div
          className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
          onClick={onSetting}
        >
          <Settings01 className='w-3.5 h-3.5 text-gray-500' />
        </div>
      </div>
      <a
        href='https://jina.ai/reader'
        target='_blank'
        rel='noopener noreferrer'
        className='flex items-center text-xs text-primary-600'
      >
        <BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
        {docLinkText}
      </a>
    </div>
  )
}
| export default React.memo(Header) | |||
| @@ -0,0 +1,232 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import React, { useCallback, useEffect, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import UrlInput from '../base/url-input' | |||
| import OptionsWrap from '../base/options-wrap' | |||
| import CrawledResult from '../base/crawled-result' | |||
| import Crawling from '../base/crawling' | |||
| import ErrorMessage from '../base/error-message' | |||
| import Header from './header' | |||
| import Options from './options' | |||
| import cn from '@/utils/classnames' | |||
| import { useModalContext } from '@/context/modal-context' | |||
| import Toast from '@/app/components/base/toast' | |||
| import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets' | |||
| import { sleep } from '@/utils' | |||
| import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' | |||
| const ERROR_I18N_PREFIX = 'common.errorMsg' | |||
| const I18N_PREFIX = 'datasetCreation.stepOne.website' | |||
| type Props = { | |||
| onPreview: (payload: CrawlResultItem) => void | |||
| checkedCrawlResult: CrawlResultItem[] | |||
| onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void | |||
| onJobIdChange: (jobId: string) => void | |||
| crawlOptions: CrawlOptions | |||
| onCrawlOptionsChange: (payload: CrawlOptions) => void | |||
| } | |||
| enum Step { | |||
| init = 'init', | |||
| running = 'running', | |||
| finished = 'finished', | |||
| } | |||
| const JinaReader: FC<Props> = ({ | |||
| onPreview, | |||
| checkedCrawlResult, | |||
| onCheckedCrawlResultChange, | |||
| onJobIdChange, | |||
| crawlOptions, | |||
| onCrawlOptionsChange, | |||
| }) => { | |||
| const { t } = useTranslation() | |||
| const [step, setStep] = useState<Step>(Step.init) | |||
| const [controlFoldOptions, setControlFoldOptions] = useState<number>(0) | |||
| useEffect(() => { | |||
| if (step !== Step.init) | |||
| setControlFoldOptions(Date.now()) | |||
| }, [step]) | |||
| const { setShowAccountSettingModal } = useModalContext() | |||
| const handleSetting = useCallback(() => { | |||
| setShowAccountSettingModal({ | |||
| payload: 'data-source', | |||
| }) | |||
| }, [setShowAccountSettingModal]) | |||
| const checkValid = useCallback((url: string) => { | |||
| let errorMsg = '' | |||
| if (!url) { | |||
| errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { | |||
| field: 'url', | |||
| }) | |||
| } | |||
| if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://')))) | |||
| errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`) | |||
| if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) { | |||
| errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { | |||
| field: t(`${I18N_PREFIX}.limit`), | |||
| }) | |||
| } | |||
| return { | |||
| isValid: !errorMsg, | |||
| errorMsg, | |||
| } | |||
| }, [crawlOptions, t]) | |||
| const isInit = step === Step.init | |||
| const isCrawlFinished = step === Step.finished | |||
| const isRunning = step === Step.running | |||
| const [crawlResult, setCrawlResult] = useState<{ | |||
| current: number | |||
| total: number | |||
| data: CrawlResultItem[] | |||
| time_consuming: number | string | |||
| } | undefined>(undefined) | |||
| const [crawlErrorMessage, setCrawlErrorMessage] = useState('') | |||
| const showError = isCrawlFinished && crawlErrorMessage | |||
// Poll the Jina Reader crawl job identified by `jobId` until it reaches a terminal state.
//
// Resolves with:
//   - `{ isError: false, data }` once the job reports `completed` (`data.total` is capped
//     at the user-configured limit);
//   - `{ isError: true, errorMessage, data: { data: [] } }` when the job reports `failed`,
//     reports no status at all, or the status request itself rejects.
// While the job is still running, the progress state and the default selection are
// refreshed and the job is polled again after 2.5 seconds.
const waitForCrawlFinished = useCallback(async (jobId: string) => {
  // Never display a total larger than the limit the user asked for.
  const capTotal = (total: number) => Math.min(total, parseFloat(crawlOptions.limit as string))
  try {
    const res = await checkJinaReaderTaskStatus(jobId) as any
    if (res.status === 'completed') {
      return {
        isError: false,
        data: {
          ...res,
          total: capTotal(res.total),
        },
      }
    }
    if (res.status === 'failed' || !res.status) {
      return {
        isError: true,
        errorMessage: res.message,
        data: {
          data: [],
        },
      }
    }
    // update the progress
    setCrawlResult({
      ...res,
      total: capTotal(res.total),
    })
    onCheckedCrawlResultChange(res.data || []) // default select the crawl result
    await sleep(2500)
    return await waitForCrawlFinished(jobId)
  }
  catch (e: any) {
    // The rejection is presumably a Response-like object exposing `.json()` — TODO confirm
    // against the request helper. Anything else (e.g. a network error) has no `.json()`,
    // so fall back to its `message` instead of throwing from inside the handler.
    let errorMessage: string | undefined
    try {
      errorMessage = typeof e?.json === 'function'
        ? (await e.json())?.message
        : e?.message
    }
    catch {
      // The error body could not be parsed; the caller substitutes its own default message.
      errorMessage = undefined
    }
    return {
      isError: true,
      errorMessage,
      data: {
        data: [],
      },
    }
  }
}, [crawlOptions.limit, onCheckedCrawlResultChange])
// Validate the input, start a Jina Reader task for `url`, and publish its outcome.
//
// An invalid input only raises an error toast. Otherwise the step moves to `running`,
// the task is created, and one of three things happens:
//   - the response carries `data`: it is wrapped as a one-item crawl result;
//   - the response carries `job_id`: the job is polled until it finishes;
//   - the response carries neither: an "unknown error" message is shown.
// The step always ends at `finished`, whether the run succeeded or not.
const handleRun = useCallback(async (url: string) => {
  const { isValid, errorMsg } = checkValid(url)
  if (!isValid) {
    Toast.notify({
      message: errorMsg!,
      type: 'error',
    })
    return
  }

  setStep(Step.running)
  try {
    const startTime = Date.now()
    const res = await createJinaReaderTask({
      url,
      options: crawlOptions,
    }) as any

    if (res.data) {
      // The page came back inline, so there is no job to poll.
      const data = {
        current: 1,
        total: 1,
        data: [{
          title: res.data.title,
          markdown: res.data.content,
          description: res.data.description,
          source_url: res.data.url,
        }],
        time_consuming: (Date.now() - startTime) / 1000,
      }
      setCrawlResult(data)
      onCheckedCrawlResultChange(data.data || [])
      setCrawlErrorMessage('')
    }
    else if (res.job_id) {
      const jobId = res.job_id
      onJobIdChange(jobId)
      const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
      if (isError) {
        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
      }
      else {
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
        setCrawlErrorMessage('')
      }
    }
    else {
      // Neither inline data nor a job id: report it instead of finishing silently
      // with whatever result or error was left over from a previous run.
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
    }
  }
  catch (e) {
    setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
    console.error(e)
  }
  finally {
    setStep(Step.finished)
  }
}, [checkValid, crawlOptions, onCheckedCrawlResultChange, onJobIdChange, t, waitForCrawlFinished])
| return ( | |||
| <div> | |||
| <Header onSetting={handleSetting} /> | |||
| <div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}> | |||
| <UrlInput onRun={handleRun} isRunning={isRunning} /> | |||
| <OptionsWrap | |||
| className={cn('mt-4')} | |||
| controlFoldOptions={controlFoldOptions} | |||
| > | |||
| <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} /> | |||
| </OptionsWrap> | |||
| {!isInit && ( | |||
| <div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'> | |||
| {isRunning | |||
| && <Crawling | |||
| className='mt-2' | |||
| crawledNum={crawlResult?.current || 0} | |||
| totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0} | |||
| />} | |||
| {showError && ( | |||
| <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} /> | |||
| )} | |||
| {isCrawlFinished && !showError | |||
| && <CrawledResult | |||
| className='mb-2' | |||
| list={crawlResult?.data || []} | |||
| checkedList={checkedCrawlResult} | |||
| onSelectedChange={onCheckedCrawlResultChange} | |||
| onPreview={onPreview} | |||
| usedTime={parseFloat(crawlResult?.time_consuming as string) || 0} | |||
| /> | |||
| } | |||
| </div> | |||
| )} | |||
| </div> | |||
| </div> | |||
| ) | |||
| } | |||
| export default React.memo(JinaReader) | |||
| @@ -0,0 +1,59 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import React, { useCallback } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import CheckboxWithLabel from '../base/checkbox-with-label' | |||
| import Field from '../base/field' | |||
| import cn from '@/utils/classnames' | |||
| import type { CrawlOptions } from '@/models/datasets' | |||
| const I18N_PREFIX = 'datasetCreation.stepOne.website' | |||
| type Props = { | |||
| className?: string | |||
| payload: CrawlOptions | |||
| onChange: (payload: CrawlOptions) => void | |||
| } | |||
// Crawl-option form: two checkboxes (crawl sub-pages, use sitemap) and a required
// numeric "limit" field. Every edit is reported through `onChange` as a full copy of
// `payload` with the edited key replaced.
const Options: FC<Props> = (props) => {
  const { className = '', payload, onChange } = props
  const { t } = useTranslation()

  // Builds the change handler for a single option key.
  const handleChange = useCallback(
    (key: keyof CrawlOptions) => (value: any) => {
      const nextPayload = { ...payload, [key]: value }
      onChange(nextPayload)
    },
    [payload, onChange],
  )

  const subPageCheckbox = (
    <CheckboxWithLabel
      label={t(`${I18N_PREFIX}.crawlSubPage`)}
      isChecked={payload.crawl_sub_pages}
      onChange={handleChange('crawl_sub_pages')}
    />
  )

  const sitemapCheckbox = (
    <CheckboxWithLabel
      label={t(`${I18N_PREFIX}.useSitemap`)}
      isChecked={payload.use_sitemap}
      onChange={handleChange('use_sitemap')}
      tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
    />
  )

  return (
    <div className={cn(className, ' space-y-2')}>
      {subPageCheckbox}
      {sitemapCheckbox}
      <div className='flex justify-between space-x-4'>
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
      </div>
    </div>
  )
}
| export default React.memo(Options) | |||
| @@ -2,35 +2,56 @@ | |||
| import type { FC } from 'react' | |||
| import React from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import s from './index.module.css' | |||
| import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' | |||
| import Button from '@/app/components/base/button' | |||
| import { DataSourceProvider } from '@/models/common' | |||
| const I18N_PREFIX = 'datasetCreation.stepOne.website' | |||
| type Props = { | |||
| onConfig: () => void | |||
| provider: DataSourceProvider | |||
| } | |||
| const NoData: FC<Props> = ({ | |||
| onConfig, | |||
| provider, | |||
| }) => { | |||
| const { t } = useTranslation() | |||
| const providerConfig = { | |||
| [DataSourceProvider.jinaReader]: { | |||
| emoji: <span className={s.jinaLogo} />, | |||
| title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), | |||
| description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), | |||
| }, | |||
| [DataSourceProvider.fireCrawl]: { | |||
| emoji: '🔥', | |||
| title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), | |||
| description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), | |||
| }, | |||
| } | |||
| const currentProvider = providerConfig[provider] | |||
| return ( | |||
| <div className='max-w-[640px] p-6 rounded-2xl bg-gray-50'> | |||
| <div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'> | |||
| 🔥 | |||
| </div> | |||
| <div className='my-2'> | |||
| <span className='text-gray-700 font-semibold'>{t(`${I18N_PREFIX}.fireCrawlNotConfigured`)}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span> | |||
| <div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'> | |||
| {t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)} | |||
| <> | |||
| <div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'> | |||
| <div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'> | |||
| {currentProvider.emoji} | |||
| </div> | |||
| <div className='my-2'> | |||
| <span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span> | |||
| <div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'> | |||
| {currentProvider.description} | |||
| </div> | |||
| </div> | |||
| <Button variant='primary' onClick={onConfig}> | |||
| {t(`${I18N_PREFIX}.configure`)} | |||
| </Button> | |||
| </div> | |||
| <Button variant='primary' onClick={onConfig}> | |||
| {t(`${I18N_PREFIX}.configure`)} | |||
| </Button> | |||
| </div> | |||
| </> | |||
| ) | |||
| } | |||
| export default React.memo(NoData) | |||
| @@ -9,7 +9,7 @@ import { | |||
| import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' | |||
| import Button from '@/app/components/base/button' | |||
| import type { FirecrawlConfig } from '@/models/common' | |||
| import Field from '@/app/components/datasets/create/website/firecrawl/base/field' | |||
| import Field from '@/app/components/datasets/create/website/base/field' | |||
| import Toast from '@/app/components/base/toast' | |||
| import { createDataSourceApiKeyBinding } from '@/service/datasets' | |||
| import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' | |||
| @@ -0,0 +1,140 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import React, { useCallback, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import { | |||
| PortalToFollowElem, | |||
| PortalToFollowElemContent, | |||
| } from '@/app/components/base/portal-to-follow-elem' | |||
| import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' | |||
| import Button from '@/app/components/base/button' | |||
| import { DataSourceProvider } from '@/models/common' | |||
| import Field from '@/app/components/datasets/create/website/base/field' | |||
| import Toast from '@/app/components/base/toast' | |||
| import { createDataSourceApiKeyBinding } from '@/service/datasets' | |||
| import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' | |||
| type Props = { | |||
| onCancel: () => void | |||
| onSaved: () => void | |||
| } | |||
| const I18N_PREFIX = 'datasetCreation.jinaReader' | |||
// Modal for entering the Jina Reader API key.
//
// Props:
//   - onCancel: called when the user presses "Cancel".
//   - onSaved:  called after the key has been stored successfully.
const ConfigJinaReaderModal: FC<Props> = ({
  onCancel,
  onSaved,
}) => {
  const { t } = useTranslation()
  const [isSaving, setIsSaving] = useState(false)
  const [apiKey, setApiKey] = useState('')

  // Validate the key, store it as a bearer credential for the Jina Reader website
  // provider, then notify the parent. Does nothing while a save is already in flight.
  const handleSave = useCallback(async () => {
    if (isSaving)
      return

    // The API key is the only field and it is mandatory.
    if (!apiKey) {
      Toast.notify({
        type: 'error',
        message: t('common.errorMsg.fieldRequired', {
          field: 'API Key',
        }),
      })
      return
    }

    const postData = {
      category: 'website',
      provider: DataSourceProvider.jinaReader,
      credentials: {
        auth_type: 'bearer',
        config: {
          api_key: apiKey,
        },
      },
    }
    try {
      setIsSaving(true)
      await createDataSourceApiKeyBinding(postData)
      Toast.notify({
        type: 'success',
        message: t('common.api.success'),
      })
    }
    finally {
      setIsSaving(false)
    }
    // Reached only on success: a rejected request propagates out of the block above.
    onSaved()
  }, [apiKey, onSaved, t, isSaving])

  return (
    <PortalToFollowElem open>
      <PortalToFollowElemContent className='w-full h-full z-[60]'>
        <div className='fixed inset-0 flex items-center justify-center bg-black/[.25]'>
          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-white shadow-xl rounded-2xl overflow-y-auto'>
            <div className='px-8 pt-8'>
              <div className='flex justify-between items-center mb-4'>
                <div className='text-xl font-semibold text-gray-900'>{t(`${I18N_PREFIX}.configJinaReader`)}</div>
              </div>
              <div className='space-y-4'>
                <Field
                  label='API Key'
                  labelClassName='!text-sm'
                  isRequired
                  value={apiKey}
                  onChange={(value: string | number) => setApiKey(value as string)}
                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
                />
              </div>
              <div className='my-8 flex justify-between items-center h-8'>
                {/* rel='noopener noreferrer' keeps the new tab from reaching window.opener, matching the link below */}
                <a
                  className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-[#155EEF]'
                  target='_blank' rel='noopener noreferrer'
                  href='https://jina.ai/reader/'
                >
                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
                  <LinkExternal02 className='w-3 h-3' />
                </a>
                <div className='flex'>
                  <Button
                    size='large'
                    className='mr-2'
                    onClick={onCancel}
                  >
                    {t('common.operation.cancel')}
                  </Button>
                  <Button
                    variant='primary'
                    size='large'
                    onClick={handleSave}
                    loading={isSaving}
                  >
                    {t('common.operation.save')}
                  </Button>
                </div>
              </div>
            </div>
            <div className='border-t-[0.5px] border-t-black/5'>
              <div className='flex justify-center items-center py-3 bg-gray-50 text-xs text-gray-500'>
                <Lock01 className='mr-1 w-3 h-3 text-gray-500' />
                {t('common.modelProvider.encrypted.front')}
                <a
                  className='text-primary-600 mx-1'
                  target='_blank' rel='noopener noreferrer'
                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
                >
                  PKCS1_OAEP
                </a>
                {t('common.modelProvider.encrypted.back')}
              </div>
            </div>
          </div>
        </div>
      </PortalToFollowElemContent>
    </PortalToFollowElem>
  )
}
| export default React.memo(ConfigJinaReaderModal) | |||
| @@ -2,11 +2,12 @@ | |||
| import type { FC } from 'react' | |||
| import React, { useCallback, useEffect, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import { useBoolean } from 'ahooks' | |||
| import Panel from '../panel' | |||
| import { DataSourceType } from '../panel/types' | |||
| import ConfigFirecrawlModal from './config-firecrawl-modal' | |||
| import ConfigJinaReaderModal from './config-jina-reader-modal' | |||
| import cn from '@/utils/classnames' | |||
| import s from '@/app/components/datasets/create/website/index.module.css' | |||
| import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets' | |||
| import type { | |||
| @@ -19,9 +20,11 @@ import { | |||
| } from '@/models/common' | |||
| import Toast from '@/app/components/base/toast' | |||
| type Props = {} | |||
| type Props = { | |||
| provider: DataSourceProvider | |||
| } | |||
| const DataSourceWebsite: FC<Props> = () => { | |||
| const DataSourceWebsite: FC<Props> = ({ provider }) => { | |||
| const { t } = useTranslation() | |||
| const { isCurrentWorkspaceManager } = useAppContext() | |||
| const [sources, setSources] = useState<DataSourceItem[]>([]) | |||
| @@ -36,22 +39,26 @@ const DataSourceWebsite: FC<Props> = () => { | |||
| // eslint-disable-next-line react-hooks/exhaustive-deps | |||
| }, []) | |||
| const [isShowConfig, { | |||
| setTrue: showConfig, | |||
| setFalse: hideConfig, | |||
| }] = useBoolean(false) | |||
| const [configTarget, setConfigTarget] = useState<DataSourceProvider | null>(null) | |||
| const showConfig = useCallback((provider: DataSourceProvider) => { | |||
| setConfigTarget(provider) | |||
| }, [setConfigTarget]) | |||
| const hideConfig = useCallback(() => { | |||
| setConfigTarget(null) | |||
| }, [setConfigTarget]) | |||
| const handleAdded = useCallback(() => { | |||
| checkSetApiKey() | |||
| hideConfig() | |||
| }, [checkSetApiKey, hideConfig]) | |||
| const getIdByProvider = (provider: string): string | undefined => { | |||
| const getIdByProvider = (provider: DataSourceProvider): string | undefined => { | |||
| const source = sources.find(item => item.provider === provider) | |||
| return source?.id | |||
| } | |||
| const handleRemove = useCallback((provider: string) => { | |||
| const handleRemove = useCallback((provider: DataSourceProvider) => { | |||
| return async () => { | |||
| const dataSourceId = getIdByProvider(provider) | |||
| if (dataSourceId) { | |||
| @@ -69,22 +76,34 @@ const DataSourceWebsite: FC<Props> = () => { | |||
| <> | |||
| <Panel | |||
| type={DataSourceType.website} | |||
| isConfigured={sources.length > 0} | |||
| onConfigure={showConfig} | |||
| provider={provider} | |||
| isConfigured={sources.find(item => item.provider === provider) !== undefined} | |||
| onConfigure={() => showConfig(provider)} | |||
| readOnly={!isCurrentWorkspaceManager} | |||
| configuredList={sources.map(item => ({ | |||
| configuredList={sources.filter(item => item.provider === provider).map(item => ({ | |||
| id: item.id, | |||
| logo: ({ className }: { className: string }) => ( | |||
| <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div> | |||
| item.provider === DataSourceProvider.fireCrawl | |||
| ? ( | |||
| <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}>🔥</div> | |||
| ) | |||
| : ( | |||
| <div className={cn(className, 'flex items-center justify-center w-5 h-5 bg-white border border-gray-100 text-xs font-medium text-gray-500 rounded ml-3')}> | |||
| <span className={s.jinaLogo} /> | |||
| </div> | |||
| ) | |||
| ), | |||
| name: 'Firecrawl', | |||
| name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader', | |||
| isActive: true, | |||
| }))} | |||
| onRemove={handleRemove(DataSourceProvider.fireCrawl)} | |||
| onRemove={handleRemove(provider)} | |||
| /> | |||
| {isShowConfig && ( | |||
| {configTarget === DataSourceProvider.fireCrawl && ( | |||
| <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} /> | |||
| )} | |||
| {configTarget === DataSourceProvider.jinaReader && ( | |||
| <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} /> | |||
| )} | |||
| </> | |||
| ) | |||
| @@ -3,6 +3,7 @@ import { useTranslation } from 'react-i18next' | |||
| import DataSourceNotion from './data-source-notion' | |||
| import DataSourceWebsite from './data-source-website' | |||
| import { fetchDataSource } from '@/service/common' | |||
| import { DataSourceProvider } from '@/models/common' | |||
| export default function DataSourcePage() { | |||
| const { t } = useTranslation() | |||
| @@ -13,7 +14,8 @@ export default function DataSourcePage() { | |||
| <div className='mb-8'> | |||
| <div className='mb-2 text-sm font-medium text-gray-900'>{t('common.dataSource.add')}</div> | |||
| <DataSourceNotion workspaces={notionWorkspaces} /> | |||
| <DataSourceWebsite /> | |||
| <DataSourceWebsite provider={DataSourceProvider.jinaReader} /> | |||
| <DataSourceWebsite provider={DataSourceProvider.fireCrawl} /> | |||
| </div> | |||
| ) | |||
| } | |||
| @@ -8,10 +8,12 @@ import ConfigItem from './config-item' | |||
| import s from './style.module.css' | |||
| import { DataSourceType } from './types' | |||
| import { DataSourceProvider } from '@/models/common' | |||
| import cn from '@/utils/classnames' | |||
| type Props = { | |||
| type: DataSourceType | |||
| provider: DataSourceProvider | |||
| isConfigured: boolean | |||
| onConfigure: () => void | |||
| readOnly: boolean | |||
| @@ -25,6 +27,7 @@ type Props = { | |||
| const Panel: FC<Props> = ({ | |||
| type, | |||
| provider, | |||
| isConfigured, | |||
| onConfigure, | |||
| readOnly, | |||
| @@ -46,7 +49,7 @@ const Panel: FC<Props> = ({ | |||
| <div className='text-sm font-medium text-gray-800'>{t(`common.dataSource.${type}.title`)}</div> | |||
| {isWebsite && ( | |||
| <div className='ml-1 leading-[18px] px-1.5 rounded-md bg-white border border-gray-100 text-xs font-medium text-gray-700'> | |||
| <span className='text-gray-500'>{t('common.dataSource.website.with')}</span> 🔥 Firecrawl | |||
| <span className='text-gray-500'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'} | |||
| </div> | |||
| )} | |||
| </div> | |||
| @@ -16,6 +16,11 @@ const translation = { | |||
| apiKeyPlaceholder: 'API key from firecrawl.dev', | |||
| getApiKeyLinkText: 'Get your API key from firecrawl.dev', | |||
| }, | |||
| jinaReader: { | |||
| configJinaReader: 'Configure Jina Reader', | |||
| apiKeyPlaceholder: 'API key from jina.ai', | |||
| getApiKeyLinkText: 'Get your free API key at jina.ai', | |||
| }, | |||
| stepOne: { | |||
| filePreview: 'File Preview', | |||
| pagePreview: 'Page Preview', | |||
| @@ -56,13 +61,21 @@ const translation = { | |||
| failed: 'Creation failed', | |||
| }, | |||
| website: { | |||
| chooseProvider: 'Select a provider', | |||
| fireCrawlNotConfigured: 'Firecrawl is not configured', | |||
| fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.', | |||
| jinaReaderNotConfigured: 'Jina Reader is not configured', | |||
| jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.', | |||
| configure: 'Configure', | |||
| run: 'Run', | |||
| firecrawlTitle: 'Extract web content with 🔥Firecrawl', | |||
| firecrawlDoc: 'Firecrawl docs', | |||
| firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', | |||
| jinaReaderTitle: 'Convert the entire site to Markdown', | |||
| jinaReaderDoc: 'Learn more about Jina Reader', | |||
| jinaReaderDocLink: 'https://jina.ai/reader', | |||
| useSitemap: 'Use sitemap', | |||
| useSitemapTooltip: 'Follow the sitemap to crawl the site. If not, Jina Reader will crawl iteratively based on page relevance, yielding fewer but higher-quality pages.', | |||
| options: 'Options', | |||
| crawlSubPage: 'Crawl sub-pages', | |||
| limit: 'Limit', | |||
| @@ -70,7 +83,7 @@ const translation = { | |||
| excludePaths: 'Exclude paths', | |||
| includeOnlyPaths: 'Include only paths', | |||
| extractOnlyMainContent: 'Extract only main content (no headers, navs, footers, etc.)', | |||
| exceptionErrorTitle: 'An exception occurred while running Firecrawl job:', | |||
| exceptionErrorTitle: 'An exception occurred while running crawling job:', | |||
| unknownError: 'Unknown error', | |||
| totalPageScraped: 'Total pages scraped:', | |||
| selectAll: 'Select All', | |||
| @@ -16,6 +16,11 @@ const translation = { | |||
| apiKeyPlaceholder: '从 firecrawl.dev 获取 API Key', | |||
| getApiKeyLinkText: '从 firecrawl.dev 获取您的 API Key', | |||
| }, | |||
| jinaReader: { | |||
| configJinaReader: '配置 Jina Reader', | |||
| apiKeyPlaceholder: '从 jina.ai 获取 API Key', | |||
| getApiKeyLinkText: '从 jina.ai 获取您的免费 API Key', | |||
| }, | |||
| stepOne: { | |||
| filePreview: '文件预览', | |||
| pagePreview: '页面预览', | |||
| @@ -56,13 +61,21 @@ const translation = { | |||
| failed: '创建失败', | |||
| }, | |||
| website: { | |||
| chooseProvider: '选择工具', | |||
| fireCrawlNotConfigured: 'Firecrawl 未配置', | |||
| fireCrawlNotConfiguredDescription: '请配置 Firecrawl 的 API 密钥以使用它。', | |||
| jinaReaderNotConfigured: 'Jina Reader 未配置', | |||
| jinaReaderNotConfiguredDescription: '请配置 Jina Reader 的免费 API 密钥以访问它。', | |||
| configure: '配置', | |||
| run: '运行', | |||
| firecrawlTitle: '使用 🔥Firecrawl 提取网页内容', | |||
| firecrawlDoc: 'Firecrawl 文档', | |||
| firecrawlDocLink: 'https://docs.dify.ai/v/zh-hans/guides/knowledge-base/sync-from-website', | |||
| jinaReaderTitle: '将整个站点内容转换为 Markdown 格式', | |||
| jinaReaderDoc: '了解更多关于 Jina Reader', | |||
| jinaReaderDocLink: 'https://jina.ai/reader', | |||
| useSitemap: '使用 sitemap', | |||
| useSitemapTooltip: '根据 sitemap 爬取站点。否则,Jina Reader 将基于页面相关性迭代爬取,抓取较少的页面,但质量更高。', | |||
| options: '选项', | |||
| crawlSubPage: '爬取子页面', | |||
| limit: '限制数量', | |||
| @@ -70,7 +83,7 @@ const translation = { | |||
| excludePaths: '排除路径', | |||
| includeOnlyPaths: '仅包含路径', | |||
| extractOnlyMainContent: '仅提取主要内容(无标题、导航、页脚等)', | |||
| exceptionErrorTitle: '运行 Firecrawl 时发生异常:', | |||
| exceptionErrorTitle: '运行时发生异常:', | |||
| unknownError: '未知错误', | |||
| totalPageScraped: '抓取页面总数:', | |||
| selectAll: '全选', | |||
| @@ -177,6 +177,7 @@ export enum DataSourceCategory { | |||
| } | |||
| export enum DataSourceProvider { | |||
| fireCrawl = 'firecrawl', | |||
| jinaReader = 'jinareader', | |||
| } | |||
| export type FirecrawlConfig = { | |||
| @@ -49,6 +49,7 @@ export type CrawlOptions = { | |||
| excludes: string | |||
| limit: number | string | |||
| max_depth: number | string | |||
| use_sitemap: boolean | |||
| } | |||
| export type CrawlResultItem = { | |||
| @@ -23,7 +23,7 @@ import type { | |||
| SegmentsResponse, | |||
| createDocumentResponse, | |||
| } from '@/models/datasets' | |||
| import type { CommonResponse, DataSourceNotionWorkspace } from '@/models/common' | |||
| import { type CommonResponse, type DataSourceNotionWorkspace, DataSourceProvider } from '@/models/common' | |||
| import type { | |||
| ApiKeysListResponse, | |||
| CreateApiKeyResponse, | |||
| @@ -253,7 +253,7 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> = | |||
| return post<CommonResponse>('website/crawl', { | |||
| body: { | |||
| ...body, | |||
| provider: 'firecrawl', | |||
| provider: DataSourceProvider.fireCrawl, | |||
| }, | |||
| }) | |||
| } | |||
| @@ -261,7 +261,26 @@ export const createFirecrawlTask: Fetcher<CommonResponse, Record<string, any>> = | |||
| export const checkFirecrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => { | |||
| return get<CommonResponse>(`website/crawl/status/${jobId}`, { | |||
| params: { | |||
| provider: 'firecrawl', | |||
| provider: DataSourceProvider.fireCrawl, | |||
| }, | |||
| }, { | |||
| silent: true, | |||
| }) | |||
| } | |||
// Start a Jina Reader crawl: POSTs the caller's fields to `website/crawl` with
// `provider` set to the Jina Reader identifier (overriding any `provider` passed in).
export const createJinaReaderTask: Fetcher<CommonResponse, Record<string, any>> = (payload) => {
  const body = {
    ...payload,
    provider: DataSourceProvider.jinaReader,
  }
  return post<CommonResponse>('website/crawl', { body })
}
| export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => { | |||
| return get<CommonResponse>(`website/crawl/status/${jobId}`, { | |||
| params: { | |||
| provider: 'jinareader', | |||
| }, | |||
| }, { | |||
| silent: true, | |||