浏览代码

feat: add switches for jina firecrawl watercrawl (#18153)

tags/1.3.0
crazywoola 6 个月前
父节点
当前提交
e1455cecd8
没有帐户链接到提交者的电子邮件

+ 6
- 0
docker/.env.example 查看文件

API_TOOL_DEFAULT_CONNECT_TIMEOUT=10 API_TOOL_DEFAULT_CONNECT_TIMEOUT=10
API_TOOL_DEFAULT_READ_TIMEOUT=60 API_TOOL_DEFAULT_READ_TIMEOUT=60


# -------------------------------
# Datasource Configuration
# --------------------------------
ENABLE_WEBSITE_JINAREADER=true
ENABLE_WEBSITE_FIRECRAWL=true
ENABLE_WEBSITE_WATERCRAWL=true


# ------------------------------ # ------------------------------
# Database Configuration # Database Configuration

+ 3
- 1
docker/docker-compose-template.yaml 查看文件

MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}

ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database. # The postgres database.
db: db:
image: postgres:15-alpine image: postgres:15-alpine

+ 6
- 1
docker/docker-compose.yaml 查看文件

CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-} CELERY_MIN_WORKERS: ${CELERY_MIN_WORKERS:-}
API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10} API_TOOL_DEFAULT_CONNECT_TIMEOUT: ${API_TOOL_DEFAULT_CONNECT_TIMEOUT:-10}
API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60} API_TOOL_DEFAULT_READ_TIMEOUT: ${API_TOOL_DEFAULT_READ_TIMEOUT:-60}
ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
DB_USERNAME: ${DB_USERNAME:-postgres} DB_USERNAME: ${DB_USERNAME:-postgres}
DB_PASSWORD: ${DB_PASSWORD:-difyai123456} DB_PASSWORD: ${DB_PASSWORD:-difyai123456}
DB_HOST: ${DB_HOST:-db} DB_HOST: ${DB_HOST:-db}
MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10} MAX_TOOLS_NUM: ${MAX_TOOLS_NUM:-10}
MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10} MAX_PARALLEL_LIMIT: ${MAX_PARALLEL_LIMIT:-10}
MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5} MAX_ITERATIONS_NUM: ${MAX_ITERATIONS_NUM:-5}

ENABLE_WEBSITE_JINAREADER: ${ENABLE_WEBSITE_JINAREADER:-true}
ENABLE_WEBSITE_FIRECRAWL: ${ENABLE_WEBSITE_FIRECRAWL:-true}
ENABLE_WEBSITE_WATERCRAWL: ${ENABLE_WEBSITE_WATERCRAWL:-true}
# The postgres database. # The postgres database.
db: db:
image: postgres:15-alpine image: postgres:15-alpine

+ 5
- 0
web/.env.example 查看文件



# The maximum number of iterations for agent setting # The maximum number of iterations for agent setting
NEXT_PUBLIC_MAX_ITERATIONS_NUM=5 NEXT_PUBLIC_MAX_ITERATIONS_NUM=5

NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=true
NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=true
NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=true


+ 7
- 7
web/app/components/datasets/create/step-one/index.tsx 查看文件

import VectorSpaceFull from '@/app/components/billing/vector-space-full' import VectorSpaceFull from '@/app/components/billing/vector-space-full'
import classNames from '@/utils/classnames' import classNames from '@/utils/classnames'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'
type IStepOneProps = { type IStepOneProps = {
datasetId?: string datasetId?: string
dataSourceType?: DataSourceType dataSourceType?: DataSourceType
return true return true
if (files.some(file => !file.file.id)) if (files.some(file => !file.file.id))
return true return true
if (isShowVectorSpaceFull)
return true
return false
return isShowVectorSpaceFull
}, [files, isShowVectorSpaceFull]) }, [files, isShowVectorSpaceFull])


return ( return (
{t('datasetCreation.stepOne.dataSourceType.notion')} {t('datasetCreation.stepOne.dataSourceType.notion')}
</span> </span>
</div> </div>
<div
{(ENABLE_WEBSITE_FIRECRAWL || ENABLE_WEBSITE_JINAREADER || ENABLE_WEBSITE_WATERCRAWL) && (
<div
className={cn( className={cn(
s.dataSourceItem, s.dataSourceItem,
'system-sm-medium', 'system-sm-medium',
dataSourceTypeDisable && dataSourceType !== DataSourceType.WEB && s.disabled, dataSourceTypeDisable && dataSourceType !== DataSourceType.WEB && s.disabled,
)} )}
onClick={() => changeType(DataSourceType.WEB)} onClick={() => changeType(DataSourceType.WEB)}
>
>
<span className={cn(s.datasetIcon, s.web)} /> <span className={cn(s.datasetIcon, s.web)} />
<span <span
title={t('datasetCreation.stepOne.dataSourceType.web')} title={t('datasetCreation.stepOne.dataSourceType.web')}
> >
{t('datasetCreation.stepOne.dataSourceType.web')} {t('datasetCreation.stepOne.dataSourceType.web')}
</span> </span>
</div>
</div>
)}
</div> </div>
) )
} }

+ 7
- 6
web/app/components/datasets/create/website/index.tsx 查看文件

import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets' import { fetchDataSources } from '@/service/datasets'
import { type DataSourceItem, DataSourceProvider } from '@/models/common' import { type DataSourceItem, DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'


type Props = { type Props = {
onPreview: (payload: CrawlResultItem) => void onPreview: (payload: CrawlResultItem) => void
{t('datasetCreation.stepOne.website.chooseProvider')} {t('datasetCreation.stepOne.website.chooseProvider')}
</div> </div>
<div className="flex space-x-2"> <div className="flex space-x-2">
<button
{ENABLE_WEBSITE_JINAREADER && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2', className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.jinaReader selectedProvider === DataSourceProvider.jinaReader
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
> >
<span className={cn(s.jinaLogo, 'mr-2')}/> <span className={cn(s.jinaLogo, 'mr-2')}/>
<span>Jina Reader</span> <span>Jina Reader</span>
</button>
<button
</button>}
{ENABLE_WEBSITE_FIRECRAWL && <button
className={cn('rounded-lg px-4 py-2', className={cn('rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.fireCrawl selectedProvider === DataSourceProvider.fireCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)} onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
> >
🔥 Firecrawl 🔥 Firecrawl
</button>
<button
</button>}
{ENABLE_WEBSITE_WATERCRAWL && <button
className={cn('flex items-center justify-center rounded-lg px-4 py-2', className={cn('flex items-center justify-center rounded-lg px-4 py-2',
selectedProvider === DataSourceProvider.waterCrawl selectedProvider === DataSourceProvider.waterCrawl
? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary' ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
> >
<span className={cn(s.watercrawlLogo, 'mr-2')}/> <span className={cn(s.watercrawlLogo, 'mr-2')}/>
<span>WaterCrawl</span> <span>WaterCrawl</span>
</button>
</button>}
</div> </div>
</div> </div>
{source && selectedProvider === DataSourceProvider.fireCrawl && ( {source && selectedProvider === DataSourceProvider.fireCrawl && (

+ 11
- 9
web/app/components/datasets/create/website/no-data.tsx 查看文件

import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others' import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button' import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common' import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'


const I18N_PREFIX = 'datasetCreation.stepOne.website' const I18N_PREFIX = 'datasetCreation.stepOne.website'




const NoData: FC<Props> = ({ const NoData: FC<Props> = ({
onConfig, onConfig,
provider,
}) => { }) => {
const { t } = useTranslation() const { t } = useTranslation()


const providerConfig = { const providerConfig = {
[DataSourceProvider.jinaReader]: {
[DataSourceProvider.jinaReader]: ENABLE_WEBSITE_JINAREADER ? {
emoji: <span className={s.jinaLogo} />, emoji: <span className={s.jinaLogo} />,
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`), title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`), description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
},
[DataSourceProvider.fireCrawl]: {
} : null,
[DataSourceProvider.fireCrawl]: ENABLE_WEBSITE_FIRECRAWL ? {
emoji: '🔥', emoji: '🔥',
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
},
[DataSourceProvider.waterCrawl]: {
emoji: <span className={s.watercrawlLogo} />,
} : null,
[DataSourceProvider.waterCrawl]: ENABLE_WEBSITE_WATERCRAWL ? {
emoji: '💧',
title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`), title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`), description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
},
} : null,
} }


const currentProvider = providerConfig[provider]
const currentProvider = Object.values(providerConfig).find(provider => provider !== null) || providerConfig[DataSourceProvider.jinaReader]

if (!currentProvider) return null


return ( return (
<> <>

+ 4
- 3
web/app/components/header/account-setting/data-source-page/index.tsx 查看文件

import DataSourceWebsite from './data-source-website' import DataSourceWebsite from './data-source-website'
import { fetchDataSource } from '@/service/common' import { fetchDataSource } from '@/service/common'
import { DataSourceProvider } from '@/models/common' import { DataSourceProvider } from '@/models/common'
import { ENABLE_WEBSITE_FIRECRAWL, ENABLE_WEBSITE_JINAREADER, ENABLE_WEBSITE_WATERCRAWL } from '@/config'


export default function DataSourcePage() { export default function DataSourcePage() {
const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource) const { data } = useSWR({ url: 'data-source/integrates' }, fetchDataSource)
return ( return (
<div className='mb-8'> <div className='mb-8'>
<DataSourceNotion workspaces={notionWorkspaces} /> <DataSourceNotion workspaces={notionWorkspaces} />
<DataSourceWebsite provider={DataSourceProvider.jinaReader} />
<DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
<DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
{ENABLE_WEBSITE_JINAREADER && <DataSourceWebsite provider={DataSourceProvider.jinaReader} />}
{ENABLE_WEBSITE_FIRECRAWL && <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />}
{ENABLE_WEBSITE_WATERCRAWL && <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />}
</div> </div>
) )
} }

+ 12
- 0
web/config/index.ts 查看文件

maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string) maxIterationsNum = Number.parseInt(globalThis.document.body.getAttribute('data-public-max-iterations-num') as string)


export const MAX_ITERATIONS_NUM = maxIterationsNum export const MAX_ITERATIONS_NUM = maxIterationsNum

export const ENABLE_WEBSITE_JINAREADER = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER === 'true'
: true

export const ENABLE_WEBSITE_FIRECRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL === 'true'
: true

export const ENABLE_WEBSITE_WATERCRAWL = process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL !== undefined
? process.env.NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL === 'true'
: true

+ 3
- 1
web/docker/entrypoint.sh 查看文件

export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE} export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM} export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM}

export NEXT_PUBLIC_ENABLE_WEBSITE_JINAREADER=${ENABLE_WEBSITE_JINAREADER:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_FIRECRAWL=${ENABLE_WEBSITE_FIRECRAWL:-true}
export NEXT_PUBLIC_ENABLE_WEBSITE_WATERCRAWL=${ENABLE_WEBSITE_WATERCRAWL:-true}
pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon

正在加载...
取消
保存