
feat: enhance website crawl functionality with state management and result handling

tags/2.0.0-beta.1
twwu, 4 months ago
Parent
Commit
6ba4a4c165

+ 13
- 2
web/app/components/datasets/documents/create-from-pipeline/hooks.ts View File

  import { BlockEnum, type Node } from '@/app/components/workflow/types'
  import type { DataSourceNodeType } from '@/app/components/workflow/nodes/data-source/types'
  import type { DatasourceType } from '@/models/pipeline'
- import type { CrawlResultItem, DocumentItem, FileItem } from '@/models/datasets'
+ import type { CrawlResult, CrawlResultItem, DocumentItem, FileItem } from '@/models/datasets'
+ import { CrawlStep } from '@/models/datasets'
  import produce from 'immer'
  import type { NotionPage } from '@/models/common'

  export const useWebsiteCrawl = () => {
    const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
    const [currentWebsite, setCurrentWebsite] = useState<CrawlResultItem | undefined>()
+   const [crawlResult, setCrawlResult] = useState<CrawlResult | undefined>()
+   const [step, setStep] = useState<CrawlStep>(CrawlStep.init)
+   const [previewIndex, setPreviewIndex] = useState<number>(-1)

    const previewWebsitePage = useRef<CrawlResultItem>(websitePages[0])

-   const updateCurrentWebsite = useCallback((website: CrawlResultItem) => {
+   const updateCurrentWebsite = useCallback((website: CrawlResultItem, index: number) => {
      setCurrentWebsite(website)
+     setPreviewIndex(index)
    }, [])

    const hideWebsitePreview = useCallback(() => {
      setCurrentWebsite(undefined)
+     setPreviewIndex(-1)
    }, [])

    const updataCheckedCrawlResultChange = useCallback((checkedCrawlResult: CrawlResultItem[]) => {

    return {
      websitePages,
+     crawlResult,
+     setCrawlResult,
+     step,
+     setStep,
      previewWebsitePage,
      updataCheckedCrawlResultChange,
      currentWebsite,
      updateCurrentWebsite,
+     previewIndex,
      hideWebsitePreview,
    }
  }
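As a reading aid rather than code from this commit: a minimal sketch of a parent component consuming the crawl state that the hook now owns and passing it down to the website-crawl data source. The component name, the empty headerInfo values, and the import paths are assumptions based on the files touched in this diff.

// Hypothetical consumer of the lifted crawl state (not part of the commit).
import React from 'react'
import { useWebsiteCrawl } from '@/app/components/datasets/documents/create-from-pipeline/hooks'
import WebsiteCrawl from '@/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl'

const WebsiteCrawlSection = ({ nodeId }: { nodeId: string }) => {
  const {
    crawlResult,      // full crawl response, now owned by the hook
    setCrawlResult,
    step,             // CrawlStep.init | running | finished
    setStep,
    websitePages,     // checked crawl results
    updataCheckedCrawlResultChange,
    previewIndex,     // which row is being previewed (-1 = none)
    updateCurrentWebsite, // (item, index) => select an item for preview
  } = useWebsiteCrawl()

  return (
    <WebsiteCrawl
      nodeId={nodeId}
      headerInfo={{ title: '', docTitle: '', docLink: '' }} // placeholder values; field names follow the usage in this commit
      crawlResult={crawlResult}
      setCrawlResult={setCrawlResult}
      step={step}
      setStep={setStep}
      checkedCrawlResult={websitePages}
      onCheckedCrawlResultChange={updataCheckedCrawlResultChange}
      previewIndex={previewIndex}
      onPreview={updateCurrentWebsite}
    />
  )
}

export default WebsiteCrawlSection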

+ 112
- 96
web/app/components/datasets/documents/create-from-pipeline/index.tsx View File

  } = useOnlineDocuments()
  const {
    websitePages,
+   crawlResult,
+   setCrawlResult,
+   step,
+   setStep,
    previewWebsitePage,
    updataCheckedCrawlResultChange,
    currentWebsite,
    updateCurrentWebsite,
+   previewIndex,
    hideWebsitePreview,
  } = useWebsiteCrawl()

    <div
      className='relative flex h-[calc(100vh-56px)] w-full min-w-[1024px] overflow-x-auto rounded-t-2xl border-t border-effects-highlight bg-background-default-subtle'
    >
-     <div className='flex h-full flex-1 flex-col px-14'>
-       <LeftHeader
-         steps={steps}
-         title={t('datasetPipeline.addDocuments.title')}
-         currentStep={currentStep}
-       />
-       <div className='grow overflow-y-auto'>
-         {
-           currentStep === 1 && (
-             <div className='flex flex-col gap-y-5 pt-4'>
-               <DataSourceOptions
-                 datasourceNodeId={datasource?.nodeId || ''}
-                 onSelect={setDatasource}
-                 pipelineNodes={(pipelineInfo?.graph.nodes || []) as Node<DataSourceNodeType>[]}
-               />
-               {datasource?.type === DatasourceType.localFile && (
-                 <LocalFile
-                   files={fileList}
-                   allowedExtensions={datasource?.fileExtensions || []}
-                   updateFile={updateFile}
-                   updateFileList={updateFileList}
-                   onPreview={updateCurrentFile}
-                   notSupportBatchUpload={notSupportBatchUpload}
-                 />
-               )}
-               {datasource?.type === DatasourceType.onlineDocument && (
-                 <OnlineDocuments
-                   nodeId={datasource?.nodeId || ''}
-                   headerInfo={{
-                     title: datasource.description,
-                     docTitle: datasource.docTitle || '',
-                     docLink: datasource.docLink || '',
-                   }}
-                   onlineDocuments={onlineDocuments}
-                   updateOnlineDocuments={updateOnlineDocuments}
-                   canPreview
-                   onPreview={updateCurrentPage}
-                 />
-               )}
-               {datasource?.type === DatasourceType.websiteCrawl && (
-                 <WebsiteCrawl
-                   nodeId={datasource?.nodeId || ''}
-                   headerInfo={{
-                     title: datasource.description,
-                     docTitle: datasource.docTitle || '',
-                     docLink: datasource.docLink || '',
-                   }}
-                   checkedCrawlResult={websitePages}
-                   onCheckedCrawlResultChange={updataCheckedCrawlResultChange}
-                   onPreview={updateCurrentWebsite}
-                 />
-               )}
-               {isShowVectorSpaceFull && (
-                 <VectorSpaceFull />
-               )}
-               <Actions disabled={nextBtnDisabled} handleNextStep={handleNextStep} />
-             </div>
-           )
-         }
-         {
-           currentStep === 2 && (
-             <ProcessDocuments
-               ref={formRef}
-               dataSourceNodeId={datasource?.nodeId || ''}
-               onProcess={onClickProcess}
-               onPreview={onClickPreview}
-               onSubmit={handleSubmit}
-               onBack={handleBackStep}
-             />
-           )
-         }
-         {
-           currentStep === 3 && (
-             <Processing
-               batchId={batchId}
-               documents={documents}
-             />
-           )
-         }
+     <div className='h-full min-w-0 flex-1'>
+       <div className='flex h-full flex-col px-14'>
+         <LeftHeader
+           steps={steps}
+           title={t('datasetPipeline.addDocuments.title')}
+           currentStep={currentStep}
+         />
+         <div className='grow overflow-y-auto'>
+           {
+             currentStep === 1 && (
+               <div className='flex flex-col gap-y-5 pt-4'>
+                 <DataSourceOptions
+                   datasourceNodeId={datasource?.nodeId || ''}
+                   onSelect={setDatasource}
+                   pipelineNodes={(pipelineInfo?.graph.nodes || []) as Node<DataSourceNodeType>[]}
+                 />
+                 {datasource?.type === DatasourceType.localFile && (
+                   <LocalFile
+                     files={fileList}
+                     allowedExtensions={datasource?.fileExtensions || []}
+                     updateFile={updateFile}
+                     updateFileList={updateFileList}
+                     onPreview={updateCurrentFile}
+                     notSupportBatchUpload={notSupportBatchUpload}
+                   />
+                 )}
+                 {datasource?.type === DatasourceType.onlineDocument && (
+                   <OnlineDocuments
+                     nodeId={datasource?.nodeId || ''}
+                     headerInfo={{
+                       title: datasource.description,
+                       docTitle: datasource.docTitle || '',
+                       docLink: datasource.docLink || '',
+                     }}
+                     onlineDocuments={onlineDocuments}
+                     updateOnlineDocuments={updateOnlineDocuments}
+                     canPreview
+                     onPreview={updateCurrentPage}
+                   />
+                 )}
+                 {datasource?.type === DatasourceType.websiteCrawl && (
+                   <WebsiteCrawl
+                     nodeId={datasource?.nodeId || ''}
+                     headerInfo={{
+                       title: datasource.description,
+                       docTitle: datasource.docTitle || '',
+                       docLink: datasource.docLink || '',
+                     }}
+                     crawlResult={crawlResult}
+                     setCrawlResult={setCrawlResult}
+                     step={step}
+                     setStep={setStep}
+                     checkedCrawlResult={websitePages}
+                     onCheckedCrawlResultChange={updataCheckedCrawlResultChange}
+                     onPreview={updateCurrentWebsite}
+                     previewIndex={previewIndex}
+                   />
+                 )}
+                 {isShowVectorSpaceFull && (
+                   <VectorSpaceFull />
+                 )}
+                 <Actions disabled={nextBtnDisabled} handleNextStep={handleNextStep} />
+               </div>
+             )
+           }
+           {
+             currentStep === 2 && (
+               <ProcessDocuments
+                 ref={formRef}
+                 dataSourceNodeId={datasource?.nodeId || ''}
+                 onProcess={onClickProcess}
+                 onPreview={onClickPreview}
+                 onSubmit={handleSubmit}
+                 onBack={handleBackStep}
+               />
+             )
+           }
+           {
+             currentStep === 3 && (
+               <Processing
+                 batchId={batchId}
+                 documents={documents}
+               />
+             )
+           }
+         </div>
        </div>
      </div>
      {/* Preview */}
      {
        currentStep === 1 && (
-         <div className='flex h-full flex-1 pl-2 pt-2'>
-           {currentFile && <FilePreview file={currentFile} hidePreview={hideFilePreview} />}
-           {currentDocuments && <OnlineDocumentPreview currentPage={currentDocuments} hidePreview={hideOnlineDocumentPreview} />}
-           {currentWebsite && <WebsitePreview payload={currentWebsite} hidePreview={hideWebsitePreview} />}
+         <div className='h-full min-w-0 flex-1'>
+           <div className='flex h-full flex-col pl-2 pt-2'>
+             {currentFile && <FilePreview file={currentFile} hidePreview={hideFilePreview} />}
+             {currentDocuments && <OnlineDocumentPreview currentPage={currentDocuments} hidePreview={hideOnlineDocumentPreview} />}
+             {currentWebsite && <WebsitePreview payload={currentWebsite} hidePreview={hideWebsitePreview} />}
+           </div>
          </div>
        )
      }
      {
        currentStep === 2 && (
-         <div className='flex h-full flex-1 pl-2 pt-2'>
-           <ChunkPreview
-             dataSourceType={datasource!.type}
-             files={fileList.map(file => file.file)}
-             onlineDocuments={onlineDocuments}
-             websitePages={websitePages}
-             isIdle={isIdle}
-             isPending={isPending && isPreview.current}
-             estimateData={estimateData}
-             onPreview={onClickPreview}
-             handlePreviewFileChange={handlePreviewFileChange}
-             handlePreviewOnlineDocumentChange={handlePreviewOnlineDocumentChange}
-             handlePreviewWebsitePageChange={handlePreviewWebsiteChange}
-           />
+         <div className='h-full min-w-0 flex-1'>
+           <div className='flex h-full flex-col pl-2 pt-2'>
+             <ChunkPreview
+               dataSourceType={datasource!.type}
+               files={fileList.map(file => file.file)}
+               onlineDocuments={onlineDocuments}
+               websitePages={websitePages}
+               isIdle={isIdle}
+               isPending={isPending && isPreview.current}
+               estimateData={estimateData}
+               onPreview={onClickPreview}
+               handlePreviewFileChange={handlePreviewFileChange}
+               handlePreviewOnlineDocumentChange={handlePreviewOnlineDocumentChange}
+               handlePreviewWebsitePageChange={handlePreviewWebsiteChange}
+             />
+           </div>
          </div>
        )
      }

+ 13
- 8
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawled-result-item.tsx View File

      onCheckChange(!isChecked)
    }, [isChecked, onCheckChange])
    return (
-     <div className={cn('flex cursor-pointer gap-x-2 rounded-lg p-2', isPreview ? 'bg-state-base-active' : 'group hover:bg-state-base-hover')}>
+     <div className={cn(
+       'relative flex cursor-pointer gap-x-2 rounded-lg p-2',
+       isPreview ? 'bg-state-base-active' : 'group hover:bg-state-base-hover',
+     )}>
        <Checkbox
          className='shrink-0'
          checked={isChecked}
            {payload.source_url}
          </div>
        </div>
-       {showPreview && <Button
-         size='small'
-         onClick={onPreview}
-         className='system-xs-medium-uppercase right-0 top-0 hidden px-1.5 group-hover:absolute group-hover:block'
-       >
-         {t('datasetCreation.stepOne.website.preview')}
-       </Button>}
+       {showPreview && (
+         <Button
+           size='small'
+           onClick={onPreview}
+           className='system-xs-medium-uppercase right-2 top-2 hidden px-1.5 group-hover:absolute group-hover:block'
+         >
+           {t('datasetCreation.stepOne.website.preview')}
+         </Button>
+       )}
      </div>
    )
  }

+ 5
- 5
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawled-result.tsx View File

  'use client'
- import React, { useCallback, useState } from 'react'
+ import React, { useCallback } from 'react'
  import { useTranslation } from 'react-i18next'
  import cn from '@/utils/classnames'
  import type { CrawlResultItem } from '@/models/datasets'

  type CrawledResultProps = {
    className?: string
+   previewIndex?: number
    list: CrawlResultItem[]
    checkedList: CrawlResultItem[]
    onSelectedChange: (selected: CrawlResultItem[]) => void
-   onPreview?: (payload: CrawlResultItem) => void
+   onPreview?: (payload: CrawlResultItem, index: number) => void
    usedTime: number
  }

  const CrawledResult = ({
    className = '',
+   previewIndex,
    list,
    checkedList,
    onSelectedChange,
    onPreview,
  }: CrawledResultProps) => {
    const { t } = useTranslation()
-   const [previewIndex, setPreviewIndex] = useState<number>(-1)

    const isCheckAll = checkedList.length === list.length

    const handlePreview = useCallback((index: number) => {
      if (!onPreview) return
-     setPreviewIndex(index)
-     onPreview(list[index])
+     onPreview(list[index], index)
    }, [list, onPreview])

    return (

+ 33
- 25
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawler.tsx View File

  'use client'
  import React, { useCallback, useEffect, useRef, useState } from 'react'
  import { useTranslation } from 'react-i18next'
- import type { CrawlResultItem } from '@/models/datasets'
+ import type { CrawlResult, CrawlResultItem } from '@/models/datasets'
+ import { CrawlStep } from '@/models/datasets'
  import Header from '@/app/components/datasets/create/website/base/header'
  import Options from './options'
  import Crawling from './crawling'

  const I18N_PREFIX = 'datasetCreation.stepOne.website'

- type CrawlerProps = {
+ export type CrawlerProps = {
    nodeId: string
+   crawlResult: CrawlResult | undefined
+   setCrawlResult: (payload: CrawlResult) => void
+   step: CrawlStep
+   setStep: (step: CrawlStep) => void
    checkedCrawlResult: CrawlResultItem[]
    onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
    headerInfo: {
      docTitle: string
      docLink: string
    }
-   onPreview?: (payload: CrawlResultItem) => void
+   previewIndex?: number
+   onPreview?: (payload: CrawlResultItem, index: number) => void
    isInPipeline?: boolean
  }

- enum Step {
-   init = 'init',
-   running = 'running',
-   finished = 'finished',
- }

  const Crawler = ({
    nodeId,
+   crawlResult,
+   setCrawlResult,
+   step,
+   setStep,
    checkedCrawlResult,
    headerInfo,
    onCheckedCrawlResultChange,
+   previewIndex,
    onPreview,
    isInPipeline = false,
  }: CrawlerProps) => {
    const { t } = useTranslation()
-   const [step, setStep] = useState<Step>(Step.init)
    const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
    const [totalNum, setTotalNum] = useState(0)
    const [crawledNum, setCrawledNum] = useState(0)
    }, !!pipelineId && !!nodeId)

    useEffect(() => {
-     if (step !== Step.init)
+     if (step !== CrawlStep.init)
        setControlFoldOptions(Date.now())
    }, [step])

-   const isInit = step === Step.init
-   const isCrawlFinished = step === Step.finished
-   const isRunning = step === Step.running
-   const [crawlResult, setCrawlResult] = useState<{
-     data: CrawlResultItem[]
-     time_consuming: number | string
-   } | undefined>(undefined)
+   const isInit = step === CrawlStep.init
+   const isCrawlFinished = step === CrawlStep.finished
+   const isRunning = step === CrawlStep.running
    const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
    const showError = isCrawlFinished && crawlErrorMessage

      : `/rag/pipelines/${pipelineId}/workflows/draft/datasource/nodes/${nodeId}/run`

    const handleRun = useCallback(async (value: Record<string, any>) => {
-     setStep(Step.running)
+     setStep(CrawlStep.running)
      ssePost(
        datasourceNodeRunURL,
        {
        },
          onDataSourceNodeCompleted: (data: DataSourceNodeCompletedResponse) => {
            const { data: crawlData, time_consuming } = data
-           setCrawlResult({
-             data: crawlData as CrawlResultItem[],
+           const crawlResultData = {
+             data: crawlData.map((item: any) => {
+               const { content, ...rest } = item
+               return {
+                 markdown: content || '',
+                 ...rest,
+               } as CrawlResultItem
+             }),
              time_consuming: time_consuming ?? 0,
-           })
+           }
+           setCrawlResult(crawlResultData)
            onCheckedCrawlResultChange(crawlData || []) // default select the crawl result
            setCrawlErrorMessage('')
-           setStep(Step.finished)
+           setStep(CrawlStep.finished)
          },
          onError: (message: string) => {
            setCrawlErrorMessage(message || t(`${I18N_PREFIX}.unknownError`))
-           setStep(Step.finished)
+           setStep(CrawlStep.finished)
          },
        },
      )
-   }, [datasourceNodeRunURL, onCheckedCrawlResultChange, t])
+   }, [datasourceNodeRunURL, onCheckedCrawlResultChange, setCrawlResult, setStep, t])

    const handleSubmit = useCallback((value: Record<string, any>) => {
      handleRun(value)
          checkedList={checkedCrawlResult}
          onSelectedChange={onCheckedCrawlResultChange}
          usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
+         previewIndex={previewIndex}
          onPreview={onPreview}
        />
      )}
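For reference, the completion handler above now remaps each crawled item's content field into the markdown field that CrawlResultItem expects before storing the lifted CrawlResult. A standalone sketch of that same mapping as a pure helper (hypothetical, not part of the commit):

// Hypothetical helper mirroring the mapping handleRun performs on completion.
import type { CrawlResult, CrawlResultItem } from '@/models/datasets'

const toCrawlResult = (
  crawlData: Array<Record<string, any>>,
  timeConsuming?: number | string,
): CrawlResult => ({
  data: crawlData.map((item) => {
    // The data-source node returns the page text as `content`;
    // CrawlResultItem stores it as `markdown`.
    const { content, ...rest } = item
    return {
      markdown: content || '',
      ...rest,
    } as CrawlResultItem
  }),
  time_consuming: timeConsuming ?? 0,
})

export default toCrawlResult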

+ 12
- 13
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/index.tsx View File

  'use client'
  import React from 'react'
- import type { CrawlResultItem } from '@/models/datasets'
+ import type { CrawlerProps } from './base/crawler'
  import Crawler from './base/crawler'

- type WebsiteCrawlProps = {
-   nodeId: string
-   checkedCrawlResult: CrawlResultItem[]
-   onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
-   headerInfo: {
-     title: string
-     docTitle: string
-     docLink: string
-   }
-   onPreview?: (payload: CrawlResultItem) => void
-   isInPipeline?: boolean
- }
+ type WebsiteCrawlProps = CrawlerProps

  const WebsiteCrawl = ({
    nodeId,
+   crawlResult,
+   setCrawlResult,
+   step,
+   setStep,
    checkedCrawlResult,
    headerInfo,
    onCheckedCrawlResultChange,
+   previewIndex,
    onPreview,
    isInPipeline,
  }: WebsiteCrawlProps) => {
    return (
      <Crawler
        nodeId={nodeId}
+       crawlResult={crawlResult}
+       setCrawlResult={setCrawlResult}
+       step={step}
+       setStep={setStep}
        checkedCrawlResult={checkedCrawlResult}
        headerInfo={headerInfo}
        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+       previewIndex={previewIndex}
        onPreview={onPreview}
        isInPipeline={isInPipeline}
      />

+ 8
- 1
web/app/components/rag-pipeline/components/panel/test-run/hooks.ts View File

  import type { DataSourceNodeType } from '@/app/components/workflow/nodes/data-source/types'
  import { useCallback, useMemo, useState } from 'react'
  import type { DatasourceType } from '@/models/pipeline'
- import type { CrawlResultItem, FileItem } from '@/models/datasets'
+ import type { CrawlResult } from '@/models/datasets'
+ import { type CrawlResultItem, CrawlStep, type FileItem } from '@/models/datasets'
  import produce from 'immer'
  import type { NotionPage } from '@/models/common'

  export const useWebsiteCrawl = () => {
    const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
+   const [crawlResult, setCrawlResult] = useState<CrawlResult | undefined>()
+   const [step, setStep] = useState<CrawlStep>(CrawlStep.init)

    return {
+     crawlResult,
+     setCrawlResult,
      websitePages,
      setWebsitePages,
+     step,
+     setStep,
    }
  }

+ 8
- 0
web/app/components/rag-pipeline/components/panel/test-run/index.tsx View File

    updateOnlineDocuments,
  } = useOnlineDocuments()
  const {
+   crawlResult,
+   setCrawlResult,
    websitePages,
    setWebsitePages,
+   step,
+   setStep,
  } = useWebsiteCrawl()
  const { handleRun } = useWorkflowRun()

        docTitle: datasource.docTitle || '',
        docLink: datasource.docLink || '',
      }}
+     crawlResult={crawlResult}
+     setCrawlResult={setCrawlResult}
+     step={step}
+     setStep={setStep}
      onCheckedCrawlResultChange={setWebsitePages}
      isInPipeline
    />

+ 11
- 0
web/models/datasets.ts View File

    source_url: string
  }

+ export type CrawlResult = {
+   data: CrawlResultItem[]
+   time_consuming: number | string
+ }
+
+ export enum CrawlStep {
+   init = 'init',
+   running = 'running',
+   finished = 'finished',
+ }

  export type FileItem = {
    fileID: string
    file: CustomFile
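These shared types back the crawl state lifted into both useWebsiteCrawl hooks. As a small illustrative helper (hypothetical, not in the commit), note that time_consuming can arrive as either a number or a string, which is why the Crawler parses it before displaying the elapsed time:

// Hypothetical helper: normalizes CrawlResult.time_consuming to a number of seconds.
import type { CrawlResult } from '@/models/datasets'

export const getCrawlUsedTime = (result?: CrawlResult): number => {
  if (!result)
    return 0
  const parsed = typeof result.time_consuming === 'number'
    ? result.time_consuming
    : Number.parseFloat(result.time_consuming)
  return Number.isNaN(parsed) ? 0 : parsed
}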
