瀏覽代碼

feat: enhance website crawl functionality with state management and result handling

tags/2.0.0-beta.1
twwu 4 月之前
父節點
當前提交
6ba4a4c165

+ 13
- 2
web/app/components/datasets/documents/create-from-pipeline/hooks.ts 查看文件

@@ -5,7 +5,8 @@ import { useCallback, useMemo, useRef, useState } from 'react'
import { BlockEnum, type Node } from '@/app/components/workflow/types'
import type { DataSourceNodeType } from '@/app/components/workflow/nodes/data-source/types'
import type { DatasourceType } from '@/models/pipeline'
import type { CrawlResultItem, DocumentItem, FileItem } from '@/models/datasets'
import type { CrawlResult, CrawlResultItem, DocumentItem, FileItem } from '@/models/datasets'
import { CrawlStep } from '@/models/datasets'
import produce from 'immer'
import type { NotionPage } from '@/models/common'

@@ -150,15 +151,20 @@ export const useOnlineDocuments = () => {
export const useWebsiteCrawl = () => {
const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
const [currentWebsite, setCurrentWebsite] = useState<CrawlResultItem | undefined>()
const [crawlResult, setCrawlResult] = useState<CrawlResult | undefined>()
const [step, setStep] = useState<CrawlStep>(CrawlStep.init)
const [previewIndex, setPreviewIndex] = useState<number>(-1)

const previewWebsitePage = useRef<CrawlResultItem>(websitePages[0])

const updateCurrentWebsite = useCallback((website: CrawlResultItem) => {
const updateCurrentWebsite = useCallback((website: CrawlResultItem, index: number) => {
setCurrentWebsite(website)
setPreviewIndex(index)
}, [])

const hideWebsitePreview = useCallback(() => {
setCurrentWebsite(undefined)
setPreviewIndex(-1)
}, [])

const updataCheckedCrawlResultChange = useCallback((checkedCrawlResult: CrawlResultItem[]) => {
@@ -168,10 +174,15 @@ export const useWebsiteCrawl = () => {

return {
websitePages,
crawlResult,
setCrawlResult,
step,
setStep,
previewWebsitePage,
updataCheckedCrawlResultChange,
currentWebsite,
updateCurrentWebsite,
previewIndex,
hideWebsitePreview,
}
}

+ 112
- 96
web/app/components/datasets/documents/create-from-pipeline/index.tsx 查看文件

@@ -69,10 +69,15 @@ const CreateFormPipeline = () => {
} = useOnlineDocuments()
const {
websitePages,
crawlResult,
setCrawlResult,
step,
setStep,
previewWebsitePage,
updataCheckedCrawlResultChange,
currentWebsite,
updateCurrentWebsite,
previewIndex,
hideWebsitePreview,
} = useWebsiteCrawl()

@@ -225,113 +230,124 @@ const CreateFormPipeline = () => {
<div
className='relative flex h-[calc(100vh-56px)] w-full min-w-[1024px] overflow-x-auto rounded-t-2xl border-t border-effects-highlight bg-background-default-subtle'
>
<div className='flex h-full flex-1 flex-col px-14'>
<LeftHeader
steps={steps}
title={t('datasetPipeline.addDocuments.title')}
currentStep={currentStep}
/>
<div className='grow overflow-y-auto'>
{
currentStep === 1 && (
<div className='flex flex-col gap-y-5 pt-4'>
<DataSourceOptions
datasourceNodeId={datasource?.nodeId || ''}
onSelect={setDatasource}
pipelineNodes={(pipelineInfo?.graph.nodes || []) as Node<DataSourceNodeType>[]}
/>
{datasource?.type === DatasourceType.localFile && (
<LocalFile
files={fileList}
allowedExtensions={datasource?.fileExtensions || []}
updateFile={updateFile}
updateFileList={updateFileList}
onPreview={updateCurrentFile}
notSupportBatchUpload={notSupportBatchUpload}
/>
)}
{datasource?.type === DatasourceType.onlineDocument && (
<OnlineDocuments
nodeId={datasource?.nodeId || ''}
headerInfo={{
title: datasource.description,
docTitle: datasource.docTitle || '',
docLink: datasource.docLink || '',
}}
onlineDocuments={onlineDocuments}
updateOnlineDocuments={updateOnlineDocuments}
canPreview
onPreview={updateCurrentPage}
/>
)}
{datasource?.type === DatasourceType.websiteCrawl && (
<WebsiteCrawl
nodeId={datasource?.nodeId || ''}
headerInfo={{
title: datasource.description,
docTitle: datasource.docTitle || '',
docLink: datasource.docLink || '',
}}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={updataCheckedCrawlResultChange}
onPreview={updateCurrentWebsite}
<div className='h-full min-w-0 flex-1'>
<div className='flex h-full flex-col px-14'>
<LeftHeader
steps={steps}
title={t('datasetPipeline.addDocuments.title')}
currentStep={currentStep}
/>
<div className='grow overflow-y-auto'>
{
currentStep === 1 && (
<div className='flex flex-col gap-y-5 pt-4'>
<DataSourceOptions
datasourceNodeId={datasource?.nodeId || ''}
onSelect={setDatasource}
pipelineNodes={(pipelineInfo?.graph.nodes || []) as Node<DataSourceNodeType>[]}
/>
)}
{isShowVectorSpaceFull && (
<VectorSpaceFull />
)}
<Actions disabled={nextBtnDisabled} handleNextStep={handleNextStep} />
</div>
)
}
{
currentStep === 2 && (
<ProcessDocuments
ref={formRef}
dataSourceNodeId={datasource?.nodeId || ''}
onProcess={onClickProcess}
onPreview={onClickPreview}
onSubmit={handleSubmit}
onBack={handleBackStep}
/>
)
}
{
currentStep === 3 && (
<Processing
batchId={batchId}
documents={documents}
/>
)
}
{datasource?.type === DatasourceType.localFile && (
<LocalFile
files={fileList}
allowedExtensions={datasource?.fileExtensions || []}
updateFile={updateFile}
updateFileList={updateFileList}
onPreview={updateCurrentFile}
notSupportBatchUpload={notSupportBatchUpload}
/>
)}
{datasource?.type === DatasourceType.onlineDocument && (
<OnlineDocuments
nodeId={datasource?.nodeId || ''}
headerInfo={{
title: datasource.description,
docTitle: datasource.docTitle || '',
docLink: datasource.docLink || '',
}}
onlineDocuments={onlineDocuments}
updateOnlineDocuments={updateOnlineDocuments}
canPreview
onPreview={updateCurrentPage}
/>
)}
{datasource?.type === DatasourceType.websiteCrawl && (
<WebsiteCrawl
nodeId={datasource?.nodeId || ''}
headerInfo={{
title: datasource.description,
docTitle: datasource.docTitle || '',
docLink: datasource.docLink || '',
}}
crawlResult={crawlResult}
setCrawlResult={setCrawlResult}
step={step}
setStep={setStep}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={updataCheckedCrawlResultChange}
onPreview={updateCurrentWebsite}
previewIndex={previewIndex}
/>
)}
{isShowVectorSpaceFull && (
<VectorSpaceFull />
)}
<Actions disabled={nextBtnDisabled} handleNextStep={handleNextStep} />
</div>
)
}
{
currentStep === 2 && (
<ProcessDocuments
ref={formRef}
dataSourceNodeId={datasource?.nodeId || ''}
onProcess={onClickProcess}
onPreview={onClickPreview}
onSubmit={handleSubmit}
onBack={handleBackStep}
/>
)
}
{
currentStep === 3 && (
<Processing
batchId={batchId}
documents={documents}
/>
)
}
</div>
</div>
</div>
{/* Preview */}
{
currentStep === 1 && (
<div className='flex h-full flex-1 pl-2 pt-2'>
{currentFile && <FilePreview file={currentFile} hidePreview={hideFilePreview} />}
{currentDocuments && <OnlineDocumentPreview currentPage={currentDocuments} hidePreview={hideOnlineDocumentPreview} />}
{currentWebsite && <WebsitePreview payload={currentWebsite} hidePreview={hideWebsitePreview} />}
<div className='h-full min-w-0 flex-1'>
<div className='flex h-full flex-col pl-2 pt-2'>
{currentFile && <FilePreview file={currentFile} hidePreview={hideFilePreview} />}
{currentDocuments && <OnlineDocumentPreview currentPage={currentDocuments} hidePreview={hideOnlineDocumentPreview} />}
{currentWebsite && <WebsitePreview payload={currentWebsite} hidePreview={hideWebsitePreview} />}
</div>
</div>
)
}
{
currentStep === 2 && (
<div className='flex h-full flex-1 pl-2 pt-2'>
<ChunkPreview
dataSourceType={datasource!.type}
files={fileList.map(file => file.file)}
onlineDocuments={onlineDocuments}
websitePages={websitePages}
isIdle={isIdle}
isPending={isPending && isPreview.current}
estimateData={estimateData}
onPreview={onClickPreview}
handlePreviewFileChange={handlePreviewFileChange}
handlePreviewOnlineDocumentChange={handlePreviewOnlineDocumentChange}
handlePreviewWebsitePageChange={handlePreviewWebsiteChange}
/>
<div className='h-full min-w-0 flex-1'>
<div className='flex h-full flex-col pl-2 pt-2'>
<ChunkPreview
dataSourceType={datasource!.type}
files={fileList.map(file => file.file)}
onlineDocuments={onlineDocuments}
websitePages={websitePages}
isIdle={isIdle}
isPending={isPending && isPreview.current}
estimateData={estimateData}
onPreview={onClickPreview}
handlePreviewFileChange={handlePreviewFileChange}
handlePreviewOnlineDocumentChange={handlePreviewOnlineDocumentChange}
handlePreviewWebsitePageChange={handlePreviewWebsiteChange}
/>
</div>
</div>
)
}

+ 13
- 8
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawled-result-item.tsx 查看文件

@@ -29,7 +29,10 @@ const CrawledResultItem = ({
onCheckChange(!isChecked)
}, [isChecked, onCheckChange])
return (
<div className={cn('flex cursor-pointer gap-x-2 rounded-lg p-2', isPreview ? 'bg-state-base-active' : 'group hover:bg-state-base-hover')}>
<div className={cn(
'relative flex cursor-pointer gap-x-2 rounded-lg p-2',
isPreview ? 'bg-state-base-active' : 'group hover:bg-state-base-hover',
)}>
<Checkbox
className='shrink-0'
checked={isChecked}
@@ -49,13 +52,15 @@ const CrawledResultItem = ({
{payload.source_url}
</div>
</div>
{showPreview && <Button
size='small'
onClick={onPreview}
className='system-xs-medium-uppercase right-0 top-0 hidden px-1.5 group-hover:absolute group-hover:block'
>
{t('datasetCreation.stepOne.website.preview')}
</Button>}
{showPreview && (
<Button
size='small'
onClick={onPreview}
className='system-xs-medium-uppercase right-2 top-2 hidden px-1.5 group-hover:absolute group-hover:block'
>
{t('datasetCreation.stepOne.website.preview')}
</Button>
)}
</div>
)
}

+ 5
- 5
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawled-result.tsx 查看文件

@@ -1,5 +1,5 @@
'use client'
import React, { useCallback, useState } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import cn from '@/utils/classnames'
import type { CrawlResultItem } from '@/models/datasets'
@@ -10,15 +10,17 @@ const I18N_PREFIX = 'datasetCreation.stepOne.website'

type CrawledResultProps = {
className?: string
previewIndex?: number
list: CrawlResultItem[]
checkedList: CrawlResultItem[]
onSelectedChange: (selected: CrawlResultItem[]) => void
onPreview?: (payload: CrawlResultItem) => void
onPreview?: (payload: CrawlResultItem, index: number) => void
usedTime: number
}

const CrawledResult = ({
className = '',
previewIndex,
list,
checkedList,
onSelectedChange,
@@ -26,7 +28,6 @@ const CrawledResult = ({
onPreview,
}: CrawledResultProps) => {
const { t } = useTranslation()
const [previewIndex, setPreviewIndex] = useState<number>(-1)

const isCheckAll = checkedList.length === list.length

@@ -50,8 +51,7 @@ const CrawledResult = ({

const handlePreview = useCallback((index: number) => {
if (!onPreview) return
setPreviewIndex(index)
onPreview(list[index])
onPreview(list[index], index)
}, [list, onPreview])

return (

+ 33
- 25
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/base/crawler.tsx 查看文件

@@ -1,7 +1,8 @@
'use client'
import React, { useCallback, useEffect, useRef, useState } from 'react'
import { useTranslation } from 'react-i18next'
import type { CrawlResultItem } from '@/models/datasets'
import type { CrawlResult, CrawlResultItem } from '@/models/datasets'
import { CrawlStep } from '@/models/datasets'
import Header from '@/app/components/datasets/create/website/base/header'
import Options from './options'
import Crawling from './crawling'
@@ -21,8 +22,12 @@ import type {

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type CrawlerProps = {
export type CrawlerProps = {
nodeId: string
crawlResult: CrawlResult | undefined
setCrawlResult: (payload: CrawlResult) => void
step: CrawlStep
setStep: (step: CrawlStep) => void
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
headerInfo: {
@@ -30,26 +35,25 @@ type CrawlerProps = {
docTitle: string
docLink: string
}
onPreview?: (payload: CrawlResultItem) => void
previewIndex?: number
onPreview?: (payload: CrawlResultItem, index: number) => void
isInPipeline?: boolean
}

enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}

const Crawler = ({
nodeId,
crawlResult,
setCrawlResult,
step,
setStep,
checkedCrawlResult,
headerInfo,
onCheckedCrawlResultChange,
previewIndex,
onPreview,
isInPipeline = false,
}: CrawlerProps) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
const [totalNum, setTotalNum] = useState(0)
const [crawledNum, setCrawledNum] = useState(0)
@@ -62,17 +66,13 @@ const Crawler = ({
}, !!pipelineId && !!nodeId)

useEffect(() => {
if (step !== Step.init)
if (step !== CrawlStep.init)
setControlFoldOptions(Date.now())
}, [step])

const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const isInit = step === CrawlStep.init
const isCrawlFinished = step === CrawlStep.finished
const isRunning = step === CrawlStep.running
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage

@@ -81,7 +81,7 @@ const Crawler = ({
: `/rag/pipelines/${pipelineId}/workflows/draft/datasource/nodes/${nodeId}/run`

const handleRun = useCallback(async (value: Record<string, any>) => {
setStep(Step.running)
setStep(CrawlStep.running)
ssePost(
datasourceNodeRunURL,
{
@@ -98,21 +98,28 @@ const Crawler = ({
},
onDataSourceNodeCompleted: (data: DataSourceNodeCompletedResponse) => {
const { data: crawlData, time_consuming } = data
setCrawlResult({
data: crawlData as CrawlResultItem[],
const crawlResultData = {
data: crawlData.map((item: any) => {
const { content, ...rest } = item
return {
markdown: content || '',
...rest,
} as CrawlResultItem
}),
time_consuming: time_consuming ?? 0,
})
}
setCrawlResult(crawlResultData)
onCheckedCrawlResultChange(crawlData || []) // default select the crawl result
setCrawlErrorMessage('')
setStep(Step.finished)
setStep(CrawlStep.finished)
},
onError: (message: string) => {
setCrawlErrorMessage(message || t(`${I18N_PREFIX}.unknownError`))
setStep(Step.finished)
setStep(CrawlStep.finished)
},
},
)
}, [datasourceNodeRunURL, onCheckedCrawlResultChange, t])
}, [datasourceNodeRunURL, onCheckedCrawlResultChange, setCrawlResult, setStep, t])

const handleSubmit = useCallback((value: Record<string, any>) => {
handleRun(value)
@@ -155,6 +162,7 @@ const Crawler = ({
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
previewIndex={previewIndex}
onPreview={onPreview}
/>
)}

+ 12
- 13
web/app/components/rag-pipeline/components/panel/test-run/data-source/website-crawl/index.tsx 查看文件

@@ -1,35 +1,34 @@
'use client'
import React from 'react'
import type { CrawlResultItem } from '@/models/datasets'
import type { CrawlerProps } from './base/crawler'
import Crawler from './base/crawler'

type WebsiteCrawlProps = {
nodeId: string
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
headerInfo: {
title: string
docTitle: string
docLink: string
}
onPreview?: (payload: CrawlResultItem) => void
isInPipeline?: boolean
}
type WebsiteCrawlProps = CrawlerProps

const WebsiteCrawl = ({
nodeId,
crawlResult,
setCrawlResult,
step,
setStep,
checkedCrawlResult,
headerInfo,
onCheckedCrawlResultChange,
previewIndex,
onPreview,
isInPipeline,
}: WebsiteCrawlProps) => {
return (
<Crawler
nodeId={nodeId}
crawlResult={crawlResult}
setCrawlResult={setCrawlResult}
step={step}
setStep={setStep}
checkedCrawlResult={checkedCrawlResult}
headerInfo={headerInfo}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
previewIndex={previewIndex}
onPreview={onPreview}
isInPipeline={isInPipeline}
/>

+ 8
- 1
web/app/components/rag-pipeline/components/panel/test-run/hooks.ts 查看文件

@@ -6,7 +6,8 @@ import { BlockEnum } from '@/app/components/workflow/types'
import type { DataSourceNodeType } from '@/app/components/workflow/nodes/data-source/types'
import { useCallback, useMemo, useState } from 'react'
import type { DatasourceType } from '@/models/pipeline'
import type { CrawlResultItem, FileItem } from '@/models/datasets'
import type { CrawlResult } from '@/models/datasets'
import { type CrawlResultItem, CrawlStep, type FileItem } from '@/models/datasets'
import produce from 'immer'
import type { NotionPage } from '@/models/common'

@@ -116,9 +117,15 @@ export const useOnlineDocuments = () => {

// Shared state for the website-crawl step of the pipeline test run:
// the user's checked pages, the raw crawl result, and the crawl lifecycle step.
// State is lifted into this hook so it survives across child re-mounts
// (per this commit's intent: "state management and result handling").
export const useWebsiteCrawl = () => {
// Pages the user has checked/selected from the crawl result.
const [websitePages, setWebsitePages] = useState<CrawlResultItem[]>([])
// Raw output of the last crawl run; undefined until a crawl completes.
const [crawlResult, setCrawlResult] = useState<CrawlResult | undefined>()
// Current phase of the crawl flow; starts at CrawlStep.init.
const [step, setStep] = useState<CrawlStep>(CrawlStep.init)

return {
crawlResult,
setCrawlResult,
websitePages,
setWebsitePages,
step,
setStep,
}
}

+ 8
- 0
web/app/components/rag-pipeline/components/panel/test-run/index.tsx 查看文件

@@ -39,8 +39,12 @@ const TestRunPanel = () => {
updateOnlineDocuments,
} = useOnlineDocuments()
const {
crawlResult,
setCrawlResult,
websitePages,
setWebsitePages,
step,
setStep,
} = useWebsiteCrawl()
const { handleRun } = useWorkflowRun()

@@ -144,6 +148,10 @@ const TestRunPanel = () => {
docTitle: datasource.docTitle || '',
docLink: datasource.docLink || '',
}}
crawlResult={crawlResult}
setCrawlResult={setCrawlResult}
step={step}
setStep={setStep}
onCheckedCrawlResultChange={setWebsitePages}
isInPipeline
/>

+ 11
- 0
web/models/datasets.ts 查看文件

@@ -158,6 +158,17 @@ export type CrawlResultItem = {
source_url: string
}

/**
 * Aggregated output of a website crawl run.
 */
export type CrawlResult = {
// Crawled pages, one item per fetched URL.
data: CrawlResultItem[]
// Elapsed crawl time; the backend may send a number or a numeric string
// (consumers parse it with Number.parseFloat) — TODO confirm unit (seconds).
time_consuming: number | string
}

/**
 * Lifecycle phases of the website-crawl UI flow.
 * Hoisted here so multiple components/hooks can share one definition
 * (replaces a local `Step` enum previously private to the crawler component).
 */
export enum CrawlStep {
// No crawl started yet; options form is shown.
init = 'init',
// Crawl request in flight; progress is shown.
running = 'running',
// Crawl ended (successfully or with an error); results are shown.
finished = 'finished',
}

export type FileItem = {
fileID: string
file: CustomFile

Loading…
取消
儲存