|
|
|
@@ -1,5 +1,5 @@ |
|
|
|
'use client' |
|
|
|
import React, { useEffect, useLayoutEffect, useRef, useState } from 'react' |
|
|
|
import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react' |
|
|
|
import { useTranslation } from 'react-i18next' |
|
|
|
import { useContext } from 'use-context-selector' |
|
|
|
import { useBoolean } from 'ahooks' |
|
|
|
@@ -13,6 +13,8 @@ import { groupBy } from 'lodash-es' |
|
|
|
import PreviewItem, { PreviewType } from './preview-item' |
|
|
|
import LanguageSelect from './language-select' |
|
|
|
import s from './index.module.css' |
|
|
|
import unescape from './unescape' |
|
|
|
import escape from './escape' |
|
|
|
import cn from '@/utils/classnames' |
|
|
|
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' |
|
|
|
import { |
|
|
|
@@ -78,6 +80,8 @@ enum IndexingType { |
|
|
|
ECONOMICAL = 'economy', |
|
|
|
} |
|
|
|
|
|
|
|
// Default value for the segment separator state. The literal holds two
// backslash-n sequences (backslash + 'n', twice), not two real newline
// characters. It is used as the initial value of `segmentIdentifier` and as
// the fallback when `setSegmentIdentifier` receives an empty value.
// NOTE(review): presumably `unescape()` turns this into real newlines when the
// process rule is built; confirm against ./unescape.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' |
|
|
|
|
|
|
|
const StepTwo = ({ |
|
|
|
isSetting, |
|
|
|
documentDetail, |
|
|
|
@@ -110,8 +114,11 @@ const StepTwo = ({ |
|
|
|
const previewScrollRef = useRef<HTMLDivElement>(null) |
|
|
|
const [previewScrolled, setPreviewScrolled] = useState(false) |
|
|
|
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO) |
|
|
|
const [segmentIdentifier, setSegmentIdentifier] = useState('\\n') |
|
|
|
const [max, setMax] = useState(5000) // default chunk length |
|
|
|
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) |
|
|
|
// Stores a separator value in the `segmentIdentifier` state.
// - A non-empty `value` is passed through `escape()` before being stored.
// - An empty (falsy) `value` falls back to DEFAULT_SEGMENT_IDENTIFIER.
// The empty dependency array keeps the callback identity stable across renders.
// The text input's onChange calls `doSetSegmentIdentifier` directly, so typed
// text does not go through `escape()`.
const setSegmentIdentifier = useCallback((value: string) => { |
|
|
|
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER) |
|
|
|
}, []) |
|
|
|
const [max, setMax] = useState(4000) // default chunk length |
|
|
|
const [overlap, setOverlap] = useState(50) |
|
|
|
const [rules, setRules] = useState<PreProcessingRule[]>([]) |
|
|
|
const [defaultConfig, setDefaultConfig] = useState<Rules>() |
|
|
|
@@ -183,7 +190,7 @@ const StepTwo = ({ |
|
|
|
} |
|
|
|
const resetRules = () => { |
|
|
|
if (defaultConfig) { |
|
|
|
setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n') |
|
|
|
setSegmentIdentifier(defaultConfig.segmentation.separator) |
|
|
|
setMax(defaultConfig.segmentation.max_tokens) |
|
|
|
setOverlap(defaultConfig.segmentation.chunk_overlap) |
|
|
|
setRules(defaultConfig.pre_processing_rules) |
|
|
|
@@ -217,7 +224,7 @@ const StepTwo = ({ |
|
|
|
const ruleObj = { |
|
|
|
pre_processing_rules: rules, |
|
|
|
segmentation: { |
|
|
|
separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier, |
|
|
|
separator: unescape(segmentIdentifier), |
|
|
|
max_tokens: max, |
|
|
|
chunk_overlap: overlap, |
|
|
|
}, |
|
|
|
@@ -394,7 +401,7 @@ const StepTwo = ({ |
|
|
|
try { |
|
|
|
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' }) |
|
|
|
const separator = res.rules.segmentation.separator |
|
|
|
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n') |
|
|
|
setSegmentIdentifier(separator) |
|
|
|
setMax(res.rules.segmentation.max_tokens) |
|
|
|
setOverlap(res.rules.segmentation.chunk_overlap) |
|
|
|
setRules(res.rules.pre_processing_rules) |
|
|
|
@@ -411,7 +418,7 @@ const StepTwo = ({ |
|
|
|
const separator = rules.segmentation.separator |
|
|
|
const max = rules.segmentation.max_tokens |
|
|
|
const overlap = rules.segmentation.chunk_overlap |
|
|
|
setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n') |
|
|
|
setSegmentIdentifier(separator) |
|
|
|
setMax(max) |
|
|
|
setOverlap(overlap) |
|
|
|
setRules(rules.pre_processing_rules) |
|
|
|
@@ -616,12 +623,22 @@ const StepTwo = ({ |
|
|
|
<div className={s.typeFormBody}> |
|
|
|
<div className={s.formRow}> |
|
|
|
<div className='w-full'> |
|
|
|
<div className={s.label}>{t('datasetCreation.stepTwo.separator')}</div> |
|
|
|
<div className={s.label}> |
|
|
|
{t('datasetCreation.stepTwo.separator')} |
|
|
|
<Tooltip |
|
|
|
popupContent={ |
|
|
|
<div className='max-w-[200px]'> |
|
|
|
{t('datasetCreation.stepTwo.separatorTip')} |
|
|
|
</div> |
|
|
|
} |
|
|
|
/> |
|
|
|
</div> |
|
|
|
<input |
|
|
|
type="text" |
|
|
|
className={s.input} |
|
|
|
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} value={segmentIdentifier} |
|
|
|
onChange={e => setSegmentIdentifier(e.target.value)} |
|
|
|
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''} |
|
|
|
value={segmentIdentifier} |
|
|
|
onChange={e => doSetSegmentIdentifier(e.target.value)} |
|
|
|
/> |
|
|
|
</div> |
|
|
|
</div> |