Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder mit einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

index.tsx 47KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174
  1. 'use client'
  2. import type { FC, PropsWithChildren } from 'react'
  3. import React, { useCallback, useEffect, useRef, useState } from 'react'
  4. import { useTranslation } from 'react-i18next'
  5. import { useContext } from 'use-context-selector'
  6. import {
  7. RiAlertFill,
  8. RiArrowLeftLine,
  9. RiSearchEyeLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import Image from 'next/image'
  13. import { useHover } from 'ahooks'
  14. import SettingCog from '../assets/setting-gear-mod.svg'
  15. import OrangeEffect from '../assets/option-card-effect-orange.svg'
  16. import FamilyMod from '../assets/family-mod.svg'
  17. import Note from '../assets/note-mod.svg'
  18. import FileList from '../assets/file-list-3-fill.svg'
  19. import { indexMethodIcon } from '../icons'
  20. import { PreviewContainer } from '../../preview/container'
  21. import { ChunkContainer, QAPreview } from '../../chunk'
  22. import { PreviewHeader } from '../../preview/header'
  23. import { FormattedText } from '../../formatted-text/formatted'
  24. import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
  25. import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'
  26. import s from './index.module.css'
  27. import unescape from './unescape'
  28. import escape from './escape'
  29. import { OptionCard } from './option-card'
  30. import LanguageSelect from './language-select'
  31. import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
  32. import cn from '@/utils/classnames'
  33. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  34. import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
  35. import Button from '@/app/components/base/button'
  36. import FloatRightContainer from '@/app/components/base/float-right-container'
  37. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  38. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  39. import type { RetrievalConfig } from '@/types/app'
  40. import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  41. import Toast from '@/app/components/base/toast'
  42. import type { NotionPage } from '@/models/common'
  43. import { DataSourceProvider } from '@/models/common'
  44. import { useDatasetDetailContext } from '@/context/dataset-detail'
  45. import I18n from '@/context/i18n'
  46. import { RETRIEVE_METHOD } from '@/types/app'
  47. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import Checkbox from '@/app/components/base/checkbox'
  54. import RadioCard from '@/app/components/base/radio-card'
  55. import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
  56. import Divider from '@/app/components/base/divider'
  57. import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
  58. import Badge from '@/app/components/base/badge'
  59. import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
  60. import Tooltip from '@/app/components/base/tooltip'
  61. import CustomDialog from '@/app/components/base/dialog'
  62. import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
  63. import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
  /**
   * Small section label: wraps whatever children it receives in a `<label>`
   * carrying the `system-sm-semibold text-text-secondary` classes.
   */
  const TextLabel: FC<PropsWithChildren> = ({ children }) => (
    <label className='system-sm-semibold text-text-secondary'>{children}</label>
  )
  /** Props accepted by the `StepTwo` component defined later in this file. */
  67. type StepTwoProps = {
  // When truthy, StepTwo loads rules and mode from `documentDetail` instead of
  // fetching the default process rule, sends `original_document_id` in the
  // request params, and calls `onSave` after a successful create.
  68. isSetting?: boolean
  // When provided together with `datasetId`, seeds the initial preview
  // file / Notion page / website page, `doc_form` and `doc_language` state.
  69. documentDetail?: FullDocumentDetail
  // Chooses the default index type when `indexingType` is not given:
  // QUALIFIED if true, ECONOMICAL otherwise.
  70. isAPIKeySet: boolean
  // NOTE(review): declared here but not destructured by StepTwo.
  71. onSetting: () => void
  // Absent -> the "create first document" mutation is used; present -> the
  // "create document" mutation for that dataset is used.
  72. datasetId?: string
  // When set, takes precedence over the index type selected locally.
  73. indexingType?: IndexingType
  // NOTE(review): declared here but not destructured by StepTwo.
  74. retrievalMethod?: string
  // Data source type used while on the create page; otherwise StepTwo reads
  // the type from the current dataset.
  75. dataSourceType: DataSourceType
  // Uploaded files; their ids are sent as `file_info_list.file_ids`.
  76. files: CustomFile[]
  // Defaults to `[]` in StepTwo.
  77. notionPages?: NotionPage[]
  // Defaults to `[]` in StepTwo.
  78. websitePages?: CrawlResultItem[]
  // Forwarded to the website indexing-estimate query.
  79. crawlOptions?: CrawlOptions
  // Defaults to `DataSourceProvider.fireCrawl` in StepTwo.
  80. websiteCrawlProvider?: DataSourceProvider
  // Defaults to `''` in StepTwo.
  81. websiteCrawlJobId?: string
  // Called with `+1` after a document has been created.
  82. onStepChange?: (delta: number) => void
  // Called with the selected index type when creation succeeds.
  83. updateIndexingTypeCache?: (type: string) => void
  // Called with the retrieval config's `search_method`; only on the
  // first-document creation path.
  84. updateRetrievalMethodCache?: (method: string) => void
  // Called with the creation response when creation succeeds.
  85. updateResultCache?: (res: createDocumentResponse) => void
  // Called after creation when `isSetting` is truthy.
  86. onSave?: () => void
  // Destructured by StepTwo; its call site is outside the visible part of the file.
  87. onCancel?: () => void
  88. }
  /**
   * Index technique identifiers; the string values are what is sent as
   * `indexing_technique` in the request params built by StepTwo.
   */
  89. export enum IndexingType {
  // StepTwo requires an embedding model and provider when creating with this type.
  90. QUALIFIED = 'high_quality',
  // Choosing QA chunking with this type opens a confirm dialog; choosing
  // parent-child chunking switches the index type to QUALIFIED.
  91. ECONOMICAL = 'economy',
  92. }
  // Defaults for the "general" segmentation form. The delimiter is kept in its
  // escaped, user-editable form (a literal backslash followed by "n", twice).
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
  const DEFAULT_OVERLAP = 50

  // Upper bound used when the page does not provide a usable value.
  const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000

  /**
   * Parses the raw attribute value for the maximum chunk token length.
   *
   * Returns the parsed base-10 integer when it is a positive number, otherwise
   * the fallback. Without this guard a non-numeric attribute would produce
   * `NaN`, and every `maxChunkLength > limit` comparison would be false, which
   * silently disables the length validation.
   */
  const parseMaximumChunkTokenLength = (raw = '') => {
    const parsed = Number.parseInt(raw, 10)
    return !Number.isNaN(parsed) && parsed > 0
      ? parsed
      : FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH
  }

  // Read once at module load from <body data-public-indexing-max-segmentation-tokens-length>.
  // `globalThis.document` is optional-chained so the module also loads where no DOM exists.
  const MAXIMUM_CHUNK_TOKEN_LENGTH = parseMaximumChunkTokenLength(
    globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') ?? undefined,
  )
  /**
   * Form state for the parent-child chunking option.
   * StepTwo's `getProcessRule` maps `parent` to `segmentation`, `child` to
   * `subchunk_segmentation`, and `chunkForContext` to `parent_mode`.
   */
  97. type ParentChildConfig = {
  // The UI in this file sets this to 'paragraph' or 'full-doc'.
  98. chunkForContext: ParentMode
  99. parent: {
  // Held in escaped form; passed through `unescape` before being sent as `separator`.
  100. delimiter: string
  // Sent as `max_tokens`.
  101. maxLength: number
  102. }
  103. child: {
  // Held in escaped form; passed through `unescape` before being sent as `separator`.
  104. delimiter: string
  // Sent as `max_tokens`.
  105. maxLength: number
  106. }
  107. }
  /**
   * Initial parent-child configuration. StepTwo uses it as the initial
   * `parentChildConfig` state and restores it in `resetRules`.
   * Delimiters are written in escaped form (literal backslash + "n");
   * presumably `unescape` turns them into real newlines before the request —
   * confirm in ./unescape.
   */
  108. const defaultParentChildConfig: ParentChildConfig = {
  109. chunkForContext: 'paragraph',
  110. parent: {
  111. delimiter: '\\n\\n',
  112. maxLength: 500,
  113. },
  114. child: {
  115. delimiter: '\\n',
  116. maxLength: 200,
  117. },
  118. }
  119. const StepTwo = ({
  120. isSetting,
  121. documentDetail,
  122. isAPIKeySet,
  123. datasetId,
  124. indexingType,
  125. dataSourceType: inCreatePageDataSourceType,
  126. files,
  127. notionPages = [],
  128. websitePages = [],
  129. crawlOptions,
  130. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  131. websiteCrawlJobId = '',
  132. onStepChange,
  133. updateIndexingTypeCache,
  134. updateResultCache,
  135. onSave,
  136. onCancel,
  137. updateRetrievalMethodCache,
  138. }: StepTwoProps) => {
  139. const { t } = useTranslation()
  140. const { locale } = useContext(I18n)
  141. const media = useBreakpoints()
  142. const isMobile = media === MediaType.mobile
  143. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  144. const isInUpload = Boolean(currentDataset)
  145. const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  146. const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  147. const isInInit = !isInUpload && !isSetting
  148. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  149. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  150. const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
  151. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  152. const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  153. doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
  154. }, [])
  155. const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
  156. const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
  157. const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
  158. const [rules, setRules] = useState<PreProcessingRule[]>([])
  159. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  160. const hasSetIndexType = !!indexingType
  161. const [indexType, setIndexType] = useState<IndexingType>(() => {
  162. if (hasSetIndexType)
  163. return indexingType
  164. return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
  165. })
  166. const [previewFile, setPreviewFile] = useState<DocumentItem>(
  167. (datasetId && documentDetail)
  168. ? documentDetail.file
  169. : files[0],
  170. )
  171. const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
  172. (datasetId && documentDetail)
  173. ? documentDetail.notion_page
  174. : notionPages[0],
  175. )
  176. const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
  177. (datasetId && documentDetail)
  178. ? documentDetail.website_page
  179. : websitePages[0],
  180. )
  181. // QA Related
  182. const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
  183. const [docForm, setDocForm] = useState<ChunkingMode>(
  184. (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
  185. )
  186. const handleChangeDocform = (value: ChunkingMode) => {
  187. if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {
  188. setIsQAConfirmDialogOpen(true)
  189. return
  190. }
  191. if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
  192. setIndexType(IndexingType.QUALIFIED)
  193. setDocForm(value)
  194. // eslint-disable-next-line ts/no-use-before-define
  195. currentEstimateMutation.reset()
  196. }
  197. const [docLanguage, setDocLanguage] = useState<string>(
  198. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified'),
  199. )
  200. const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
  201. const getIndexing_technique = () => indexingType || indexType
  202. const currentDocForm = currentDataset?.doc_form || docForm
  203. const getProcessRule = (): ProcessRule => {
  204. if (currentDocForm === ChunkingMode.parentChild) {
  205. return {
  206. rules: {
  207. pre_processing_rules: rules,
  208. segmentation: {
  209. separator: unescape(
  210. parentChildConfig.parent.delimiter,
  211. ),
  212. max_tokens: parentChildConfig.parent.maxLength,
  213. },
  214. parent_mode: parentChildConfig.chunkForContext,
  215. subchunk_segmentation: {
  216. separator: unescape(parentChildConfig.child.delimiter),
  217. max_tokens: parentChildConfig.child.maxLength,
  218. },
  219. },
  220. mode: 'hierarchical',
  221. } as ProcessRule
  222. }
  223. return {
  224. rules: {
  225. pre_processing_rules: rules,
  226. segmentation: {
  227. separator: unescape(segmentIdentifier),
  228. max_tokens: maxChunkLength,
  229. chunk_overlap: overlap,
  230. },
  231. }, // api will check this. It will be removed after api refactored.
  232. mode: segmentationType,
  233. } as ProcessRule
  234. }
  235. const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  236. docForm: currentDocForm,
  237. docLanguage,
  238. dataSourceType: DataSourceType.FILE,
  239. files: previewFile
  240. ? [files.find(file => file.name === previewFile.name)!]
  241. : files,
  242. indexingTechnique: getIndexing_technique() as any,
  243. processRule: getProcessRule(),
  244. dataset_id: datasetId!,
  245. })
  246. const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  247. docForm: currentDocForm,
  248. docLanguage,
  249. dataSourceType: DataSourceType.NOTION,
  250. notionPages: [previewNotionPage],
  251. indexingTechnique: getIndexing_technique() as any,
  252. processRule: getProcessRule(),
  253. dataset_id: datasetId || '',
  254. })
  255. const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  256. docForm: currentDocForm,
  257. docLanguage,
  258. dataSourceType: DataSourceType.WEB,
  259. websitePages: [previewWebsitePage],
  260. crawlOptions,
  261. websiteCrawlProvider,
  262. websiteCrawlJobId,
  263. indexingTechnique: getIndexing_technique() as any,
  264. processRule: getProcessRule(),
  265. dataset_id: datasetId || '',
  266. })
  267. const currentEstimateMutation = dataSourceType === DataSourceType.FILE
  268. ? fileIndexingEstimateQuery
  269. : dataSourceType === DataSourceType.NOTION
  270. ? notionIndexingEstimateQuery
  271. : websiteIndexingEstimateQuery
  272. const fetchEstimate = useCallback(() => {
  273. if (dataSourceType === DataSourceType.FILE)
  274. fileIndexingEstimateQuery.mutate()
  275. if (dataSourceType === DataSourceType.NOTION)
  276. notionIndexingEstimateQuery.mutate()
  277. if (dataSourceType === DataSourceType.WEB)
  278. websiteIndexingEstimateQuery.mutate()
  279. }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
  280. const estimate
  281. = dataSourceType === DataSourceType.FILE
  282. ? fileIndexingEstimateQuery.data
  283. : dataSourceType === DataSourceType.NOTION
  284. ? notionIndexingEstimateQuery.data
  285. : websiteIndexingEstimateQuery.data
  286. const getRuleName = (key: string) => {
  287. if (key === 'remove_extra_spaces')
  288. return t('datasetCreation.stepTwo.removeExtraSpaces')
  289. if (key === 'remove_urls_emails')
  290. return t('datasetCreation.stepTwo.removeUrlEmails')
  291. if (key === 'remove_stopwords')
  292. return t('datasetCreation.stepTwo.removeStopwords')
  293. }
  294. const ruleChangeHandle = (id: string) => {
  295. const newRules = rules.map((rule) => {
  296. if (rule.id === id) {
  297. return {
  298. id: rule.id,
  299. enabled: !rule.enabled,
  300. }
  301. }
  302. return rule
  303. })
  304. setRules(newRules)
  305. }
  306. const resetRules = () => {
  307. if (defaultConfig) {
  308. setSegmentIdentifier(defaultConfig.segmentation.separator)
  309. setMaxChunkLength(defaultConfig.segmentation.max_tokens)
  310. setOverlap(defaultConfig.segmentation.chunk_overlap!)
  311. setRules(defaultConfig.pre_processing_rules)
  312. }
  313. setParentChildConfig(defaultParentChildConfig)
  314. }
  315. const updatePreview = () => {
  316. if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
  317. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
  318. return
  319. }
  320. fetchEstimate()
  321. }
  322. const {
  323. modelList: rerankModelList,
  324. defaultModel: rerankDefaultModel,
  325. currentModel: isRerankDefaultModelValid,
  326. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  327. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  328. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  329. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  330. currentDataset?.embedding_model
  331. ? {
  332. provider: currentDataset.embedding_model_provider,
  333. model: currentDataset.embedding_model,
  334. }
  335. : {
  336. provider: defaultEmbeddingModel?.provider.provider || '',
  337. model: defaultEmbeddingModel?.model || '',
  338. },
  339. )
  340. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  341. search_method: RETRIEVE_METHOD.semantic,
  342. reranking_enable: false,
  343. reranking_model: {
  344. reranking_provider_name: '',
  345. reranking_model_name: '',
  346. },
  347. top_k: 3,
  348. score_threshold_enabled: false,
  349. score_threshold: 0.5,
  350. } as RetrievalConfig)
  351. useEffect(() => {
  352. if (currentDataset?.retrieval_model_dict)
  353. return
  354. setRetrievalConfig({
  355. search_method: RETRIEVE_METHOD.semantic,
  356. reranking_enable: !!isRerankDefaultModelValid,
  357. reranking_model: {
  358. reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
  359. reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
  360. },
  361. top_k: 3,
  362. score_threshold_enabled: false,
  363. score_threshold: 0.5,
  364. })
  365. // eslint-disable-next-line react-hooks/exhaustive-deps
  366. }, [rerankDefaultModel, isRerankDefaultModelValid])
  367. const getCreationParams = () => {
  368. let params
  369. if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
  370. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  371. return
  372. }
  373. if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
  374. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
  375. return
  376. }
  377. if (isSetting) {
  378. params = {
  379. original_document_id: documentDetail?.id,
  380. doc_form: currentDocForm,
  381. doc_language: docLanguage,
  382. process_rule: getProcessRule(),
  383. retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
  384. embedding_model: embeddingModel.model, // Readonly
  385. embedding_model_provider: embeddingModel.provider, // Readonly
  386. indexing_technique: getIndexing_technique(),
  387. } as CreateDocumentReq
  388. }
  389. else { // create
  390. const indexMethod = getIndexing_technique()
  391. if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
  392. Toast.notify({
  393. type: 'error',
  394. message: t('appDebug.datasetConfig.embeddingModelRequired'),
  395. })
  396. return
  397. }
  398. if (
  399. !isReRankModelSelected({
  400. rerankModelList,
  401. retrievalConfig,
  402. indexMethod: indexMethod as string,
  403. })
  404. ) {
  405. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  406. return
  407. }
  408. params = {
  409. data_source: {
  410. type: dataSourceType,
  411. info_list: {
  412. data_source_type: dataSourceType,
  413. },
  414. },
  415. indexing_technique: getIndexing_technique(),
  416. process_rule: getProcessRule(),
  417. doc_form: currentDocForm,
  418. doc_language: docLanguage,
  419. retrieval_model: retrievalConfig,
  420. embedding_model: embeddingModel.model,
  421. embedding_model_provider: embeddingModel.provider,
  422. } as CreateDocumentReq
  423. if (dataSourceType === DataSourceType.FILE) {
  424. params.data_source.info_list.file_info_list = {
  425. file_ids: files.map(file => file.id || '').filter(Boolean),
  426. }
  427. }
  428. if (dataSourceType === DataSourceType.NOTION)
  429. params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  430. if (dataSourceType === DataSourceType.WEB) {
  431. params.data_source.info_list.website_info_list = getWebsiteInfo({
  432. websiteCrawlProvider,
  433. websiteCrawlJobId,
  434. websitePages,
  435. })
  436. }
  437. }
  438. return params
  439. }
  440. const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  441. onSuccess(data) {
  442. const separator = data.rules.segmentation.separator
  443. setSegmentIdentifier(separator)
  444. setMaxChunkLength(data.rules.segmentation.max_tokens)
  445. setOverlap(data.rules.segmentation.chunk_overlap!)
  446. setRules(data.rules.pre_processing_rules)
  447. setDefaultConfig(data.rules)
  448. setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)
  449. },
  450. onError(error) {
  451. Toast.notify({
  452. type: 'error',
  453. message: `${error}`,
  454. })
  455. },
  456. })
  457. const getRulesFromDetail = () => {
  458. if (documentDetail) {
  459. const rules = documentDetail.dataset_process_rule.rules
  460. const separator = rules.segmentation.separator
  461. const max = rules.segmentation.max_tokens
  462. const overlap = rules.segmentation.chunk_overlap
  463. setSegmentIdentifier(separator)
  464. setMaxChunkLength(max)
  465. setOverlap(overlap!)
  466. setRules(rules.pre_processing_rules)
  467. setDefaultConfig(rules)
  468. }
  469. }
  470. const getDefaultMode = () => {
  471. if (documentDetail)
  472. setSegmentationType(documentDetail.dataset_process_rule.mode)
  473. }
  474. const createFirstDocumentMutation = useCreateFirstDocument({
  475. onError(error) {
  476. Toast.notify({
  477. type: 'error',
  478. message: `${error}`,
  479. })
  480. },
  481. })
  482. const createDocumentMutation = useCreateDocument(datasetId!, {
  483. onError(error) {
  484. Toast.notify({
  485. type: 'error',
  486. message: `${error}`,
  487. })
  488. },
  489. })
  490. const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
  491. const createHandle = async () => {
  492. const params = getCreationParams()
  493. if (!params)
  494. return false
  495. if (!datasetId) {
  496. await createFirstDocumentMutation.mutateAsync(
  497. params,
  498. {
  499. onSuccess(data) {
  500. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  501. updateResultCache && updateResultCache(data)
  502. updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
  503. },
  504. },
  505. )
  506. }
  507. else {
  508. await createDocumentMutation.mutateAsync(params, {
  509. onSuccess(data) {
  510. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  511. updateResultCache && updateResultCache(data)
  512. },
  513. })
  514. }
  515. if (mutateDatasetRes)
  516. mutateDatasetRes()
  517. onStepChange && onStepChange(+1)
  518. isSetting && onSave && onSave()
  519. }
  520. useEffect(() => {
  521. // fetch rules
  522. if (!isSetting) {
  523. fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  524. }
  525. else {
  526. getRulesFromDetail()
  527. getDefaultMode()
  528. }
  529. // eslint-disable-next-line react-hooks/exhaustive-deps
  530. }, [])
  531. useEffect(() => {
  532. // get indexing type by props
  533. if (indexingType)
  534. setIndexType(indexingType as IndexingType)
  535. else
  536. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  537. }, [isAPIKeySet, indexingType, datasetId])
  538. const economyDomRef = useRef<HTMLDivElement>(null)
  539. const isHoveringEconomy = useHover(economyDomRef)
  540. const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type
  541. return (
  542. <div className='flex h-full w-full'>
  543. <div className={cn('relative h-full w-1/2 overflow-y-auto py-6', isMobile ? 'px-4' : 'px-12')}>
  544. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.segmentation')}</div>
  545. {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))
  546. || isUploadInEmptyDataset
  547. || isInInit)
  548. && <OptionCard
  549. className='mb-2 bg-background-section'
  550. title={t('datasetCreation.stepTwo.general')}
  551. icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
  552. activeHeaderClassName='bg-dataset-option-card-blue-gradient'
  553. description={t('datasetCreation.stepTwo.generalTip')}
  554. isActive={
  555. [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)
  556. }
  557. onSwitched={() =>
  558. handleChangeDocform(ChunkingMode.text)
  559. }
  560. actions={
  561. <>
  562. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  563. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  564. {t('datasetCreation.stepTwo.previewChunk')}
  565. </Button>
  566. <Button variant={'ghost'} onClick={resetRules}>
  567. {t('datasetCreation.stepTwo.reset')}
  568. </Button>
  569. </>
  570. }
  571. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  572. >
  573. <div className='flex flex-col gap-y-4'>
  574. <div className='flex gap-3'>
  575. <DelimiterInput
  576. value={segmentIdentifier}
  577. onChange={e => setSegmentIdentifier(e.target.value, true)}
  578. />
  579. <MaxLengthInput
  580. unit='tokens'
  581. value={maxChunkLength}
  582. onChange={setMaxChunkLength}
  583. />
  584. <OverlapInput
  585. unit='tokens'
  586. value={overlap}
  587. min={1}
  588. onChange={setOverlap}
  589. />
  590. </div>
  591. <div className='flex w-full flex-col'>
  592. <div className='flex items-center gap-x-2'>
  593. <div className='inline-flex shrink-0'>
  594. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  595. </div>
  596. <Divider className='grow' bgStyle='gradient' />
  597. </div>
  598. <div className='mt-1'>
  599. {rules.map(rule => (
  600. <div key={rule.id} className={s.ruleItem} onClick={() => {
  601. ruleChangeHandle(rule.id)
  602. }}>
  603. <Checkbox
  604. checked={rule.enabled}
  605. />
  606. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  607. </div>
  608. ))}
  609. {IS_CE_EDITION && <>
  610. <Divider type='horizontal' className='my-4 bg-divider-subtle' />
  611. <div className='flex items-center py-0.5'>
  612. <div className='flex items-center' onClick={() => {
  613. if (currentDataset?.doc_form)
  614. return
  615. if (docForm === ChunkingMode.qa)
  616. handleChangeDocform(ChunkingMode.text)
  617. else
  618. handleChangeDocform(ChunkingMode.qa)
  619. }}>
  620. <Checkbox
  621. checked={currentDocForm === ChunkingMode.qa}
  622. disabled={!!currentDataset?.doc_form}
  623. />
  624. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
  625. {t('datasetCreation.stepTwo.useQALanguage')}
  626. </label>
  627. </div>
  628. <LanguageSelect
  629. currentLanguage={docLanguage || locale}
  630. onSelect={setDocLanguage}
  631. disabled={currentDocForm !== ChunkingMode.qa}
  632. />
  633. <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
  634. </div>
  635. {currentDocForm === ChunkingMode.qa && (
  636. <div
  637. style={{
  638. background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
  639. }}
  640. className='mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]'
  641. >
  642. <RiAlertFill className='size-4 text-text-warning-secondary' />
  643. <span className='system-xs-medium text-text-primary'>
  644. {t('datasetCreation.stepTwo.QATip')}
  645. </span>
  646. </div>
  647. )}
  648. </>}
  649. </div>
  650. </div>
  651. </div>
  652. </OptionCard>}
  653. {
  654. (
  655. (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)
  656. || isUploadInEmptyDataset
  657. || isInInit
  658. )
  659. && <OptionCard
  660. title={t('datasetCreation.stepTwo.parentChild')}
  661. icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
  662. effectImg={OrangeEffect.src}
  663. activeHeaderClassName='bg-dataset-option-card-orange-gradient'
  664. description={t('datasetCreation.stepTwo.parentChildTip')}
  665. isActive={currentDocForm === ChunkingMode.parentChild}
  666. onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}
  667. actions={
  668. <>
  669. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  670. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  671. {t('datasetCreation.stepTwo.previewChunk')}
  672. </Button>
  673. <Button variant={'ghost'} onClick={resetRules}>
  674. {t('datasetCreation.stepTwo.reset')}
  675. </Button>
  676. </>
  677. }
  678. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  679. >
  680. <div className='flex flex-col gap-4'>
  681. <div>
  682. <div className='flex items-center gap-x-2'>
  683. <div className='inline-flex shrink-0'>
  684. <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>
  685. </div>
  686. <Divider className='grow' bgStyle='gradient' />
  687. </div>
  688. <RadioCard className='mt-1'
  689. icon={<Image src={Note} alt='' />}
  690. title={t('datasetCreation.stepTwo.paragraph')}
  691. description={t('datasetCreation.stepTwo.paragraphTip')}
  692. isChosen={parentChildConfig.chunkForContext === 'paragraph'}
  693. onChosen={() => setParentChildConfig(
  694. {
  695. ...parentChildConfig,
  696. chunkForContext: 'paragraph',
  697. },
  698. )}
  699. chosenConfig={
  700. <div className='flex gap-3'>
  701. <DelimiterInput
  702. value={parentChildConfig.parent.delimiter}
  703. tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}
  704. onChange={e => setParentChildConfig({
  705. ...parentChildConfig,
  706. parent: {
  707. ...parentChildConfig.parent,
  708. delimiter: e.target.value ? escape(e.target.value) : '',
  709. },
  710. })}
  711. />
  712. <MaxLengthInput
  713. unit='tokens'
  714. value={parentChildConfig.parent.maxLength}
  715. onChange={value => setParentChildConfig({
  716. ...parentChildConfig,
  717. parent: {
  718. ...parentChildConfig.parent,
  719. maxLength: value,
  720. },
  721. })}
  722. />
  723. </div>
  724. }
  725. />
  726. <RadioCard className='mt-2'
  727. icon={<Image src={FileList} alt='' />}
  728. title={t('datasetCreation.stepTwo.fullDoc')}
  729. description={t('datasetCreation.stepTwo.fullDocTip')}
  730. onChosen={() => setParentChildConfig(
  731. {
  732. ...parentChildConfig,
  733. chunkForContext: 'full-doc',
  734. },
  735. )}
  736. isChosen={parentChildConfig.chunkForContext === 'full-doc'}
  737. />
  738. </div>
  739. <div>
  740. <div className='flex items-center gap-x-2'>
  741. <div className='inline-flex shrink-0'>
  742. <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>
  743. </div>
  744. <Divider className='grow' bgStyle='gradient' />
  745. </div>
  746. <div className='mt-1 flex gap-3'>
  747. <DelimiterInput
  748. value={parentChildConfig.child.delimiter}
  749. tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}
  750. onChange={e => setParentChildConfig({
  751. ...parentChildConfig,
  752. child: {
  753. ...parentChildConfig.child,
  754. delimiter: e.target.value ? escape(e.target.value) : '',
  755. },
  756. })}
  757. />
  758. <MaxLengthInput
  759. unit='tokens'
  760. value={parentChildConfig.child.maxLength}
  761. onChange={value => setParentChildConfig({
  762. ...parentChildConfig,
  763. child: {
  764. ...parentChildConfig.child,
  765. maxLength: value,
  766. },
  767. })}
  768. />
  769. </div>
  770. </div>
  771. <div>
  772. <div className='flex items-center gap-x-2'>
  773. <div className='inline-flex shrink-0'>
  774. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  775. </div>
  776. <Divider className='grow' bgStyle='gradient' />
  777. </div>
  778. <div className='mt-1'>
  779. {rules.map(rule => (
  780. <div key={rule.id} className={s.ruleItem} onClick={() => {
  781. ruleChangeHandle(rule.id)
  782. }}>
  783. <Checkbox
  784. checked={rule.enabled}
  785. />
  786. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  787. </div>
  788. ))}
  789. </div>
  790. </div>
  791. </div>
  792. </OptionCard>}
  793. <Divider className='my-5' />
  794. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.indexMode')}</div>
  795. <div className='flex items-center gap-2'>
  796. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  797. <OptionCard className='flex-1 self-stretch'
  798. title={<div className='flex items-center'>
  799. {t('datasetCreation.stepTwo.qualified')}
  800. <Badge className={cn('ml-1 h-[18px]', (!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>
  801. {t('datasetCreation.stepTwo.recommend')}
  802. </Badge>
  803. <span className='ml-auto'>
  804. {!hasSetIndexType && <span className={cn(s.radio)} />}
  805. </span>
  806. </div>}
  807. description={t('datasetCreation.stepTwo.qualifiedTip')}
  808. icon={<Image src={indexMethodIcon.high_quality} alt='' />}
  809. isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
  810. disabled={hasSetIndexType}
  811. onSwitched={() => {
  812. setIndexType(IndexingType.QUALIFIED)
  813. }}
  814. />
  815. )}
  816. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  817. <>
  818. <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>
  819. <header className='mb-4 pt-6'>
  820. <h2 className='text-lg font-semibold'>
  821. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}
  822. </h2>
  823. <p className='mt-2 text-sm font-normal'>
  824. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}
  825. </p>
  826. </header>
  827. <div className='flex gap-2 pb-6'>
  828. <Button className='ml-auto' onClick={() => {
  829. setIsQAConfirmDialogOpen(false)
  830. }}>
  831. {t('datasetCreation.stepTwo.cancel')}
  832. </Button>
  833. <Button variant={'primary'} onClick={() => {
  834. setIsQAConfirmDialogOpen(false)
  835. setIndexType(IndexingType.QUALIFIED)
  836. setDocForm(ChunkingMode.qa)
  837. }}>
  838. {t('datasetCreation.stepTwo.switch')}
  839. </Button>
  840. </div>
  841. </CustomDialog>
  842. <PortalToFollowElem
  843. open={
  844. isHoveringEconomy && docForm !== ChunkingMode.text
  845. }
  846. placement={'top'}
  847. >
  848. <PortalToFollowElemTrigger asChild>
  849. <OptionCard className='flex-1 self-stretch'
  850. title={t('datasetCreation.stepTwo.economical')}
  851. description={t('datasetCreation.stepTwo.economicalTip')}
  852. icon={<Image src={indexMethodIcon.economical} alt='' />}
  853. isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
  854. disabled={hasSetIndexType || docForm !== ChunkingMode.text}
  855. ref={economyDomRef}
  856. onSwitched={() => {
  857. setIndexType(IndexingType.ECONOMICAL)
  858. }}
  859. />
  860. </PortalToFollowElemTrigger>
  861. <PortalToFollowElemContent>
  862. <div className='rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg'>
  863. {
  864. docForm === ChunkingMode.qa
  865. ? t('datasetCreation.stepTwo.notAvailableForQA')
  866. : t('datasetCreation.stepTwo.notAvailableForParentChild')
  867. }
  868. </div>
  869. </PortalToFollowElemContent>
  870. </PortalToFollowElem>
  871. </>)}
  872. </div>
  873. {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
  874. <div className='mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]'>
  875. <div className='absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40'></div>
  876. <div className='p-1'>
  877. <AlertTriangle className='size-4 text-text-warning-secondary' />
  878. </div>
  879. <span className='system-xs-medium text-text-primary'>{t('datasetCreation.stepTwo.highQualityTip')}</span>
  880. </div>
  881. )}
  882. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  883. <div className='system-xs-medium mt-2'>
  884. {t('datasetCreation.stepTwo.indexSettingTip')}
  885. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  886. </div>
  887. )}
  888. {/* Embedding model */}
  889. {indexType === IndexingType.QUALIFIED && (
  890. <div className='mt-5'>
  891. <div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>{t('datasetSettings.form.embeddingModel')}</div>
  892. <ModelSelector
  893. readonly={isModelAndRetrievalConfigDisabled}
  894. triggerClassName={isModelAndRetrievalConfigDisabled ? 'opacity-50' : ''}
  895. defaultModel={embeddingModel}
  896. modelList={embeddingModelList}
  897. onSelect={(model: DefaultModel) => {
  898. setEmbeddingModel(model)
  899. }}
  900. />
  901. {isModelAndRetrievalConfigDisabled && (
  902. <div className='system-xs-medium mt-2 text-text-tertiary'>
  903. {t('datasetCreation.stepTwo.indexSettingTip')}
  904. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  905. </div>
  906. )}
  907. </div>
  908. )}
  909. <Divider className='my-5' />
  910. {/* Retrieval Method Config */}
  911. <div>
  912. {!isModelAndRetrievalConfigDisabled
  913. ? (
  914. <div className={'mb-1'}>
  915. <div className='system-md-semibold mb-0.5 text-text-secondary'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  916. <div className='body-xs-regular text-text-tertiary'>
  917. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  918. {t('datasetSettings.form.retrievalSetting.longDescription')}
  919. </div>
  920. </div>
  921. )
  922. : (
  923. <div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>
  924. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  925. </div>
  926. )}
  927. <div className=''>
  928. {
  929. getIndexing_technique() === IndexingType.QUALIFIED
  930. ? (
  931. <RetrievalMethodConfig
  932. disabled={isModelAndRetrievalConfigDisabled}
  933. value={retrievalConfig}
  934. onChange={setRetrievalConfig}
  935. />
  936. )
  937. : (
  938. <EconomicalRetrievalMethodConfig
  939. disabled={isModelAndRetrievalConfigDisabled}
  940. value={retrievalConfig}
  941. onChange={setRetrievalConfig}
  942. />
  943. )
  944. }
  945. </div>
  946. </div>
  947. {!isSetting
  948. ? (
  949. <div className='mt-8 flex items-center py-2'>
  950. <Button onClick={() => onStepChange && onStepChange(-1)}>
  951. <RiArrowLeftLine className='mr-1 h-4 w-4' />
  952. {t('datasetCreation.stepTwo.previousStep')}
  953. </Button>
  954. <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  955. </div>
  956. )
  957. : (
  958. <div className='mt-8 flex items-center py-2'>
  959. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  960. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  961. </div>
  962. )}
  963. </div>
  964. <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
  965. <PreviewContainer
  966. header={<PreviewHeader
  967. title={t('datasetCreation.stepTwo.preview')}
  968. >
  969. <div className='flex items-center gap-1'>
  970. {dataSourceType === DataSourceType.FILE
  971. && <PreviewDocumentPicker
  972. files={files as Array<Required<CustomFile>>}
  973. onChange={(selected) => {
  974. currentEstimateMutation.reset()
  975. setPreviewFile(selected)
  976. currentEstimateMutation.mutate()
  977. }}
  978. // In settings mode there is only one file, so always show the first one.
  979. value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}
  980. />
  981. }
  982. {dataSourceType === DataSourceType.NOTION
  983. && <PreviewDocumentPicker
  984. files={
  985. notionPages.map(page => ({
  986. id: page.page_id,
  987. name: page.page_name,
  988. extension: 'md',
  989. }))
  990. }
  991. onChange={(selected) => {
  992. currentEstimateMutation.reset()
  993. const selectedPage = notionPages.find(page => page.page_id === selected.id)
  994. setPreviewNotionPage(selectedPage!)
  995. currentEstimateMutation.mutate()
  996. }}
  997. value={{
  998. id: previewNotionPage?.page_id || '',
  999. name: previewNotionPage?.page_name || '',
  1000. extension: 'md',
  1001. }}
  1002. />
  1003. }
  1004. {dataSourceType === DataSourceType.WEB
  1005. && <PreviewDocumentPicker
  1006. files={
  1007. websitePages.map(page => ({
  1008. id: page.source_url,
  1009. name: page.title,
  1010. extension: 'md',
  1011. }))
  1012. }
  1013. onChange={(selected) => {
  1014. currentEstimateMutation.reset()
  1015. const selectedPage = websitePages.find(page => page.source_url === selected.id)
  1016. setPreviewWebsitePage(selectedPage!)
  1017. currentEstimateMutation.mutate()
  1018. }}
  1019. value={
  1020. {
  1021. id: previewWebsitePage?.source_url || '',
  1022. name: previewWebsitePage?.title || '',
  1023. extension: 'md',
  1024. }
  1025. }
  1026. />
  1027. }
  1028. {
  1029. currentDocForm !== ChunkingMode.qa
  1030. && <Badge text={t('datasetCreation.stepTwo.previewChunkCount', {
  1031. count: estimate?.total_segments || 0,
  1032. }) as string}
  1033. />
  1034. }
  1035. </div>
  1036. </PreviewHeader>}
  1037. className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}
  1038. mainClassName='space-y-6'
  1039. >
  1040. {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
  1041. estimate?.qa_preview.map((item, index) => (
  1042. <ChunkContainer
  1043. key={item.question}
  1044. label={`Chunk-${index + 1}`}
  1045. characterCount={item.question.length + item.answer.length}
  1046. >
  1047. <QAPreview qa={item} />
  1048. </ChunkContainer>
  1049. ))
  1050. )}
  1051. {currentDocForm === ChunkingMode.text && estimate?.preview && (
  1052. estimate?.preview.map((item, index) => (
  1053. <ChunkContainer
  1054. key={item.content}
  1055. label={`Chunk-${index + 1}`}
  1056. characterCount={item.content.length}
  1057. >
  1058. {item.content}
  1059. </ChunkContainer>
  1060. ))
  1061. )}
  1062. {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
  1063. estimate?.preview?.map((item, index) => {
  1064. const indexForLabel = index + 1
  1065. const childChunks = parentChildConfig.chunkForContext === 'full-doc'
  1066. ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
  1067. : item.child_chunks
  1068. return (
  1069. <ChunkContainer
  1070. key={item.content}
  1071. label={`Chunk-${indexForLabel}`}
  1072. characterCount={item.content.length}
  1073. >
  1074. <FormattedText>
  1075. {childChunks.map((child, index) => {
  1076. const indexForLabel = index + 1
  1077. return (
  1078. <PreviewSlice
  1079. key={child}
  1080. label={`C-${indexForLabel}`}
  1081. text={child}
  1082. tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}
  1083. labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'
  1084. dividerClassName='leading-7'
  1085. />
  1086. )
  1087. })}
  1088. </FormattedText>
  1089. </ChunkContainer>
  1090. )
  1091. })
  1092. )}
  1093. {currentEstimateMutation.isIdle && (
  1094. <div className='flex h-full w-full items-center justify-center'>
  1095. <div className='flex flex-col items-center justify-center gap-3'>
  1096. <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />
  1097. <p className='text-sm text-text-tertiary'>
  1098. {t('datasetCreation.stepTwo.previewChunkTip')}
  1099. </p>
  1100. </div>
  1101. </div>
  1102. )}
  1103. {currentEstimateMutation.isPending && (
  1104. <div className='space-y-6'>
  1105. {Array.from({ length: 10 }, (_, i) => (
  1106. <SkeletonContainer key={i}>
  1107. <SkeletonRow>
  1108. <SkeletonRectangle className="w-20" />
  1109. <SkeletonPoint />
  1110. <SkeletonRectangle className="w-24" />
  1111. </SkeletonRow>
  1112. <SkeletonRectangle className="w-full" />
  1113. <SkeletonRectangle className="w-full" />
  1114. <SkeletonRectangle className="w-[422px]" />
  1115. </SkeletonContainer>
  1116. ))}
  1117. </div>
  1118. )}
  1119. </PreviewContainer>
  1120. </FloatRightContainer>
  1121. </div>
  1122. )
  1123. }
  1124. export default StepTwo