You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175
  1. 'use client'
  2. import type { FC, PropsWithChildren } from 'react'
  3. import React, { useCallback, useEffect, useRef, useState } from 'react'
  4. import { useTranslation } from 'react-i18next'
  5. import { useContext } from 'use-context-selector'
  6. import {
  7. RiAlertFill,
  8. RiArrowLeftLine,
  9. RiSearchEyeLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import Image from 'next/image'
  13. import { useHover } from 'ahooks'
  14. import SettingCog from '../assets/setting-gear-mod.svg'
  15. import OrangeEffect from '../assets/option-card-effect-orange.svg'
  16. import FamilyMod from '../assets/family-mod.svg'
  17. import Note from '../assets/note-mod.svg'
  18. import FileList from '../assets/file-list-3-fill.svg'
  19. import { indexMethodIcon } from '../icons'
  20. import { PreviewContainer } from '../../preview/container'
  21. import { ChunkContainer, QAPreview } from '../../chunk'
  22. import { PreviewHeader } from '../../preview/header'
  23. import { FormattedText } from '../../formatted-text/formatted'
  24. import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
  25. import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'
  26. import s from './index.module.css'
  27. import unescape from './unescape'
  28. import escape from './escape'
  29. import { OptionCard } from './option-card'
  30. import LanguageSelect from './language-select'
  31. import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
  32. import cn from '@/utils/classnames'
  33. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  34. import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
  35. import Button from '@/app/components/base/button'
  36. import FloatRightContainer from '@/app/components/base/float-right-container'
  37. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  38. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  39. import type { RetrievalConfig } from '@/types/app'
  40. import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  41. import Toast from '@/app/components/base/toast'
  42. import type { NotionPage } from '@/models/common'
  43. import { DataSourceProvider } from '@/models/common'
  44. import { useDatasetDetailContext } from '@/context/dataset-detail'
  45. import I18n from '@/context/i18n'
  46. import { RETRIEVE_METHOD } from '@/types/app'
  47. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import Checkbox from '@/app/components/base/checkbox'
  54. import RadioCard from '@/app/components/base/radio-card'
  55. import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
  56. import Divider from '@/app/components/base/divider'
  57. import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
  58. import Badge from '@/app/components/base/badge'
  59. import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
  60. import Tooltip from '@/app/components/base/tooltip'
  61. import CustomDialog from '@/app/components/base/dialog'
  62. import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
  63. import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
  64. import { noop } from 'lodash-es'
  /**
   * Section-heading label used inside the chunk-setting option cards.
   * Wraps its children in a `<label>` styled as secondary, semibold small text.
   */
  const TextLabel: FC<PropsWithChildren> = ({ children }) => (
    <label className='system-sm-semibold text-text-secondary'>{children}</label>
  )
  /**
   * Props accepted by the `StepTwo` chunk-settings component.
   */
  type StepTwoProps = {
    /** When true, the component edits an existing document: rules are loaded from `documentDetail` and `onSave` fires after a successful submit. */
    isSetting?: boolean
    /** Existing document; when `datasetId` is also set it seeds the preview target, doc form and doc language. */
    documentDetail?: FullDocumentDetail
    /** Chooses the initial index type when `indexingType` is not given: high quality if set, economy otherwise. */
    isAPIKeySet: boolean
    /** NOTE(review): not destructured in the visible part of `StepTwo` — confirm where it is used. */
    onSetting: () => void
    /** Target dataset; when absent, submitting creates the first document (and thus the dataset). */
    datasetId?: string
    /** Externally fixed index type; takes precedence over the locally selected one. */
    indexingType?: IndexingType
    /** NOTE(review): not destructured in the visible part of `StepTwo` — confirm where it is used. */
    retrievalMethod?: string
    /** Data source type used while on the create page (otherwise the dataset's own type is used). */
    dataSourceType: DataSourceType
    /** Uploaded files to index (file data source). */
    files: CustomFile[]
    /** Selected Notion pages (Notion data source); defaults to an empty list. */
    notionPages?: NotionPage[]
    /** Crawled website pages (web data source); defaults to an empty list. */
    websitePages?: CrawlResultItem[]
    /** Crawl options forwarded to the website indexing estimate. */
    crawlOptions?: CrawlOptions
    /** Website crawl provider; defaults to `DataSourceProvider.fireCrawl`. */
    websiteCrawlProvider?: DataSourceProvider
    /** Website crawl job id; defaults to an empty string. */
    websiteCrawlJobId?: string
    /** Called with `+1` after documents have been created successfully. */
    onStepChange?: (delta: number) => void
    /** Receives the selected index type after a successful creation. */
    updateIndexingTypeCache?: (type: string) => void
    /** Receives the retrieval search method after the first document of a new dataset is created. */
    updateRetrievalMethodCache?: (method: string) => void
    /** Receives the creation response after a successful creation. */
    updateResultCache?: (res: createDocumentResponse) => void
    /** Called after a successful submit when `isSetting` is true. */
    onSave?: () => void
    /** NOTE(review): destructured by `StepTwo`, but its call site is outside the visible part — presumably a cancel handler; confirm. */
    onCancel?: () => void
  }
  /**
   * Index methods selectable for a dataset. The string values are what is sent
   * to the API as `indexing_technique`.
   */
  export enum IndexingType {
    /** High-quality indexing; the create flow requires an embedding model for it. */
    QUALIFIED = 'high_quality',
    /** Economy indexing; selecting parent-child chunking switches away from it. */
    ECONOMICAL = 'economy',
  }
  // Default chunk delimiter in escaped form (a literal backslash + "n", twice),
  // i.e. the text shown in the delimiter input; it is unescaped before being sent.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  // Initial maximum chunk length (tokens) for general-mode chunking.
  const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
  // Initial chunk overlap (tokens) for general-mode chunking.
  const DEFAULT_OVERLAP = 50
  // Upper bound used when the page does not provide a usable limit.
  const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000

  /**
   * Turns the raw limit string into a usable upper bound for the chunk length.
   *
   * @param rawLimit - Raw attribute text; empty when the attribute is absent.
   * @returns The parsed base-10 integer, or the fallback (4000) when the text
   *   is empty, not a number, or not positive. Without this guard a malformed
   *   attribute would produce `NaN`, and every `length > limit` check against
   *   it would silently pass.
   */
  const resolveMaxChunkTokenLength = (rawLimit = '') => {
    const parsedLimit = Number.parseInt(rawLimit, 10)
    if (Number.isNaN(parsedLimit) || parsedLimit <= 0)
      return FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH
    return parsedLimit
  }

  // Hard limit for the chunk length, read from a data attribute on <body>.
  // `globalThis.document` is optional-chained so evaluation without a DOM
  // yields the fallback instead of throwing.
  const MAXIMUM_CHUNK_TOKEN_LENGTH = resolveMaxChunkTokenLength(
    globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') ?? undefined,
  )
  /**
   * Form state for parent-child (hierarchical) chunking.
   */
  type ParentChildConfig = {
    /** How the parent chunk is formed; sent to the API as `parent_mode`. */
    chunkForContext: ParentMode
    /** Parent chunk settings; sent as `segmentation` (delimiter is unescaped first). */
    parent: {
      /** Delimiter in escaped form, as typed into the input. */
      delimiter: string
      /** Maximum parent chunk length; sent as `max_tokens`. */
      maxLength: number
    }
    /** Child chunk settings; sent as `subchunk_segmentation` (delimiter is unescaped first). */
    child: {
      /** Delimiter in escaped form, as typed into the input. */
      delimiter: string
      /** Maximum child chunk length; sent as `max_tokens`. */
      maxLength: number
    }
  }
  /**
   * Parent-child chunking defaults: used as the initial form state and restored
   * when the rules are reset. Delimiters are kept in escaped form.
   */
  const defaultParentChildConfig: ParentChildConfig = {
    chunkForContext: 'paragraph',
    parent: { delimiter: '\\n\\n', maxLength: 500 },
    child: { delimiter: '\\n', maxLength: 200 },
  }
  120. const StepTwo = ({
  121. isSetting,
  122. documentDetail,
  123. isAPIKeySet,
  124. datasetId,
  125. indexingType,
  126. dataSourceType: inCreatePageDataSourceType,
  127. files,
  128. notionPages = [],
  129. websitePages = [],
  130. crawlOptions,
  131. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  132. websiteCrawlJobId = '',
  133. onStepChange,
  134. updateIndexingTypeCache,
  135. updateResultCache,
  136. onSave,
  137. onCancel,
  138. updateRetrievalMethodCache,
  139. }: StepTwoProps) => {
  140. const { t } = useTranslation()
  141. const { locale } = useContext(I18n)
  142. const media = useBreakpoints()
  143. const isMobile = media === MediaType.mobile
  144. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  145. const isInUpload = Boolean(currentDataset)
  146. const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  147. const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  148. const isInInit = !isInUpload && !isSetting
  149. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  150. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  151. const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
  152. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  153. const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  154. doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
  155. }, [])
  156. const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
  157. const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
  158. const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
  159. const [rules, setRules] = useState<PreProcessingRule[]>([])
  160. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  161. const hasSetIndexType = !!indexingType
  162. const [indexType, setIndexType] = useState<IndexingType>(() => {
  163. if (hasSetIndexType)
  164. return indexingType
  165. return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
  166. })
  167. const [previewFile, setPreviewFile] = useState<DocumentItem>(
  168. (datasetId && documentDetail)
  169. ? documentDetail.file
  170. : files[0],
  171. )
  172. const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
  173. (datasetId && documentDetail)
  174. ? documentDetail.notion_page
  175. : notionPages[0],
  176. )
  177. const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
  178. (datasetId && documentDetail)
  179. ? documentDetail.website_page
  180. : websitePages[0],
  181. )
  182. // QA Related
  183. const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
  184. const [docForm, setDocForm] = useState<ChunkingMode>(
  185. (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
  186. )
  187. const handleChangeDocform = (value: ChunkingMode) => {
  188. if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {
  189. setIsQAConfirmDialogOpen(true)
  190. return
  191. }
  192. if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
  193. setIndexType(IndexingType.QUALIFIED)
  194. setDocForm(value)
  195. // eslint-disable-next-line ts/no-use-before-define
  196. currentEstimateMutation.reset()
  197. }
  198. const [docLanguage, setDocLanguage] = useState<string>(
  199. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified'),
  200. )
  201. const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
  202. const getIndexing_technique = () => indexingType || indexType
  203. const currentDocForm = currentDataset?.doc_form || docForm
  204. const getProcessRule = (): ProcessRule => {
  205. if (currentDocForm === ChunkingMode.parentChild) {
  206. return {
  207. rules: {
  208. pre_processing_rules: rules,
  209. segmentation: {
  210. separator: unescape(
  211. parentChildConfig.parent.delimiter,
  212. ),
  213. max_tokens: parentChildConfig.parent.maxLength,
  214. },
  215. parent_mode: parentChildConfig.chunkForContext,
  216. subchunk_segmentation: {
  217. separator: unescape(parentChildConfig.child.delimiter),
  218. max_tokens: parentChildConfig.child.maxLength,
  219. },
  220. },
  221. mode: 'hierarchical',
  222. } as ProcessRule
  223. }
  224. return {
  225. rules: {
  226. pre_processing_rules: rules,
  227. segmentation: {
  228. separator: unescape(segmentIdentifier),
  229. max_tokens: maxChunkLength,
  230. chunk_overlap: overlap,
  231. },
  232. }, // api will check this. It will be removed after api refactored.
  233. mode: segmentationType,
  234. } as ProcessRule
  235. }
  236. const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  237. docForm: currentDocForm,
  238. docLanguage,
  239. dataSourceType: DataSourceType.FILE,
  240. files: previewFile
  241. ? [files.find(file => file.name === previewFile.name)!]
  242. : files,
  243. indexingTechnique: getIndexing_technique() as any,
  244. processRule: getProcessRule(),
  245. dataset_id: datasetId!,
  246. })
  247. const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  248. docForm: currentDocForm,
  249. docLanguage,
  250. dataSourceType: DataSourceType.NOTION,
  251. notionPages: [previewNotionPage],
  252. indexingTechnique: getIndexing_technique() as any,
  253. processRule: getProcessRule(),
  254. dataset_id: datasetId || '',
  255. })
  256. const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  257. docForm: currentDocForm,
  258. docLanguage,
  259. dataSourceType: DataSourceType.WEB,
  260. websitePages: [previewWebsitePage],
  261. crawlOptions,
  262. websiteCrawlProvider,
  263. websiteCrawlJobId,
  264. indexingTechnique: getIndexing_technique() as any,
  265. processRule: getProcessRule(),
  266. dataset_id: datasetId || '',
  267. })
  268. const currentEstimateMutation = dataSourceType === DataSourceType.FILE
  269. ? fileIndexingEstimateQuery
  270. : dataSourceType === DataSourceType.NOTION
  271. ? notionIndexingEstimateQuery
  272. : websiteIndexingEstimateQuery
  273. const fetchEstimate = useCallback(() => {
  274. if (dataSourceType === DataSourceType.FILE)
  275. fileIndexingEstimateQuery.mutate()
  276. if (dataSourceType === DataSourceType.NOTION)
  277. notionIndexingEstimateQuery.mutate()
  278. if (dataSourceType === DataSourceType.WEB)
  279. websiteIndexingEstimateQuery.mutate()
  280. }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
  281. const estimate
  282. = dataSourceType === DataSourceType.FILE
  283. ? fileIndexingEstimateQuery.data
  284. : dataSourceType === DataSourceType.NOTION
  285. ? notionIndexingEstimateQuery.data
  286. : websiteIndexingEstimateQuery.data
  287. const getRuleName = (key: string) => {
  288. if (key === 'remove_extra_spaces')
  289. return t('datasetCreation.stepTwo.removeExtraSpaces')
  290. if (key === 'remove_urls_emails')
  291. return t('datasetCreation.stepTwo.removeUrlEmails')
  292. if (key === 'remove_stopwords')
  293. return t('datasetCreation.stepTwo.removeStopwords')
  294. }
  295. const ruleChangeHandle = (id: string) => {
  296. const newRules = rules.map((rule) => {
  297. if (rule.id === id) {
  298. return {
  299. id: rule.id,
  300. enabled: !rule.enabled,
  301. }
  302. }
  303. return rule
  304. })
  305. setRules(newRules)
  306. }
  307. const resetRules = () => {
  308. if (defaultConfig) {
  309. setSegmentIdentifier(defaultConfig.segmentation.separator)
  310. setMaxChunkLength(defaultConfig.segmentation.max_tokens)
  311. setOverlap(defaultConfig.segmentation.chunk_overlap!)
  312. setRules(defaultConfig.pre_processing_rules)
  313. }
  314. setParentChildConfig(defaultParentChildConfig)
  315. }
  316. const updatePreview = () => {
  317. if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
  318. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
  319. return
  320. }
  321. fetchEstimate()
  322. }
  323. const {
  324. modelList: rerankModelList,
  325. defaultModel: rerankDefaultModel,
  326. currentModel: isRerankDefaultModelValid,
  327. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  328. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  329. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  330. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  331. currentDataset?.embedding_model
  332. ? {
  333. provider: currentDataset.embedding_model_provider,
  334. model: currentDataset.embedding_model,
  335. }
  336. : {
  337. provider: defaultEmbeddingModel?.provider.provider || '',
  338. model: defaultEmbeddingModel?.model || '',
  339. },
  340. )
  341. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  342. search_method: RETRIEVE_METHOD.semantic,
  343. reranking_enable: false,
  344. reranking_model: {
  345. reranking_provider_name: '',
  346. reranking_model_name: '',
  347. },
  348. top_k: 3,
  349. score_threshold_enabled: false,
  350. score_threshold: 0.5,
  351. } as RetrievalConfig)
  352. useEffect(() => {
  353. if (currentDataset?.retrieval_model_dict)
  354. return
  355. setRetrievalConfig({
  356. search_method: RETRIEVE_METHOD.semantic,
  357. reranking_enable: !!isRerankDefaultModelValid,
  358. reranking_model: {
  359. reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
  360. reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
  361. },
  362. top_k: 3,
  363. score_threshold_enabled: false,
  364. score_threshold: 0.5,
  365. })
  366. // eslint-disable-next-line react-hooks/exhaustive-deps
  367. }, [rerankDefaultModel, isRerankDefaultModelValid])
  368. const getCreationParams = () => {
  369. let params
  370. if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
  371. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  372. return
  373. }
  374. if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
  375. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
  376. return
  377. }
  378. if (isSetting) {
  379. params = {
  380. original_document_id: documentDetail?.id,
  381. doc_form: currentDocForm,
  382. doc_language: docLanguage,
  383. process_rule: getProcessRule(),
  384. retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
  385. embedding_model: embeddingModel.model, // Readonly
  386. embedding_model_provider: embeddingModel.provider, // Readonly
  387. indexing_technique: getIndexing_technique(),
  388. } as CreateDocumentReq
  389. }
  390. else { // create
  391. const indexMethod = getIndexing_technique()
  392. if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
  393. Toast.notify({
  394. type: 'error',
  395. message: t('appDebug.datasetConfig.embeddingModelRequired'),
  396. })
  397. return
  398. }
  399. if (
  400. !isReRankModelSelected({
  401. rerankModelList,
  402. retrievalConfig,
  403. indexMethod: indexMethod as string,
  404. })
  405. ) {
  406. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  407. return
  408. }
  409. params = {
  410. data_source: {
  411. type: dataSourceType,
  412. info_list: {
  413. data_source_type: dataSourceType,
  414. },
  415. },
  416. indexing_technique: getIndexing_technique(),
  417. process_rule: getProcessRule(),
  418. doc_form: currentDocForm,
  419. doc_language: docLanguage,
  420. retrieval_model: retrievalConfig,
  421. embedding_model: embeddingModel.model,
  422. embedding_model_provider: embeddingModel.provider,
  423. } as CreateDocumentReq
  424. if (dataSourceType === DataSourceType.FILE) {
  425. params.data_source.info_list.file_info_list = {
  426. file_ids: files.map(file => file.id || '').filter(Boolean),
  427. }
  428. }
  429. if (dataSourceType === DataSourceType.NOTION)
  430. params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  431. if (dataSourceType === DataSourceType.WEB) {
  432. params.data_source.info_list.website_info_list = getWebsiteInfo({
  433. websiteCrawlProvider,
  434. websiteCrawlJobId,
  435. websitePages,
  436. })
  437. }
  438. }
  439. return params
  440. }
  441. const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  442. onSuccess(data) {
  443. const separator = data.rules.segmentation.separator
  444. setSegmentIdentifier(separator)
  445. setMaxChunkLength(data.rules.segmentation.max_tokens)
  446. setOverlap(data.rules.segmentation.chunk_overlap!)
  447. setRules(data.rules.pre_processing_rules)
  448. setDefaultConfig(data.rules)
  449. setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)
  450. },
  451. onError(error) {
  452. Toast.notify({
  453. type: 'error',
  454. message: `${error}`,
  455. })
  456. },
  457. })
  458. const getRulesFromDetail = () => {
  459. if (documentDetail) {
  460. const rules = documentDetail.dataset_process_rule.rules
  461. const separator = rules.segmentation.separator
  462. const max = rules.segmentation.max_tokens
  463. const overlap = rules.segmentation.chunk_overlap
  464. setSegmentIdentifier(separator)
  465. setMaxChunkLength(max)
  466. setOverlap(overlap!)
  467. setRules(rules.pre_processing_rules)
  468. setDefaultConfig(rules)
  469. }
  470. }
  471. const getDefaultMode = () => {
  472. if (documentDetail)
  473. setSegmentationType(documentDetail.dataset_process_rule.mode)
  474. }
  475. const createFirstDocumentMutation = useCreateFirstDocument({
  476. onError(error) {
  477. Toast.notify({
  478. type: 'error',
  479. message: `${error}`,
  480. })
  481. },
  482. })
  483. const createDocumentMutation = useCreateDocument(datasetId!, {
  484. onError(error) {
  485. Toast.notify({
  486. type: 'error',
  487. message: `${error}`,
  488. })
  489. },
  490. })
  491. const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
  492. const createHandle = async () => {
  493. const params = getCreationParams()
  494. if (!params)
  495. return false
  496. if (!datasetId) {
  497. await createFirstDocumentMutation.mutateAsync(
  498. params,
  499. {
  500. onSuccess(data) {
  501. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  502. updateResultCache && updateResultCache(data)
  503. updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
  504. },
  505. },
  506. )
  507. }
  508. else {
  509. await createDocumentMutation.mutateAsync(params, {
  510. onSuccess(data) {
  511. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  512. updateResultCache && updateResultCache(data)
  513. },
  514. })
  515. }
  516. if (mutateDatasetRes)
  517. mutateDatasetRes()
  518. onStepChange && onStepChange(+1)
  519. isSetting && onSave && onSave()
  520. }
  521. useEffect(() => {
  522. // fetch rules
  523. if (!isSetting) {
  524. fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  525. }
  526. else {
  527. getRulesFromDetail()
  528. getDefaultMode()
  529. }
  530. // eslint-disable-next-line react-hooks/exhaustive-deps
  531. }, [])
  532. useEffect(() => {
  533. // get indexing type by props
  534. if (indexingType)
  535. setIndexType(indexingType as IndexingType)
  536. else
  537. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  538. }, [isAPIKeySet, indexingType, datasetId])
  539. const economyDomRef = useRef<HTMLDivElement>(null)
  540. const isHoveringEconomy = useHover(economyDomRef)
  541. const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type
  542. return (
  543. <div className='flex h-full w-full'>
  544. <div className={cn('relative h-full w-1/2 overflow-y-auto py-6', isMobile ? 'px-4' : 'px-12')}>
  545. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.segmentation')}</div>
  546. {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))
  547. || isUploadInEmptyDataset
  548. || isInInit)
  549. && <OptionCard
  550. className='mb-2 bg-background-section'
  551. title={t('datasetCreation.stepTwo.general')}
  552. icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
  553. activeHeaderClassName='bg-dataset-option-card-blue-gradient'
  554. description={t('datasetCreation.stepTwo.generalTip')}
  555. isActive={
  556. [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)
  557. }
  558. onSwitched={() =>
  559. handleChangeDocform(ChunkingMode.text)
  560. }
  561. actions={
  562. <>
  563. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  564. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  565. {t('datasetCreation.stepTwo.previewChunk')}
  566. </Button>
  567. <Button variant={'ghost'} onClick={resetRules}>
  568. {t('datasetCreation.stepTwo.reset')}
  569. </Button>
  570. </>
  571. }
  572. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  573. >
  574. <div className='flex flex-col gap-y-4'>
  575. <div className='flex gap-3'>
  576. <DelimiterInput
  577. value={segmentIdentifier}
  578. onChange={e => setSegmentIdentifier(e.target.value, true)}
  579. />
  580. <MaxLengthInput
  581. unit='tokens'
  582. value={maxChunkLength}
  583. onChange={setMaxChunkLength}
  584. />
  585. <OverlapInput
  586. unit='tokens'
  587. value={overlap}
  588. min={1}
  589. onChange={setOverlap}
  590. />
  591. </div>
  592. <div className='flex w-full flex-col'>
  593. <div className='flex items-center gap-x-2'>
  594. <div className='inline-flex shrink-0'>
  595. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  596. </div>
  597. <Divider className='grow' bgStyle='gradient' />
  598. </div>
  599. <div className='mt-1'>
  600. {rules.map(rule => (
  601. <div key={rule.id} className={s.ruleItem} onClick={() => {
  602. ruleChangeHandle(rule.id)
  603. }}>
  604. <Checkbox
  605. checked={rule.enabled}
  606. />
  607. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  608. </div>
  609. ))}
  610. {IS_CE_EDITION && <>
  611. <Divider type='horizontal' className='my-4 bg-divider-subtle' />
  612. <div className='flex items-center py-0.5'>
  613. <div className='flex items-center' onClick={() => {
  614. if (currentDataset?.doc_form)
  615. return
  616. if (docForm === ChunkingMode.qa)
  617. handleChangeDocform(ChunkingMode.text)
  618. else
  619. handleChangeDocform(ChunkingMode.qa)
  620. }}>
  621. <Checkbox
  622. checked={currentDocForm === ChunkingMode.qa}
  623. disabled={!!currentDataset?.doc_form}
  624. />
  625. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
  626. {t('datasetCreation.stepTwo.useQALanguage')}
  627. </label>
  628. </div>
  629. <LanguageSelect
  630. currentLanguage={docLanguage || locale}
  631. onSelect={setDocLanguage}
  632. disabled={currentDocForm !== ChunkingMode.qa}
  633. />
  634. <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
  635. </div>
  636. {currentDocForm === ChunkingMode.qa && (
  637. <div
  638. style={{
  639. background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
  640. }}
  641. className='mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]'
  642. >
  643. <RiAlertFill className='size-4 text-text-warning-secondary' />
  644. <span className='system-xs-medium text-text-primary'>
  645. {t('datasetCreation.stepTwo.QATip')}
  646. </span>
  647. </div>
  648. )}
  649. </>}
  650. </div>
  651. </div>
  652. </div>
  653. </OptionCard>}
  654. {
  655. (
  656. (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)
  657. || isUploadInEmptyDataset
  658. || isInInit
  659. )
  660. && <OptionCard
  661. title={t('datasetCreation.stepTwo.parentChild')}
  662. icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
  663. effectImg={OrangeEffect.src}
  664. activeHeaderClassName='bg-dataset-option-card-orange-gradient'
  665. description={t('datasetCreation.stepTwo.parentChildTip')}
  666. isActive={currentDocForm === ChunkingMode.parentChild}
  667. onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}
  668. actions={
  669. <>
  670. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  671. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  672. {t('datasetCreation.stepTwo.previewChunk')}
  673. </Button>
  674. <Button variant={'ghost'} onClick={resetRules}>
  675. {t('datasetCreation.stepTwo.reset')}
  676. </Button>
  677. </>
  678. }
  679. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  680. >
  681. <div className='flex flex-col gap-4'>
  682. <div>
  683. <div className='flex items-center gap-x-2'>
  684. <div className='inline-flex shrink-0'>
  685. <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>
  686. </div>
  687. <Divider className='grow' bgStyle='gradient' />
  688. </div>
  689. <RadioCard className='mt-1'
  690. icon={<Image src={Note} alt='' />}
  691. title={t('datasetCreation.stepTwo.paragraph')}
  692. description={t('datasetCreation.stepTwo.paragraphTip')}
  693. isChosen={parentChildConfig.chunkForContext === 'paragraph'}
  694. onChosen={() => setParentChildConfig(
  695. {
  696. ...parentChildConfig,
  697. chunkForContext: 'paragraph',
  698. },
  699. )}
  700. chosenConfig={
  701. <div className='flex gap-3'>
  702. <DelimiterInput
  703. value={parentChildConfig.parent.delimiter}
  704. tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}
  705. onChange={e => setParentChildConfig({
  706. ...parentChildConfig,
  707. parent: {
  708. ...parentChildConfig.parent,
  709. delimiter: e.target.value ? escape(e.target.value) : '',
  710. },
  711. })}
  712. />
  713. <MaxLengthInput
  714. unit='tokens'
  715. value={parentChildConfig.parent.maxLength}
  716. onChange={value => setParentChildConfig({
  717. ...parentChildConfig,
  718. parent: {
  719. ...parentChildConfig.parent,
  720. maxLength: value,
  721. },
  722. })}
  723. />
  724. </div>
  725. }
  726. />
  727. <RadioCard className='mt-2'
  728. icon={<Image src={FileList} alt='' />}
  729. title={t('datasetCreation.stepTwo.fullDoc')}
  730. description={t('datasetCreation.stepTwo.fullDocTip')}
  731. onChosen={() => setParentChildConfig(
  732. {
  733. ...parentChildConfig,
  734. chunkForContext: 'full-doc',
  735. },
  736. )}
  737. isChosen={parentChildConfig.chunkForContext === 'full-doc'}
  738. />
  739. </div>
  740. <div>
  741. <div className='flex items-center gap-x-2'>
  742. <div className='inline-flex shrink-0'>
  743. <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>
  744. </div>
  745. <Divider className='grow' bgStyle='gradient' />
  746. </div>
  747. <div className='mt-1 flex gap-3'>
  748. <DelimiterInput
  749. value={parentChildConfig.child.delimiter}
  750. tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}
  751. onChange={e => setParentChildConfig({
  752. ...parentChildConfig,
  753. child: {
  754. ...parentChildConfig.child,
  755. delimiter: e.target.value ? escape(e.target.value) : '',
  756. },
  757. })}
  758. />
  759. <MaxLengthInput
  760. unit='tokens'
  761. value={parentChildConfig.child.maxLength}
  762. onChange={value => setParentChildConfig({
  763. ...parentChildConfig,
  764. child: {
  765. ...parentChildConfig.child,
  766. maxLength: value,
  767. },
  768. })}
  769. />
  770. </div>
  771. </div>
  772. <div>
  773. <div className='flex items-center gap-x-2'>
  774. <div className='inline-flex shrink-0'>
  775. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  776. </div>
  777. <Divider className='grow' bgStyle='gradient' />
  778. </div>
  779. <div className='mt-1'>
  780. {rules.map(rule => (
  781. <div key={rule.id} className={s.ruleItem} onClick={() => {
  782. ruleChangeHandle(rule.id)
  783. }}>
  784. <Checkbox
  785. checked={rule.enabled}
  786. />
  787. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  788. </div>
  789. ))}
  790. </div>
  791. </div>
  792. </div>
  793. </OptionCard>}
  794. <Divider className='my-5' />
  795. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.indexMode')}</div>
  796. <div className='flex items-center gap-2'>
  797. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  798. <OptionCard className='flex-1 self-stretch'
  799. title={<div className='flex items-center'>
  800. {t('datasetCreation.stepTwo.qualified')}
  801. <Badge className={cn('ml-1 h-[18px]', (!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>
  802. {t('datasetCreation.stepTwo.recommend')}
  803. </Badge>
  804. <span className='ml-auto'>
  805. {!hasSetIndexType && <span className={cn(s.radio)} />}
  806. </span>
  807. </div>}
  808. description={t('datasetCreation.stepTwo.qualifiedTip')}
  809. icon={<Image src={indexMethodIcon.high_quality} alt='' />}
  810. isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
  811. disabled={hasSetIndexType}
  812. onSwitched={() => {
  813. setIndexType(IndexingType.QUALIFIED)
  814. }}
  815. />
  816. )}
  817. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  818. <>
  819. <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>
  820. <header className='mb-4 pt-6'>
  821. <h2 className='text-lg font-semibold'>
  822. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}
  823. </h2>
  824. <p className='mt-2 text-sm font-normal'>
  825. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}
  826. </p>
  827. </header>
  828. <div className='flex gap-2 pb-6'>
  829. <Button className='ml-auto' onClick={() => {
  830. setIsQAConfirmDialogOpen(false)
  831. }}>
  832. {t('datasetCreation.stepTwo.cancel')}
  833. </Button>
  834. <Button variant={'primary'} onClick={() => {
  835. setIsQAConfirmDialogOpen(false)
  836. setIndexType(IndexingType.QUALIFIED)
  837. setDocForm(ChunkingMode.qa)
  838. }}>
  839. {t('datasetCreation.stepTwo.switch')}
  840. </Button>
  841. </div>
  842. </CustomDialog>
  843. <PortalToFollowElem
  844. open={
  845. isHoveringEconomy && docForm !== ChunkingMode.text
  846. }
  847. placement={'top'}
  848. >
  849. <PortalToFollowElemTrigger asChild>
  850. <OptionCard className='flex-1 self-stretch'
  851. title={t('datasetCreation.stepTwo.economical')}
  852. description={t('datasetCreation.stepTwo.economicalTip')}
  853. icon={<Image src={indexMethodIcon.economical} alt='' />}
  854. isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
  855. disabled={hasSetIndexType || docForm !== ChunkingMode.text}
  856. ref={economyDomRef}
  857. onSwitched={() => {
  858. setIndexType(IndexingType.ECONOMICAL)
  859. }}
  860. />
  861. </PortalToFollowElemTrigger>
  862. <PortalToFollowElemContent>
  863. <div className='rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg'>
  864. {
  865. docForm === ChunkingMode.qa
  866. ? t('datasetCreation.stepTwo.notAvailableForQA')
  867. : t('datasetCreation.stepTwo.notAvailableForParentChild')
  868. }
  869. </div>
  870. </PortalToFollowElemContent>
  871. </PortalToFollowElem>
  872. </>)}
  873. </div>
  874. {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
  875. <div className='mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]'>
  876. <div className='absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40'></div>
  877. <div className='p-1'>
  878. <AlertTriangle className='size-4 text-text-warning-secondary' />
  879. </div>
  880. <span className='system-xs-medium text-text-primary'>{t('datasetCreation.stepTwo.highQualityTip')}</span>
  881. </div>
  882. )}
  883. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  884. <div className='system-xs-medium mt-2'>
  885. {t('datasetCreation.stepTwo.indexSettingTip')}
  886. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  887. </div>
  888. )}
  889. {/* Embedding model */}
  890. {indexType === IndexingType.QUALIFIED && (
  891. <div className='mt-5'>
  892. <div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>{t('datasetSettings.form.embeddingModel')}</div>
  893. <ModelSelector
  894. readonly={isModelAndRetrievalConfigDisabled}
  895. triggerClassName={isModelAndRetrievalConfigDisabled ? 'opacity-50' : ''}
  896. defaultModel={embeddingModel}
  897. modelList={embeddingModelList}
  898. onSelect={(model: DefaultModel) => {
  899. setEmbeddingModel(model)
  900. }}
  901. />
  902. {isModelAndRetrievalConfigDisabled && (
  903. <div className='system-xs-medium mt-2 text-text-tertiary'>
  904. {t('datasetCreation.stepTwo.indexSettingTip')}
  905. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  906. </div>
  907. )}
  908. </div>
  909. )}
  910. <Divider className='my-5' />
  911. {/* Retrieval Method Config */}
  912. <div>
  913. {!isModelAndRetrievalConfigDisabled
  914. ? (
  915. <div className={'mb-1'}>
  916. <div className='system-md-semibold mb-0.5 text-text-secondary'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  917. <div className='body-xs-regular text-text-tertiary'>
  918. <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  919. {t('datasetSettings.form.retrievalSetting.longDescription')}
  920. </div>
  921. </div>
  922. )
  923. : (
  924. <div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>
  925. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  926. </div>
  927. )}
  928. <div className=''>
  929. {
  930. getIndexing_technique() === IndexingType.QUALIFIED
  931. ? (
  932. <RetrievalMethodConfig
  933. disabled={isModelAndRetrievalConfigDisabled}
  934. value={retrievalConfig}
  935. onChange={setRetrievalConfig}
  936. />
  937. )
  938. : (
  939. <EconomicalRetrievalMethodConfig
  940. disabled={isModelAndRetrievalConfigDisabled}
  941. value={retrievalConfig}
  942. onChange={setRetrievalConfig}
  943. />
  944. )
  945. }
  946. </div>
  947. </div>
  948. {!isSetting
  949. ? (
  950. <div className='mt-8 flex items-center py-2'>
  951. <Button onClick={() => onStepChange && onStepChange(-1)}>
  952. <RiArrowLeftLine className='mr-1 h-4 w-4' />
  953. {t('datasetCreation.stepTwo.previousStep')}
  954. </Button>
  955. <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  956. </div>
  957. )
  958. : (
  959. <div className='mt-8 flex items-center py-2'>
  960. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  961. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  962. </div>
  963. )}
  964. </div>
  965. <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={noop} footer={null}>
  966. <PreviewContainer
  967. header={<PreviewHeader
  968. title={t('datasetCreation.stepTwo.preview')}
  969. >
  970. <div className='flex items-center gap-1'>
  971. {dataSourceType === DataSourceType.FILE
  972. && <PreviewDocumentPicker
  973. files={files as Array<Required<CustomFile>>}
  974. onChange={(selected) => {
  975. currentEstimateMutation.reset()
  976. setPreviewFile(selected)
  977. currentEstimateMutation.mutate()
  978. }}
  979. // When opened from the settings page there is only one file, so always show files[0]
  980. value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}
  981. />
  982. }
  983. {dataSourceType === DataSourceType.NOTION
  984. && <PreviewDocumentPicker
  985. files={
  986. notionPages.map(page => ({
  987. id: page.page_id,
  988. name: page.page_name,
  989. extension: 'md',
  990. }))
  991. }
  992. onChange={(selected) => {
  993. currentEstimateMutation.reset()
  994. const selectedPage = notionPages.find(page => page.page_id === selected.id)
  995. setPreviewNotionPage(selectedPage!)
  996. currentEstimateMutation.mutate()
  997. }}
  998. value={{
  999. id: previewNotionPage?.page_id || '',
  1000. name: previewNotionPage?.page_name || '',
  1001. extension: 'md',
  1002. }}
  1003. />
  1004. }
  1005. {dataSourceType === DataSourceType.WEB
  1006. && <PreviewDocumentPicker
  1007. files={
  1008. websitePages.map(page => ({
  1009. id: page.source_url,
  1010. name: page.title,
  1011. extension: 'md',
  1012. }))
  1013. }
  1014. onChange={(selected) => {
  1015. currentEstimateMutation.reset()
  1016. const selectedPage = websitePages.find(page => page.source_url === selected.id)
  1017. setPreviewWebsitePage(selectedPage!)
  1018. currentEstimateMutation.mutate()
  1019. }}
  1020. value={
  1021. {
  1022. id: previewWebsitePage?.source_url || '',
  1023. name: previewWebsitePage?.title || '',
  1024. extension: 'md',
  1025. }
  1026. }
  1027. />
  1028. }
  1029. {
  1030. currentDocForm !== ChunkingMode.qa
  1031. && <Badge text={t('datasetCreation.stepTwo.previewChunkCount', {
  1032. count: estimate?.total_segments || 0,
  1033. }) as string}
  1034. />
  1035. }
  1036. </div>
  1037. </PreviewHeader>}
  1038. className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}
  1039. mainClassName='space-y-6'
  1040. >
  1041. {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
  1042. estimate?.qa_preview.map((item, index) => (
  1043. <ChunkContainer
  1044. key={item.question}
  1045. label={`Chunk-${index + 1}`}
  1046. characterCount={item.question.length + item.answer.length}
  1047. >
  1048. <QAPreview qa={item} />
  1049. </ChunkContainer>
  1050. ))
  1051. )}
  1052. {currentDocForm === ChunkingMode.text && estimate?.preview && (
  1053. estimate?.preview.map((item, index) => (
  1054. <ChunkContainer
  1055. key={item.content}
  1056. label={`Chunk-${index + 1}`}
  1057. characterCount={item.content.length}
  1058. >
  1059. {item.content}
  1060. </ChunkContainer>
  1061. ))
  1062. )}
  1063. {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
  1064. estimate?.preview?.map((item, index) => {
  1065. const indexForLabel = index + 1
  1066. const childChunks = parentChildConfig.chunkForContext === 'full-doc'
  1067. ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
  1068. : item.child_chunks
  1069. return (
  1070. <ChunkContainer
  1071. key={item.content}
  1072. label={`Chunk-${indexForLabel}`}
  1073. characterCount={item.content.length}
  1074. >
  1075. <FormattedText>
  1076. {childChunks.map((child, index) => {
  1077. const indexForLabel = index + 1
  1078. return (
  1079. <PreviewSlice
  1080. key={child}
  1081. label={`C-${indexForLabel}`}
  1082. text={child}
  1083. tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}
  1084. labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'
  1085. dividerClassName='leading-7'
  1086. />
  1087. )
  1088. })}
  1089. </FormattedText>
  1090. </ChunkContainer>
  1091. )
  1092. })
  1093. )}
  1094. {currentEstimateMutation.isIdle && (
  1095. <div className='flex h-full w-full items-center justify-center'>
  1096. <div className='flex flex-col items-center justify-center gap-3'>
  1097. <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />
  1098. <p className='text-sm text-text-tertiary'>
  1099. {t('datasetCreation.stepTwo.previewChunkTip')}
  1100. </p>
  1101. </div>
  1102. </div>
  1103. )}
  1104. {currentEstimateMutation.isPending && (
  1105. <div className='space-y-6'>
  1106. {Array.from({ length: 10 }, (_, i) => (
  1107. <SkeletonContainer key={i}>
  1108. <SkeletonRow>
  1109. <SkeletonRectangle className="w-20" />
  1110. <SkeletonPoint />
  1111. <SkeletonRectangle className="w-24" />
  1112. </SkeletonRow>
  1113. <SkeletonRectangle className="w-full" />
  1114. <SkeletonRectangle className="w-full" />
  1115. <SkeletonRectangle className="w-[422px]" />
  1116. </SkeletonContainer>
  1117. ))}
  1118. </div>
  1119. )}
  1120. </PreviewContainer>
  1121. </FloatRightContainer>
  1122. </div>
  1123. )
  1124. }
  1125. export default StepTwo