Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

index.tsx 47KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179
  1. 'use client'
  2. import type { FC, PropsWithChildren } from 'react'
  3. import React, { useCallback, useEffect, useRef, useState } from 'react'
  4. import { useTranslation } from 'react-i18next'
  5. import { useContext } from 'use-context-selector'
  6. import {
  7. RiAlertFill,
  8. RiArrowLeftLine,
  9. RiSearchEyeLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import Image from 'next/image'
  13. import { useHover } from 'ahooks'
  14. import SettingCog from '../assets/setting-gear-mod.svg'
  15. import OrangeEffect from '../assets/option-card-effect-orange.svg'
  16. import FamilyMod from '../assets/family-mod.svg'
  17. import Note from '../assets/note-mod.svg'
  18. import FileList from '../assets/file-list-3-fill.svg'
  19. import { indexMethodIcon } from '../icons'
  20. import { PreviewContainer } from '../../preview/container'
  21. import { ChunkContainer, QAPreview } from '../../chunk'
  22. import { PreviewHeader } from '../../preview/header'
  23. import { FormattedText } from '../../formatted-text/formatted'
  24. import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
  25. import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'
  26. import s from './index.module.css'
  27. import unescape from './unescape'
  28. import escape from './escape'
  29. import { OptionCard } from './option-card'
  30. import LanguageSelect from './language-select'
  31. import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
  32. import cn from '@/utils/classnames'
  33. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  34. import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
  35. import Button from '@/app/components/base/button'
  36. import FloatRightContainer from '@/app/components/base/float-right-container'
  37. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  38. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  39. import type { RetrievalConfig } from '@/types/app'
  40. import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  41. import Toast from '@/app/components/base/toast'
  42. import type { NotionPage } from '@/models/common'
  43. import { DataSourceProvider } from '@/models/common'
  44. import { useDatasetDetailContext } from '@/context/dataset-detail'
  45. import I18n from '@/context/i18n'
  46. import { RETRIEVE_METHOD } from '@/types/app'
  47. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import Checkbox from '@/app/components/base/checkbox'
  54. import RadioCard from '@/app/components/base/radio-card'
  55. import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
  56. import Divider from '@/app/components/base/divider'
  57. import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
  58. import Badge from '@/app/components/base/badge'
  59. import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
  60. import Tooltip from '@/app/components/base/tooltip'
  61. import CustomDialog from '@/app/components/base/dialog'
  62. import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
  63. import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
  64. import { noop } from 'lodash-es'
  65. import { useDocLink } from '@/context/i18n'
  /**
   * Caption label: renders its children inside a <label> styled with the
   * semibold / secondary-text utility classes.
   */
  const TextLabel: FC<PropsWithChildren> = ({ children }) => (
    <label className='system-sm-semibold text-text-secondary'>{children}</label>
  )
  /**
   * Props accepted by the StepTwo component.
   * Members are grouped by concern; all names, types and optionality are unchanged.
   */
  type StepTwoProps = {
    // ---- mode / context ----
    /** When truthy, rules are read from `documentDetail` instead of being fetched, and `onSave` is invoked after submit. */
    isSetting?: boolean
    /** Existing document; together with `datasetId` it seeds the initial preview item, doc form and doc language. */
    documentDetail?: FullDocumentDetail
    /** Target dataset id. When absent, submit goes through the create-first-document mutation. */
    datasetId?: string
    /** Chooses the default index type when `indexingType` is not supplied: high quality if true, economy otherwise. */
    isAPIKeySet: boolean
    /** Preset indexing technique; takes precedence over the locally selected index type. */
    indexingType?: IndexingType
    // NOTE(review): not destructured by StepTwo in the visible part of the file — confirm it is still needed.
    retrievalMethod?: string

    // ---- data source ----
    /** Data source type used when there is no dataset id or the current dataset has no `data_source_type`. */
    dataSourceType: DataSourceType
    /** Selected files; the first one is the initial preview target unless `documentDetail` provides one. */
    files: CustomFile[]
    /** Selected Notion pages (StepTwo defaults this to an empty array). */
    notionPages?: NotionPage[]
    /** Selected crawled pages (StepTwo defaults this to an empty array). */
    websitePages?: CrawlResultItem[]
    /** Forwarded to the website indexing-estimate query. */
    crawlOptions?: CrawlOptions
    /** Crawl provider; StepTwo defaults it to `DataSourceProvider.fireCrawl`. */
    websiteCrawlProvider?: DataSourceProvider
    /** Crawl job id; StepTwo defaults it to an empty string. */
    websiteCrawlJobId?: string

    // ---- callbacks ----
    // NOTE(review): not destructured by StepTwo in the visible part of the file — confirm it is still needed.
    onSetting: () => void
    /** Called with `+1` once document creation has finished. */
    onStepChange?: (delta: number) => void
    /** Receives the selected index type when a creation request succeeds. */
    updateIndexingTypeCache?: (type: string) => void
    /** Receives the retrieval search method; only called on the create-first-document path. */
    updateRetrievalMethodCache?: (method: string) => void
    /** Receives the creation response when a creation request succeeds. */
    updateResultCache?: (res: createDocumentResponse) => void
    /** Called after submit when `isSetting` is truthy. */
    onSave?: () => void
    // NOTE(review): destructured by StepTwo, but its call site is outside the visible part of the file.
    onCancel?: () => void
  }
  /**
   * Indexing technique identifiers.
   * The string values are what StepTwo places in `indexing_technique`
   * when it builds the creation / estimate request parameters.
   */
  export enum IndexingType {
    /** Default selection when an API key is set and no indexing type was passed in. */
    QUALIFIED = 'high_quality',
    /** Default selection when no API key is set and no indexing type was passed in. */
    ECONOMICAL = 'economy',
  }
  // Initial separator shown in the delimiter input. The doubled backslashes keep the
  // value as the literal characters backslash + "n"; it is passed through unescape()
  // before being sent as the segmentation separator.
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  // Initial value of the "max chunk length" input.
  const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
  // Initial value of the "chunk overlap" input.
  const DEFAULT_OVERLAP = 50
  // Limit used when the page does not provide a usable value.
  const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000
  /**
   * Turns the raw `data-public-indexing-max-segmentation-tokens-length` attribute
   * into a usable upper bound for the chunk length.
   *
   * @param rawLimit Raw attribute text; defaults to the attribute on `document.body`
   *   (undefined when there is no document, e.g. outside the browser).
   * @returns The parsed positive integer, or 4000 when the value is missing, empty,
   *   non-numeric, zero or negative. Without this guard a malformed attribute yields
   *   NaN (every `length > limit` check is then false, so the limit is never enforced)
   *   or a non-positive limit (every chunk length is rejected).
   */
  const resolveMaximumChunkTokenLength = (
    rawLimit = globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length'),
  ) => {
    const parsedLimit = Number.parseInt(rawLimit || '', 10)
    return Number.isFinite(parsedLimit) && parsedLimit > 0
      ? parsedLimit
      : FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH
  }
  // Upper bound for the chunk length, resolved once at module load.
  const MAXIMUM_CHUNK_TOKEN_LENGTH = resolveMaximumChunkTokenLength()
  /** Delimiter and maximum length for one level (parent or child) of parent-child chunking. */
  type ParentChildSplitSetting = {
    delimiter: string
    maxLength: number
  }
  /**
   * Form state for the parent-child chunking mode.
   * `chunkForContext` is sent as `parent_mode`; `parent` feeds the main segmentation
   * rule and `child` feeds the sub-chunk segmentation rule.
   */
  type ParentChildConfig = {
    chunkForContext: ParentMode
    parent: ParentChildSplitSetting
    child: ParentChildSplitSetting
  }
  /**
   * Initial parent-child chunking settings, also restored by the reset action.
   * Delimiters are stored in escaped form (literal backslash + "n") and are
   * unescaped when the process rule is built.
   */
  const defaultParentChildConfig: ParentChildConfig = {
    chunkForContext: 'paragraph',
    parent: { delimiter: '\\n\\n', maxLength: 1024 },
    child: { delimiter: '\\n', maxLength: 512 },
  }
  121. const StepTwo = ({
  122. isSetting,
  123. documentDetail,
  124. isAPIKeySet,
  125. datasetId,
  126. indexingType,
  127. dataSourceType: inCreatePageDataSourceType,
  128. files,
  129. notionPages = [],
  130. websitePages = [],
  131. crawlOptions,
  132. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  133. websiteCrawlJobId = '',
  134. onStepChange,
  135. updateIndexingTypeCache,
  136. updateResultCache,
  137. onSave,
  138. onCancel,
  139. updateRetrievalMethodCache,
  140. }: StepTwoProps) => {
  141. const { t } = useTranslation()
  142. const docLink = useDocLink()
  143. const { locale } = useContext(I18n)
  144. const media = useBreakpoints()
  145. const isMobile = media === MediaType.mobile
  146. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  147. const isInUpload = Boolean(currentDataset)
  148. const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  149. const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  150. const isInInit = !isInUpload && !isSetting
  151. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  152. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  153. const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
  154. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  155. const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  156. doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
  157. }, [])
  158. const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
  159. const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
  160. const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
  161. const [rules, setRules] = useState<PreProcessingRule[]>([])
  162. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  163. const hasSetIndexType = !!indexingType
  164. const [indexType, setIndexType] = useState<IndexingType>(() => {
  165. if (hasSetIndexType)
  166. return indexingType
  167. return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
  168. })
  169. const [previewFile, setPreviewFile] = useState<DocumentItem>(
  170. (datasetId && documentDetail)
  171. ? documentDetail.file
  172. : files[0],
  173. )
  174. const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
  175. (datasetId && documentDetail)
  176. ? documentDetail.notion_page
  177. : notionPages[0],
  178. )
  179. const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
  180. (datasetId && documentDetail)
  181. ? documentDetail.website_page
  182. : websitePages[0],
  183. )
  184. // QA Related
  185. const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
  186. const [docForm, setDocForm] = useState<ChunkingMode>(
  187. (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
  188. )
  189. const handleChangeDocform = (value: ChunkingMode) => {
  190. if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {
  191. setIsQAConfirmDialogOpen(true)
  192. return
  193. }
  194. if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
  195. setIndexType(IndexingType.QUALIFIED)
  196. setDocForm(value)
  197. // eslint-disable-next-line ts/no-use-before-define
  198. currentEstimateMutation.reset()
  199. }
  200. const [docLanguage, setDocLanguage] = useState<string>(
  201. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified'),
  202. )
  203. const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
  204. const getIndexing_technique = () => indexingType || indexType
  205. const currentDocForm = currentDataset?.doc_form || docForm
  206. const getProcessRule = (): ProcessRule => {
  207. if (currentDocForm === ChunkingMode.parentChild) {
  208. return {
  209. rules: {
  210. pre_processing_rules: rules,
  211. segmentation: {
  212. separator: unescape(
  213. parentChildConfig.parent.delimiter,
  214. ),
  215. max_tokens: parentChildConfig.parent.maxLength,
  216. },
  217. parent_mode: parentChildConfig.chunkForContext,
  218. subchunk_segmentation: {
  219. separator: unescape(parentChildConfig.child.delimiter),
  220. max_tokens: parentChildConfig.child.maxLength,
  221. },
  222. },
  223. mode: 'hierarchical',
  224. } as ProcessRule
  225. }
  226. return {
  227. rules: {
  228. pre_processing_rules: rules,
  229. segmentation: {
  230. separator: unescape(segmentIdentifier),
  231. max_tokens: maxChunkLength,
  232. chunk_overlap: overlap,
  233. },
  234. }, // api will check this. It will be removed after api refactored.
  235. mode: segmentationType,
  236. } as ProcessRule
  237. }
  238. const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  239. docForm: currentDocForm,
  240. docLanguage,
  241. dataSourceType: DataSourceType.FILE,
  242. files: previewFile
  243. ? [files.find(file => file.name === previewFile.name)!]
  244. : files,
  245. indexingTechnique: getIndexing_technique() as any,
  246. processRule: getProcessRule(),
  247. dataset_id: datasetId!,
  248. })
  249. const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  250. docForm: currentDocForm,
  251. docLanguage,
  252. dataSourceType: DataSourceType.NOTION,
  253. notionPages: [previewNotionPage],
  254. indexingTechnique: getIndexing_technique() as any,
  255. processRule: getProcessRule(),
  256. dataset_id: datasetId || '',
  257. })
  258. const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  259. docForm: currentDocForm,
  260. docLanguage,
  261. dataSourceType: DataSourceType.WEB,
  262. websitePages: [previewWebsitePage],
  263. crawlOptions,
  264. websiteCrawlProvider,
  265. websiteCrawlJobId,
  266. indexingTechnique: getIndexing_technique() as any,
  267. processRule: getProcessRule(),
  268. dataset_id: datasetId || '',
  269. })
  270. const currentEstimateMutation = dataSourceType === DataSourceType.FILE
  271. ? fileIndexingEstimateQuery
  272. : dataSourceType === DataSourceType.NOTION
  273. ? notionIndexingEstimateQuery
  274. : websiteIndexingEstimateQuery
  275. const fetchEstimate = useCallback(() => {
  276. if (dataSourceType === DataSourceType.FILE)
  277. fileIndexingEstimateQuery.mutate()
  278. if (dataSourceType === DataSourceType.NOTION)
  279. notionIndexingEstimateQuery.mutate()
  280. if (dataSourceType === DataSourceType.WEB)
  281. websiteIndexingEstimateQuery.mutate()
  282. }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
  283. const estimate
  284. = dataSourceType === DataSourceType.FILE
  285. ? fileIndexingEstimateQuery.data
  286. : dataSourceType === DataSourceType.NOTION
  287. ? notionIndexingEstimateQuery.data
  288. : websiteIndexingEstimateQuery.data
  289. const getRuleName = (key: string) => {
  290. if (key === 'remove_extra_spaces')
  291. return t('datasetCreation.stepTwo.removeExtraSpaces')
  292. if (key === 'remove_urls_emails')
  293. return t('datasetCreation.stepTwo.removeUrlEmails')
  294. if (key === 'remove_stopwords')
  295. return t('datasetCreation.stepTwo.removeStopwords')
  296. }
  297. const ruleChangeHandle = (id: string) => {
  298. const newRules = rules.map((rule) => {
  299. if (rule.id === id) {
  300. return {
  301. id: rule.id,
  302. enabled: !rule.enabled,
  303. }
  304. }
  305. return rule
  306. })
  307. setRules(newRules)
  308. }
  309. const resetRules = () => {
  310. if (defaultConfig) {
  311. setSegmentIdentifier(defaultConfig.segmentation.separator)
  312. setMaxChunkLength(defaultConfig.segmentation.max_tokens)
  313. setOverlap(defaultConfig.segmentation.chunk_overlap!)
  314. setRules(defaultConfig.pre_processing_rules)
  315. }
  316. setParentChildConfig(defaultParentChildConfig)
  317. }
  318. const updatePreview = () => {
  319. if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
  320. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
  321. return
  322. }
  323. fetchEstimate()
  324. }
  325. const {
  326. modelList: rerankModelList,
  327. defaultModel: rerankDefaultModel,
  328. currentModel: isRerankDefaultModelValid,
  329. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  330. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  331. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  332. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  333. currentDataset?.embedding_model
  334. ? {
  335. provider: currentDataset.embedding_model_provider,
  336. model: currentDataset.embedding_model,
  337. }
  338. : {
  339. provider: defaultEmbeddingModel?.provider.provider || '',
  340. model: defaultEmbeddingModel?.model || '',
  341. },
  342. )
  343. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  344. search_method: RETRIEVE_METHOD.semantic,
  345. reranking_enable: false,
  346. reranking_model: {
  347. reranking_provider_name: '',
  348. reranking_model_name: '',
  349. },
  350. top_k: 3,
  351. score_threshold_enabled: false,
  352. score_threshold: 0.5,
  353. } as RetrievalConfig)
  354. useEffect(() => {
  355. if (currentDataset?.retrieval_model_dict)
  356. return
  357. setRetrievalConfig({
  358. search_method: RETRIEVE_METHOD.semantic,
  359. reranking_enable: !!isRerankDefaultModelValid,
  360. reranking_model: {
  361. reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
  362. reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
  363. },
  364. top_k: 3,
  365. score_threshold_enabled: false,
  366. score_threshold: 0.5,
  367. })
  368. // eslint-disable-next-line react-hooks/exhaustive-deps
  369. }, [rerankDefaultModel, isRerankDefaultModelValid])
  370. const getCreationParams = () => {
  371. let params
  372. if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
  373. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  374. return
  375. }
  376. if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
  377. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
  378. return
  379. }
  380. if (isSetting) {
  381. params = {
  382. original_document_id: documentDetail?.id,
  383. doc_form: currentDocForm,
  384. doc_language: docLanguage,
  385. process_rule: getProcessRule(),
  386. retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
  387. embedding_model: embeddingModel.model, // Readonly
  388. embedding_model_provider: embeddingModel.provider, // Readonly
  389. indexing_technique: getIndexing_technique(),
  390. } as CreateDocumentReq
  391. }
  392. else { // create
  393. const indexMethod = getIndexing_technique()
  394. if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
  395. Toast.notify({
  396. type: 'error',
  397. message: t('appDebug.datasetConfig.embeddingModelRequired'),
  398. })
  399. return
  400. }
  401. if (
  402. !isReRankModelSelected({
  403. rerankModelList,
  404. retrievalConfig,
  405. indexMethod: indexMethod as string,
  406. })
  407. ) {
  408. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  409. return
  410. }
  411. params = {
  412. data_source: {
  413. type: dataSourceType,
  414. info_list: {
  415. data_source_type: dataSourceType,
  416. },
  417. },
  418. indexing_technique: getIndexing_technique(),
  419. process_rule: getProcessRule(),
  420. doc_form: currentDocForm,
  421. doc_language: docLanguage,
  422. retrieval_model: retrievalConfig,
  423. embedding_model: embeddingModel.model,
  424. embedding_model_provider: embeddingModel.provider,
  425. } as CreateDocumentReq
  426. if (dataSourceType === DataSourceType.FILE) {
  427. params.data_source.info_list.file_info_list = {
  428. file_ids: files.map(file => file.id || '').filter(Boolean),
  429. }
  430. }
  431. if (dataSourceType === DataSourceType.NOTION)
  432. params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  433. if (dataSourceType === DataSourceType.WEB) {
  434. params.data_source.info_list.website_info_list = getWebsiteInfo({
  435. websiteCrawlProvider,
  436. websiteCrawlJobId,
  437. websitePages,
  438. })
  439. }
  440. }
  441. return params
  442. }
  443. const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  444. onSuccess(data) {
  445. const separator = data.rules.segmentation.separator
  446. setSegmentIdentifier(separator)
  447. setMaxChunkLength(data.rules.segmentation.max_tokens)
  448. setOverlap(data.rules.segmentation.chunk_overlap!)
  449. setRules(data.rules.pre_processing_rules)
  450. setDefaultConfig(data.rules)
  451. setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)
  452. },
  453. onError(error) {
  454. Toast.notify({
  455. type: 'error',
  456. message: `${error}`,
  457. })
  458. },
  459. })
  460. const getRulesFromDetail = () => {
  461. if (documentDetail) {
  462. const rules = documentDetail.dataset_process_rule.rules
  463. const separator = rules.segmentation.separator
  464. const max = rules.segmentation.max_tokens
  465. const overlap = rules.segmentation.chunk_overlap
  466. setSegmentIdentifier(separator)
  467. setMaxChunkLength(max)
  468. setOverlap(overlap!)
  469. setRules(rules.pre_processing_rules)
  470. setDefaultConfig(rules)
  471. }
  472. }
  473. const getDefaultMode = () => {
  474. if (documentDetail)
  475. setSegmentationType(documentDetail.dataset_process_rule.mode)
  476. }
  477. const createFirstDocumentMutation = useCreateFirstDocument({
  478. onError(error) {
  479. Toast.notify({
  480. type: 'error',
  481. message: `${error}`,
  482. })
  483. },
  484. })
  485. const createDocumentMutation = useCreateDocument(datasetId!, {
  486. onError(error) {
  487. Toast.notify({
  488. type: 'error',
  489. message: `${error}`,
  490. })
  491. },
  492. })
  493. const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
  494. const createHandle = async () => {
  495. const params = getCreationParams()
  496. if (!params)
  497. return false
  498. if (!datasetId) {
  499. await createFirstDocumentMutation.mutateAsync(
  500. params,
  501. {
  502. onSuccess(data) {
  503. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  504. updateResultCache && updateResultCache(data)
  505. updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
  506. },
  507. },
  508. )
  509. }
  510. else {
  511. await createDocumentMutation.mutateAsync(params, {
  512. onSuccess(data) {
  513. updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
  514. updateResultCache && updateResultCache(data)
  515. },
  516. })
  517. }
  518. if (mutateDatasetRes)
  519. mutateDatasetRes()
  520. onStepChange && onStepChange(+1)
  521. isSetting && onSave && onSave()
  522. }
  523. useEffect(() => {
  524. // fetch rules
  525. if (!isSetting) {
  526. fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  527. }
  528. else {
  529. getRulesFromDetail()
  530. getDefaultMode()
  531. }
  532. // eslint-disable-next-line react-hooks/exhaustive-deps
  533. }, [])
  534. useEffect(() => {
  535. // get indexing type by props
  536. if (indexingType)
  537. setIndexType(indexingType as IndexingType)
  538. else
  539. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  540. }, [isAPIKeySet, indexingType, datasetId])
  541. const economyDomRef = useRef<HTMLDivElement>(null)
  542. const isHoveringEconomy = useHover(economyDomRef)
  543. const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type
  544. return (
  545. <div className='flex h-full w-full'>
  546. <div className={cn('relative h-full w-1/2 overflow-y-auto py-6', isMobile ? 'px-4' : 'px-12')}>
  547. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.segmentation')}</div>
  548. {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))
  549. || isUploadInEmptyDataset
  550. || isInInit)
  551. && <OptionCard
  552. className='mb-2 bg-background-section'
  553. title={t('datasetCreation.stepTwo.general')}
  554. icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
  555. activeHeaderClassName='bg-dataset-option-card-blue-gradient'
  556. description={t('datasetCreation.stepTwo.generalTip')}
  557. isActive={
  558. [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)
  559. }
  560. onSwitched={() =>
  561. handleChangeDocform(ChunkingMode.text)
  562. }
  563. actions={
  564. <>
  565. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  566. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  567. {t('datasetCreation.stepTwo.previewChunk')}
  568. </Button>
  569. <Button variant={'ghost'} onClick={resetRules}>
  570. {t('datasetCreation.stepTwo.reset')}
  571. </Button>
  572. </>
  573. }
  574. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  575. >
  576. <div className='flex flex-col gap-y-4'>
  577. <div className='flex gap-3'>
  578. <DelimiterInput
  579. value={segmentIdentifier}
  580. onChange={e => setSegmentIdentifier(e.target.value, true)}
  581. />
  582. <MaxLengthInput
  583. unit='characters'
  584. value={maxChunkLength}
  585. onChange={setMaxChunkLength}
  586. />
  587. <OverlapInput
  588. unit='characters'
  589. value={overlap}
  590. min={1}
  591. onChange={setOverlap}
  592. />
  593. </div>
  594. <div className='flex w-full flex-col'>
  595. <div className='flex items-center gap-x-2'>
  596. <div className='inline-flex shrink-0'>
  597. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  598. </div>
  599. <Divider className='grow' bgStyle='gradient' />
  600. </div>
  601. <div className='mt-1'>
  602. {rules.map(rule => (
  603. <div key={rule.id} className={s.ruleItem} onClick={() => {
  604. ruleChangeHandle(rule.id)
  605. }}>
  606. <Checkbox
  607. checked={rule.enabled}
  608. />
  609. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  610. </div>
  611. ))}
  612. {IS_CE_EDITION && <>
  613. <Divider type='horizontal' className='my-4 bg-divider-subtle' />
  614. <div className='flex items-center py-0.5'>
  615. <div className='flex items-center' onClick={() => {
  616. if (currentDataset?.doc_form)
  617. return
  618. if (docForm === ChunkingMode.qa)
  619. handleChangeDocform(ChunkingMode.text)
  620. else
  621. handleChangeDocform(ChunkingMode.qa)
  622. }}>
  623. <Checkbox
  624. checked={currentDocForm === ChunkingMode.qa}
  625. disabled={!!currentDataset?.doc_form}
  626. />
  627. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
  628. {t('datasetCreation.stepTwo.useQALanguage')}
  629. </label>
  630. </div>
  631. <LanguageSelect
  632. currentLanguage={docLanguage || locale}
  633. onSelect={setDocLanguage}
  634. disabled={currentDocForm !== ChunkingMode.qa}
  635. />
  636. <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
  637. </div>
  638. {currentDocForm === ChunkingMode.qa && (
  639. <div
  640. style={{
  641. background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
  642. }}
  643. className='mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]'
  644. >
  645. <RiAlertFill className='size-4 text-text-warning-secondary' />
  646. <span className='system-xs-medium text-text-primary'>
  647. {t('datasetCreation.stepTwo.QATip')}
  648. </span>
  649. </div>
  650. )}
  651. </>}
  652. </div>
  653. </div>
  654. </div>
  655. </OptionCard>}
  656. {
  657. (
  658. (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)
  659. || isUploadInEmptyDataset
  660. || isInInit
  661. )
  662. && <OptionCard
  663. title={t('datasetCreation.stepTwo.parentChild')}
  664. icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
  665. effectImg={OrangeEffect.src}
  666. activeHeaderClassName='bg-dataset-option-card-orange-gradient'
  667. description={t('datasetCreation.stepTwo.parentChildTip')}
  668. isActive={currentDocForm === ChunkingMode.parentChild}
  669. onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}
  670. actions={
  671. <>
  672. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  673. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  674. {t('datasetCreation.stepTwo.previewChunk')}
  675. </Button>
  676. <Button variant={'ghost'} onClick={resetRules}>
  677. {t('datasetCreation.stepTwo.reset')}
  678. </Button>
  679. </>
  680. }
  681. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  682. >
  683. <div className='flex flex-col gap-4'>
  684. <div>
  685. <div className='flex items-center gap-x-2'>
  686. <div className='inline-flex shrink-0'>
  687. <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>
  688. </div>
  689. <Divider className='grow' bgStyle='gradient' />
  690. </div>
  691. <RadioCard className='mt-1'
  692. icon={<Image src={Note} alt='' />}
  693. title={t('datasetCreation.stepTwo.paragraph')}
  694. description={t('datasetCreation.stepTwo.paragraphTip')}
  695. isChosen={parentChildConfig.chunkForContext === 'paragraph'}
  696. onChosen={() => setParentChildConfig(
  697. {
  698. ...parentChildConfig,
  699. chunkForContext: 'paragraph',
  700. },
  701. )}
  702. chosenConfig={
  703. <div className='flex gap-3'>
  704. <DelimiterInput
  705. value={parentChildConfig.parent.delimiter}
  706. tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}
  707. onChange={e => setParentChildConfig({
  708. ...parentChildConfig,
  709. parent: {
  710. ...parentChildConfig.parent,
  711. delimiter: e.target.value ? escape(e.target.value) : '',
  712. },
  713. })}
  714. />
  715. <MaxLengthInput
  716. unit='characters'
  717. value={parentChildConfig.parent.maxLength}
  718. onChange={value => setParentChildConfig({
  719. ...parentChildConfig,
  720. parent: {
  721. ...parentChildConfig.parent,
  722. maxLength: value,
  723. },
  724. })}
  725. />
  726. </div>
  727. }
  728. />
  729. <RadioCard className='mt-2'
  730. icon={<Image src={FileList} alt='' />}
  731. title={t('datasetCreation.stepTwo.fullDoc')}
  732. description={t('datasetCreation.stepTwo.fullDocTip')}
  733. onChosen={() => setParentChildConfig(
  734. {
  735. ...parentChildConfig,
  736. chunkForContext: 'full-doc',
  737. },
  738. )}
  739. isChosen={parentChildConfig.chunkForContext === 'full-doc'}
  740. />
  741. </div>
  742. <div>
  743. <div className='flex items-center gap-x-2'>
  744. <div className='inline-flex shrink-0'>
  745. <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>
  746. </div>
  747. <Divider className='grow' bgStyle='gradient' />
  748. </div>
  749. <div className='mt-1 flex gap-3'>
  750. <DelimiterInput
  751. value={parentChildConfig.child.delimiter}
  752. tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}
  753. onChange={e => setParentChildConfig({
  754. ...parentChildConfig,
  755. child: {
  756. ...parentChildConfig.child,
  757. delimiter: e.target.value ? escape(e.target.value) : '',
  758. },
  759. })}
  760. />
  761. <MaxLengthInput
  762. unit='characters'
  763. value={parentChildConfig.child.maxLength}
  764. onChange={value => setParentChildConfig({
  765. ...parentChildConfig,
  766. child: {
  767. ...parentChildConfig.child,
  768. maxLength: value,
  769. },
  770. })}
  771. />
  772. </div>
  773. </div>
  774. <div>
  775. <div className='flex items-center gap-x-2'>
  776. <div className='inline-flex shrink-0'>
  777. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  778. </div>
  779. <Divider className='grow' bgStyle='gradient' />
  780. </div>
  781. <div className='mt-1'>
  782. {rules.map(rule => (
  783. <div key={rule.id} className={s.ruleItem} onClick={() => {
  784. ruleChangeHandle(rule.id)
  785. }}>
  786. <Checkbox
  787. checked={rule.enabled}
  788. />
  789. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  790. </div>
  791. ))}
  792. </div>
  793. </div>
  794. </div>
  795. </OptionCard>}
  796. <Divider className='my-5' />
  797. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.indexMode')}</div>
  798. <div className='flex items-center gap-2'>
  799. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  800. <OptionCard className='flex-1 self-stretch'
  801. title={<div className='flex items-center'>
  802. {t('datasetCreation.stepTwo.qualified')}
  803. <Badge className={cn('ml-1 h-[18px]', (!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>
  804. {t('datasetCreation.stepTwo.recommend')}
  805. </Badge>
  806. <span className='ml-auto'>
  807. {!hasSetIndexType && <span className={cn(s.radio)} />}
  808. </span>
  809. </div>}
  810. description={t('datasetCreation.stepTwo.qualifiedTip')}
  811. icon={<Image src={indexMethodIcon.high_quality} alt='' />}
  812. isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
  813. disabled={hasSetIndexType}
  814. onSwitched={() => {
  815. setIndexType(IndexingType.QUALIFIED)
  816. }}
  817. />
  818. )}
  819. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  820. <>
  821. <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>
  822. <header className='mb-4 pt-6'>
  823. <h2 className='text-lg font-semibold text-text-primary'>
  824. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}
  825. </h2>
  826. <p className='mt-2 text-sm font-normal text-text-secondary'>
  827. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}
  828. </p>
  829. </header>
  830. <div className='flex gap-2 pb-6'>
  831. <Button className='ml-auto' onClick={() => {
  832. setIsQAConfirmDialogOpen(false)
  833. }}>
  834. {t('datasetCreation.stepTwo.cancel')}
  835. </Button>
  836. <Button variant={'primary'} onClick={() => {
  837. setIsQAConfirmDialogOpen(false)
  838. setIndexType(IndexingType.QUALIFIED)
  839. setDocForm(ChunkingMode.qa)
  840. }}>
  841. {t('datasetCreation.stepTwo.switch')}
  842. </Button>
  843. </div>
  844. </CustomDialog>
  845. <PortalToFollowElem
  846. open={
  847. isHoveringEconomy && docForm !== ChunkingMode.text
  848. }
  849. placement={'top'}
  850. >
  851. <PortalToFollowElemTrigger asChild>
  852. <OptionCard className='flex-1 self-stretch'
  853. title={t('datasetCreation.stepTwo.economical')}
  854. description={t('datasetCreation.stepTwo.economicalTip')}
  855. icon={<Image src={indexMethodIcon.economical} alt='' />}
  856. isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
  857. disabled={hasSetIndexType || docForm !== ChunkingMode.text}
  858. ref={economyDomRef}
  859. onSwitched={() => {
  860. setIndexType(IndexingType.ECONOMICAL)
  861. }}
  862. />
  863. </PortalToFollowElemTrigger>
  864. <PortalToFollowElemContent>
  865. <div className='rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg'>
  866. {
  867. docForm === ChunkingMode.qa
  868. ? t('datasetCreation.stepTwo.notAvailableForQA')
  869. : t('datasetCreation.stepTwo.notAvailableForParentChild')
  870. }
  871. </div>
  872. </PortalToFollowElemContent>
  873. </PortalToFollowElem>
  874. </>)}
  875. </div>
  876. {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
  877. <div className='mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]'>
  878. <div className='absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40'></div>
  879. <div className='p-1'>
  880. <AlertTriangle className='size-4 text-text-warning-secondary' />
  881. </div>
  882. <span className='system-xs-medium text-text-primary'>{t('datasetCreation.stepTwo.highQualityTip')}</span>
  883. </div>
  884. )}
  885. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  886. <div className='system-xs-medium mt-2 text-text-tertiary'>
  887. {t('datasetCreation.stepTwo.indexSettingTip')}
  888. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  889. </div>
  890. )}
  891. {/* Embedding model */}
  892. {indexType === IndexingType.QUALIFIED && (
  893. <div className='mt-5'>
  894. <div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>{t('datasetSettings.form.embeddingModel')}</div>
  895. <ModelSelector
  896. readonly={isModelAndRetrievalConfigDisabled}
  897. triggerClassName={isModelAndRetrievalConfigDisabled ? 'opacity-50' : ''}
  898. defaultModel={embeddingModel}
  899. modelList={embeddingModelList}
  900. onSelect={(model: DefaultModel) => {
  901. setEmbeddingModel(model)
  902. }}
  903. />
  904. {isModelAndRetrievalConfigDisabled && (
  905. <div className='system-xs-medium mt-2 text-text-tertiary'>
  906. {t('datasetCreation.stepTwo.indexSettingTip')}
  907. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  908. </div>
  909. )}
  910. </div>
  911. )}
  912. <Divider className='my-5' />
  913. {/* Retrieval Method Config */}
  914. <div>
  915. {!isModelAndRetrievalConfigDisabled
  916. ? (
  917. <div className={'mb-1'}>
  918. <div className='system-md-semibold mb-0.5 text-text-secondary'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  919. <div className='body-xs-regular text-text-tertiary'>
  920. <a target='_blank' rel='noopener noreferrer'
  921. href={docLink('/guides/knowledge-base/create-knowledge-and-upload-documents')}
  922. className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  923. {t('datasetSettings.form.retrievalSetting.longDescription')}
  924. </div>
  925. </div>
  926. )
  927. : (
  928. <div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>
  929. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  930. </div>
  931. )}
  932. <div className=''>
  933. {
  934. getIndexing_technique() === IndexingType.QUALIFIED
  935. ? (
  936. <RetrievalMethodConfig
  937. disabled={isModelAndRetrievalConfigDisabled}
  938. value={retrievalConfig}
  939. onChange={setRetrievalConfig}
  940. />
  941. )
  942. : (
  943. <EconomicalRetrievalMethodConfig
  944. disabled={isModelAndRetrievalConfigDisabled}
  945. value={retrievalConfig}
  946. onChange={setRetrievalConfig}
  947. />
  948. )
  949. }
  950. </div>
  951. </div>
  952. {!isSetting
  953. ? (
  954. <div className='mt-8 flex items-center py-2'>
  955. <Button onClick={() => onStepChange && onStepChange(-1)}>
  956. <RiArrowLeftLine className='mr-1 h-4 w-4' />
  957. {t('datasetCreation.stepTwo.previousStep')}
  958. </Button>
  959. <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  960. </div>
  961. )
  962. : (
  963. <div className='mt-8 flex items-center py-2'>
  964. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  965. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  966. </div>
  967. )}
  968. </div>
  969. <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={noop} footer={null}>
  970. <PreviewContainer
  971. header={<PreviewHeader
  972. title={t('datasetCreation.stepTwo.preview')}
  973. >
  974. <div className='flex items-center gap-1'>
  975. {dataSourceType === DataSourceType.FILE
  976. && <PreviewDocumentPicker
  977. files={files as Array<Required<CustomFile>>}
  978. onChange={(selected) => {
  979. currentEstimateMutation.reset()
  980. setPreviewFile(selected)
  981. currentEstimateMutation.mutate()
  982. }}
  983. // when it is from setting, it just has one file
  984. value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}
  985. />
  986. }
  987. {dataSourceType === DataSourceType.NOTION
  988. && <PreviewDocumentPicker
  989. files={
  990. notionPages.map(page => ({
  991. id: page.page_id,
  992. name: page.page_name,
  993. extension: 'md',
  994. }))
  995. }
  996. onChange={(selected) => {
  997. currentEstimateMutation.reset()
  998. const selectedPage = notionPages.find(page => page.page_id === selected.id)
  999. setPreviewNotionPage(selectedPage!)
  1000. currentEstimateMutation.mutate()
  1001. }}
  1002. value={{
  1003. id: previewNotionPage?.page_id || '',
  1004. name: previewNotionPage?.page_name || '',
  1005. extension: 'md',
  1006. }}
  1007. />
  1008. }
  1009. {dataSourceType === DataSourceType.WEB
  1010. && <PreviewDocumentPicker
  1011. files={
  1012. websitePages.map(page => ({
  1013. id: page.source_url,
  1014. name: page.title,
  1015. extension: 'md',
  1016. }))
  1017. }
  1018. onChange={(selected) => {
  1019. currentEstimateMutation.reset()
  1020. const selectedPage = websitePages.find(page => page.source_url === selected.id)
  1021. setPreviewWebsitePage(selectedPage!)
  1022. currentEstimateMutation.mutate()
  1023. }}
  1024. value={
  1025. {
  1026. id: previewWebsitePage?.source_url || '',
  1027. name: previewWebsitePage?.title || '',
  1028. extension: 'md',
  1029. }
  1030. }
  1031. />
  1032. }
  1033. {
  1034. currentDocForm !== ChunkingMode.qa
  1035. && <Badge text={t('datasetCreation.stepTwo.previewChunkCount', {
  1036. count: estimate?.total_segments || 0,
  1037. }) as string}
  1038. />
  1039. }
  1040. </div>
  1041. </PreviewHeader>}
  1042. className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}
  1043. mainClassName='space-y-6'
  1044. >
  1045. {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
  1046. estimate?.qa_preview.map((item, index) => (
  1047. <ChunkContainer
  1048. key={item.question}
  1049. label={`Chunk-${index + 1}`}
  1050. characterCount={item.question.length + item.answer.length}
  1051. >
  1052. <QAPreview qa={item} />
  1053. </ChunkContainer>
  1054. ))
  1055. )}
  1056. {currentDocForm === ChunkingMode.text && estimate?.preview && (
  1057. estimate?.preview.map((item, index) => (
  1058. <ChunkContainer
  1059. key={item.content}
  1060. label={`Chunk-${index + 1}`}
  1061. characterCount={item.content.length}
  1062. >
  1063. {item.content}
  1064. </ChunkContainer>
  1065. ))
  1066. )}
  1067. {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
  1068. estimate?.preview?.map((item, index) => {
  1069. const indexForLabel = index + 1
  1070. const childChunks = parentChildConfig.chunkForContext === 'full-doc'
  1071. ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
  1072. : item.child_chunks
  1073. return (
  1074. <ChunkContainer
  1075. key={item.content}
  1076. label={`Chunk-${indexForLabel}`}
  1077. characterCount={item.content.length}
  1078. >
  1079. <FormattedText>
  1080. {childChunks.map((child, index) => {
  1081. const indexForLabel = index + 1
  1082. return (
  1083. <PreviewSlice
  1084. key={child}
  1085. label={`C-${indexForLabel}`}
  1086. text={child}
  1087. tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}
  1088. labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'
  1089. dividerClassName='leading-7'
  1090. />
  1091. )
  1092. })}
  1093. </FormattedText>
  1094. </ChunkContainer>
  1095. )
  1096. })
  1097. )}
  1098. {currentEstimateMutation.isIdle && (
  1099. <div className='flex h-full w-full items-center justify-center'>
  1100. <div className='flex flex-col items-center justify-center gap-3'>
  1101. <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />
  1102. <p className='text-sm text-text-tertiary'>
  1103. {t('datasetCreation.stepTwo.previewChunkTip')}
  1104. </p>
  1105. </div>
  1106. </div>
  1107. )}
  1108. {currentEstimateMutation.isPending && (
  1109. <div className='space-y-6'>
  1110. {Array.from({ length: 10 }, (_, i) => (
  1111. <SkeletonContainer key={i}>
  1112. <SkeletonRow>
  1113. <SkeletonRectangle className="w-20" />
  1114. <SkeletonPoint />
  1115. <SkeletonRectangle className="w-24" />
  1116. </SkeletonRow>
  1117. <SkeletonRectangle className="w-full" />
  1118. <SkeletonRectangle className="w-full" />
  1119. <SkeletonRectangle className="w-[422px]" />
  1120. </SkeletonContainer>
  1121. ))}
  1122. </div>
  1123. )}
  1124. </PreviewContainer>
  1125. </FloatRightContainer>
  1126. </div>
  1127. )
  1128. }
  // Expose StepTwo as this module's default export.
  1129. export default StepTwo