Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domuzīmes ('-') un var būt līdz 35 simboliem gara.

index.tsx 48KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203
  1. 'use client'
  2. import type { FC, PropsWithChildren } from 'react'
  3. import React, { useCallback, useEffect, useRef, useState } from 'react'
  4. import { useTranslation } from 'react-i18next'
  5. import { useContext } from 'use-context-selector'
  6. import {
  7. RiAlertFill,
  8. RiArrowLeftLine,
  9. RiSearchEyeLine,
  10. } from '@remixicon/react'
  11. import Link from 'next/link'
  12. import Image from 'next/image'
  13. import { useHover } from 'ahooks'
  14. import SettingCog from '../assets/setting-gear-mod.svg'
  15. import OrangeEffect from '../assets/option-card-effect-orange.svg'
  16. import FamilyMod from '../assets/family-mod.svg'
  17. import Note from '../assets/note-mod.svg'
  18. import FileList from '../assets/file-list-3-fill.svg'
  19. import { indexMethodIcon } from '../icons'
  20. import { PreviewContainer } from '../../preview/container'
  21. import { ChunkContainer, QAPreview } from '../../chunk'
  22. import { PreviewHeader } from '../../preview/header'
  23. import { FormattedText } from '../../formatted-text/formatted'
  24. import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
  25. import PreviewDocumentPicker from '../../common/document-picker/preview-document-picker'
  26. import s from './index.module.css'
  27. import unescape from './unescape'
  28. import escape from './escape'
  29. import { OptionCard } from './option-card'
  30. import LanguageSelect from './language-select'
  31. import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
  32. import cn from '@/utils/classnames'
  33. import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
  34. import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
  35. import Button from '@/app/components/base/button'
  36. import FloatRightContainer from '@/app/components/base/float-right-container'
  37. import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
  38. import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
  39. import type { RetrievalConfig } from '@/types/app'
  40. import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
  41. import Toast from '@/app/components/base/toast'
  42. import type { NotionPage } from '@/models/common'
  43. import { DataSourceProvider } from '@/models/common'
  44. import { useDatasetDetailContext } from '@/context/dataset-detail'
  45. import I18n from '@/context/i18n'
  46. import { RETRIEVE_METHOD } from '@/types/app'
  47. import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
  48. import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
  49. import { LanguagesSupported } from '@/i18n-config/language'
  50. import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
  51. import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
  52. import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
  53. import Checkbox from '@/app/components/base/checkbox'
  54. import RadioCard from '@/app/components/base/radio-card'
  55. import { FULL_DOC_PREVIEW_LENGTH, IS_CE_EDITION } from '@/config'
  56. import Divider from '@/app/components/base/divider'
  57. import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/knowledge/use-create-dataset'
  58. import Badge from '@/app/components/base/badge'
  59. import { SkeletonContainer, SkeletonPoint, SkeletonRectangle, SkeletonRow } from '@/app/components/base/skeleton'
  60. import Tooltip from '@/app/components/base/tooltip'
  61. import CustomDialog from '@/app/components/base/dialog'
  62. import { PortalToFollowElem, PortalToFollowElemContent, PortalToFollowElemTrigger } from '@/app/components/base/portal-to-follow-elem'
  63. import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback'
  64. import { noop } from 'lodash-es'
  65. import { useDocLink } from '@/context/i18n'
  /** Semibold caption label; renders its children inside a styled `<label>`. */
  const TextLabel: FC<PropsWithChildren> = ({ children }) => (
    <label className='system-sm-semibold text-text-secondary'>{children}</label>
  )
  /** Props accepted by the `StepTwo` component. */
  type StepTwoProps = {
    // --- Mode / target -------------------------------------------------------
    /** When set, the form edits an existing document and is seeded from `documentDetail`. */
    isSetting?: boolean
    /** Existing document whose process rule, doc form and preview source seed the form. */
    documentDetail?: FullDocumentDetail
    /** Target dataset id; when missing, submitting goes through the "create first document" mutation. */
    datasetId?: string

    // --- Indexing / retrieval ------------------------------------------------
    /** Chooses the initial index type: high quality when true, economy otherwise. */
    isAPIKeySet: boolean
    /** Externally fixed indexing technique; when provided it wins over the local selection. */
    indexingType?: IndexingType
    retrievalMethod?: string

    // --- Data source ---------------------------------------------------------
    /** Data source type used while on the create page. */
    dataSourceType: DataSourceType
    files: CustomFile[]
    notionPages?: NotionPage[]
    websitePages?: CrawlResultItem[]
    crawlOptions?: CrawlOptions
    websiteCrawlProvider?: DataSourceProvider
    websiteCrawlJobId?: string

    // --- Callbacks -----------------------------------------------------------
    onSetting: () => void
    /** Called with `+1` after a successful submit. */
    onStepChange?: (delta: number) => void
    updateIndexingTypeCache?: (type: string) => void
    updateRetrievalMethodCache?: (method: string) => void
    updateResultCache?: (res: createDocumentResponse) => void
    /** Called after a successful submit when `isSetting` is true. */
    onSave?: () => void
    onCancel?: () => void
  }
  /**
   * Indexing technique identifiers.
   * The string values are what gets sent as `indexing_technique` in requests.
   */
  export enum IndexingType {
    /** High-quality indexing; creation requires an embedding model to be selected. */
    QUALIFIED = 'high_quality',
    /** Economy indexing. */
    ECONOMICAL = 'economy',
  }
  /** Default chunk delimiter in its escaped form (it is passed through `unescape` before being sent). */
  const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
  /** Default maximum chunk length for general segmentation. */
  const DEFAULT_MAXIMUM_CHUNK_LENGTH = 1024
  /** Default overlap between chunks for general segmentation. */
  const DEFAULT_OVERLAP = 50
  /** Ceiling used when the page does not provide a usable limit. */
  const FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH = 4000

  /**
   * Parse the configured maximum segmentation length.
   *
   * A missing, empty, non-numeric or non-positive value falls back to
   * `FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH`. Without this guard a malformed
   * attribute would yield `NaN`, and every `length > limit` comparison against it
   * would be false, silently disabling the limit check.
   *
   * @param raw - Raw attribute value (may be omitted).
   * @returns A positive integer limit.
   */
  const parseMaxSegmentationTokens = (raw = '') => {
    const parsed = Number.parseInt(raw, 10)
    return Number.isFinite(parsed) && parsed > 0 ? parsed : FALLBACK_MAXIMUM_CHUNK_TOKEN_LENGTH
  }

  /**
   * Upper bound for the chunk length, read once from the
   * `data-public-indexing-max-segmentation-tokens-length` attribute on `<body>`.
   * `document` is accessed through `globalThis` with optional chaining so the
   * module can also be evaluated where no DOM exists.
   */
  const MAXIMUM_CHUNK_TOKEN_LENGTH = parseMaxSegmentationTokens(
    globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') ?? undefined,
  )
  /** Delimiter and maximum length for one level (parent or child) of chunking. */
  type ChunkLevelConfig = {
    delimiter: string
    maxLength: number
  }

  /** Form state for the parent-child chunking mode. */
  type ParentChildConfig = {
    /** Parent mode, sent as `parent_mode` in the process rule. */
    chunkForContext: ParentMode
    /** Settings sent as the rule's `segmentation`. */
    parent: ChunkLevelConfig
    /** Settings sent as the rule's `subchunk_segmentation`. */
    child: ChunkLevelConfig
  }
  /**
   * Initial parent-child settings; also the value restored by the reset action.
   * Delimiters are stored in escaped form (literal backslash-n sequences).
   */
  const defaultParentChildConfig: ParentChildConfig = {
    chunkForContext: 'paragraph',
    parent: { delimiter: '\\n\\n', maxLength: 1024 },
    child: { delimiter: '\\n', maxLength: 512 },
  }
  121. const StepTwo = ({
  122. isSetting,
  123. documentDetail,
  124. isAPIKeySet,
  125. datasetId,
  126. indexingType,
  127. dataSourceType: inCreatePageDataSourceType,
  128. files,
  129. notionPages = [],
  130. websitePages = [],
  131. crawlOptions,
  132. websiteCrawlProvider = DataSourceProvider.fireCrawl,
  133. websiteCrawlJobId = '',
  134. onStepChange,
  135. updateIndexingTypeCache,
  136. updateResultCache,
  137. onSave,
  138. onCancel,
  139. updateRetrievalMethodCache,
  140. }: StepTwoProps) => {
  141. const { t } = useTranslation()
  142. const docLink = useDocLink()
  143. const { locale } = useContext(I18n)
  144. const media = useBreakpoints()
  145. const isMobile = media === MediaType.mobile
  146. const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
  147. const isInUpload = Boolean(currentDataset)
  148. const isUploadInEmptyDataset = isInUpload && !currentDataset?.doc_form
  149. const isNotUploadInEmptyDataset = !isUploadInEmptyDataset
  150. const isInInit = !isInUpload && !isSetting
  151. const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
  152. const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
  153. const [segmentationType, setSegmentationType] = useState<ProcessMode>(
  154. currentDataset?.doc_form === ChunkingMode.parentChild ? ProcessMode.parentChild : ProcessMode.general,
  155. )
  156. const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
  157. const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  158. doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
  159. }, [])
  160. const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
  161. const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
  162. const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
  163. const [rules, setRules] = useState<PreProcessingRule[]>([])
  164. const [defaultConfig, setDefaultConfig] = useState<Rules>()
  165. const hasSetIndexType = !!indexingType
  166. const [indexType, setIndexType] = useState<IndexingType>(() => {
  167. if (hasSetIndexType)
  168. return indexingType
  169. return isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL
  170. })
  171. const [previewFile, setPreviewFile] = useState<DocumentItem>(
  172. (datasetId && documentDetail)
  173. ? documentDetail.file
  174. : files[0],
  175. )
  176. const [previewNotionPage, setPreviewNotionPage] = useState<NotionPage>(
  177. (datasetId && documentDetail)
  178. ? documentDetail.notion_page
  179. : notionPages[0],
  180. )
  181. const [previewWebsitePage, setPreviewWebsitePage] = useState<CrawlResultItem>(
  182. (datasetId && documentDetail)
  183. ? documentDetail.website_page
  184. : websitePages[0],
  185. )
  186. // QA Related
  187. const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
  188. const [docForm, setDocForm] = useState<ChunkingMode>(
  189. (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
  190. )
  191. const handleChangeDocform = (value: ChunkingMode) => {
  192. if (value === ChunkingMode.qa && indexType === IndexingType.ECONOMICAL) {
  193. setIsQAConfirmDialogOpen(true)
  194. return
  195. }
  196. if (value === ChunkingMode.parentChild && indexType === IndexingType.ECONOMICAL)
  197. setIndexType(IndexingType.QUALIFIED)
  198. setDocForm(value)
  199. if (value === ChunkingMode.parentChild)
  200. setSegmentationType(ProcessMode.parentChild)
  201. else
  202. setSegmentationType(ProcessMode.general)
  203. // eslint-disable-next-line ts/no-use-before-define
  204. currentEstimateMutation.reset()
  205. }
  206. const [docLanguage, setDocLanguage] = useState<string>(
  207. (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese Simplified'),
  208. )
  209. const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
  210. const getIndexing_technique = () => indexingType || indexType
  211. const currentDocForm = currentDataset?.doc_form || docForm
  212. const getProcessRule = (): ProcessRule => {
  213. if (currentDocForm === ChunkingMode.parentChild) {
  214. return {
  215. rules: {
  216. pre_processing_rules: rules,
  217. segmentation: {
  218. separator: unescape(
  219. parentChildConfig.parent.delimiter,
  220. ),
  221. max_tokens: parentChildConfig.parent.maxLength,
  222. },
  223. parent_mode: parentChildConfig.chunkForContext,
  224. subchunk_segmentation: {
  225. separator: unescape(parentChildConfig.child.delimiter),
  226. max_tokens: parentChildConfig.child.maxLength,
  227. },
  228. },
  229. mode: 'hierarchical',
  230. } as ProcessRule
  231. }
  232. return {
  233. rules: {
  234. pre_processing_rules: rules,
  235. segmentation: {
  236. separator: unescape(segmentIdentifier),
  237. max_tokens: maxChunkLength,
  238. chunk_overlap: overlap,
  239. },
  240. }, // api will check this. It will be removed after api refactored.
  241. mode: segmentationType,
  242. } as ProcessRule
  243. }
  244. const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  245. docForm: currentDocForm,
  246. docLanguage,
  247. dataSourceType: DataSourceType.FILE,
  248. files: previewFile
  249. ? [files.find(file => file.name === previewFile.name)!]
  250. : files,
  251. indexingTechnique: getIndexing_technique() as any,
  252. processRule: getProcessRule(),
  253. dataset_id: datasetId!,
  254. })
  255. const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  256. docForm: currentDocForm,
  257. docLanguage,
  258. dataSourceType: DataSourceType.NOTION,
  259. notionPages: [previewNotionPage],
  260. indexingTechnique: getIndexing_technique() as any,
  261. processRule: getProcessRule(),
  262. dataset_id: datasetId || '',
  263. })
  264. const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  265. docForm: currentDocForm,
  266. docLanguage,
  267. dataSourceType: DataSourceType.WEB,
  268. websitePages: [previewWebsitePage],
  269. crawlOptions,
  270. websiteCrawlProvider,
  271. websiteCrawlJobId,
  272. indexingTechnique: getIndexing_technique() as any,
  273. processRule: getProcessRule(),
  274. dataset_id: datasetId || '',
  275. })
  276. const currentEstimateMutation = dataSourceType === DataSourceType.FILE
  277. ? fileIndexingEstimateQuery
  278. : dataSourceType === DataSourceType.NOTION
  279. ? notionIndexingEstimateQuery
  280. : websiteIndexingEstimateQuery
  281. const fetchEstimate = useCallback(() => {
  282. if (dataSourceType === DataSourceType.FILE)
  283. fileIndexingEstimateQuery.mutate()
  284. if (dataSourceType === DataSourceType.NOTION)
  285. notionIndexingEstimateQuery.mutate()
  286. if (dataSourceType === DataSourceType.WEB)
  287. websiteIndexingEstimateQuery.mutate()
  288. }, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
  289. const estimate
  290. = dataSourceType === DataSourceType.FILE
  291. ? fileIndexingEstimateQuery.data
  292. : dataSourceType === DataSourceType.NOTION
  293. ? notionIndexingEstimateQuery.data
  294. : websiteIndexingEstimateQuery.data
  // Map a pre-processing rule id to its translated label.
  // Ids other than the three known ones yield `undefined`.
  const getRuleName = (key: string) => {
    switch (key) {
      case 'remove_extra_spaces':
        return t('datasetCreation.stepTwo.removeExtraSpaces')
      case 'remove_urls_emails':
        return t('datasetCreation.stepTwo.removeUrlEmails')
      case 'remove_stopwords':
        return t('datasetCreation.stepTwo.removeStopwords')
      default:
        return undefined
    }
  }
  // Toggle the `enabled` flag of the rule with the given id.
  // The toggled entry is rebuilt from `id` and `enabled` only; all other
  // entries are passed through untouched.
  const ruleChangeHandle = (id: string) => {
    const toggled = rules.map(rule => (
      rule.id === id
        ? { id: rule.id, enabled: !rule.enabled }
        : rule
    ))
    setRules(toggled)
  }
  // Restore the chunking form to its defaults.
  // General-mode fields are reset only once a default config has been stored;
  // the parent-child config is always reset.
  const resetRules = () => {
    if (defaultConfig) {
      const { segmentation, pre_processing_rules: defaultRules } = defaultConfig
      setSegmentIdentifier(segmentation.separator)
      setMaxChunkLength(segmentation.max_tokens)
      // `chunk_overlap` is optional on the rule; fall back to the module default
      // instead of pushing `undefined` into the overlap state.
      setOverlap(segmentation.chunk_overlap ?? DEFAULT_OVERLAP)
      setRules(defaultRules)
    }
    setParentChildConfig(defaultParentChildConfig)
  }
  // Validate the chunk length and, if acceptable, request a fresh preview estimate.
  // The check uses `limitMaxChunkLength` (seeded from MAXIMUM_CHUNK_TOKEN_LENGTH and
  // replaced by the limit returned with the default process rule), so the preview
  // enforces the same ceiling as `getCreationParams` does on submit.
  const updatePreview = () => {
    const exceedsLimit = segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength
    if (exceedsLimit) {
      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
      return
    }
    fetchEstimate()
  }
  331. const {
  332. modelList: rerankModelList,
  333. defaultModel: rerankDefaultModel,
  334. currentModel: isRerankDefaultModelValid,
  335. } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
  336. const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
  337. const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
  338. const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  339. currentDataset?.embedding_model
  340. ? {
  341. provider: currentDataset.embedding_model_provider,
  342. model: currentDataset.embedding_model,
  343. }
  344. : {
  345. provider: defaultEmbeddingModel?.provider.provider || '',
  346. model: defaultEmbeddingModel?.model || '',
  347. },
  348. )
  349. const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
  350. search_method: RETRIEVE_METHOD.semantic,
  351. reranking_enable: false,
  352. reranking_model: {
  353. reranking_provider_name: '',
  354. reranking_model_name: '',
  355. },
  356. top_k: 3,
  357. score_threshold_enabled: false,
  358. score_threshold: 0.5,
  359. } as RetrievalConfig)
  360. useEffect(() => {
  361. if (currentDataset?.retrieval_model_dict)
  362. return
  363. setRetrievalConfig({
  364. search_method: RETRIEVE_METHOD.semantic,
  365. reranking_enable: !!isRerankDefaultModelValid,
  366. reranking_model: {
  367. reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
  368. reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
  369. },
  370. top_k: 3,
  371. score_threshold_enabled: false,
  372. score_threshold: 0.5,
  373. })
  374. }, [rerankDefaultModel, isRerankDefaultModelValid])
  375. const getCreationParams = () => {
  376. let params
  377. if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
  378. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
  379. return
  380. }
  381. if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
  382. Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
  383. return
  384. }
  385. if (isSetting) {
  386. params = {
  387. original_document_id: documentDetail?.id,
  388. doc_form: currentDocForm,
  389. doc_language: docLanguage,
  390. process_rule: getProcessRule(),
  391. retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
  392. embedding_model: embeddingModel.model, // Readonly
  393. embedding_model_provider: embeddingModel.provider, // Readonly
  394. indexing_technique: getIndexing_technique(),
  395. } as CreateDocumentReq
  396. }
  397. else { // create
  398. const indexMethod = getIndexing_technique()
  399. if (indexMethod === IndexingType.QUALIFIED && (!embeddingModel.model || !embeddingModel.provider)) {
  400. Toast.notify({
  401. type: 'error',
  402. message: t('appDebug.datasetConfig.embeddingModelRequired'),
  403. })
  404. return
  405. }
  406. if (
  407. !isReRankModelSelected({
  408. rerankModelList,
  409. retrievalConfig,
  410. indexMethod: indexMethod as string,
  411. })
  412. ) {
  413. Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
  414. return
  415. }
  416. params = {
  417. data_source: {
  418. type: dataSourceType,
  419. info_list: {
  420. data_source_type: dataSourceType,
  421. },
  422. },
  423. indexing_technique: getIndexing_technique(),
  424. process_rule: getProcessRule(),
  425. doc_form: currentDocForm,
  426. doc_language: docLanguage,
  427. retrieval_model: retrievalConfig,
  428. embedding_model: embeddingModel.model,
  429. embedding_model_provider: embeddingModel.provider,
  430. } as CreateDocumentReq
  431. if (dataSourceType === DataSourceType.FILE) {
  432. params.data_source.info_list.file_info_list = {
  433. file_ids: files.map(file => file.id || '').filter(Boolean),
  434. }
  435. }
  436. if (dataSourceType === DataSourceType.NOTION)
  437. params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  438. if (dataSourceType === DataSourceType.WEB) {
  439. params.data_source.info_list.website_info_list = getWebsiteInfo({
  440. websiteCrawlProvider,
  441. websiteCrawlJobId,
  442. websitePages,
  443. })
  444. }
  445. }
  446. return params
  447. }
  448. const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  449. onSuccess(data) {
  450. const separator = data.rules.segmentation.separator
  451. setSegmentIdentifier(separator)
  452. setMaxChunkLength(data.rules.segmentation.max_tokens)
  453. setOverlap(data.rules.segmentation.chunk_overlap!)
  454. setRules(data.rules.pre_processing_rules)
  455. setDefaultConfig(data.rules)
  456. setLimitMaxChunkLength(data.limits.indexing_max_segmentation_tokens_length)
  457. },
  458. onError(error) {
  459. Toast.notify({
  460. type: 'error',
  461. message: `${error}`,
  462. })
  463. },
  464. })
  // Seed the form state from the process rule stored on `documentDetail`.
  // Does nothing when no document detail was provided.
  const getRulesFromDetail = () => {
    if (!documentDetail)
      return

    // Named `detailRules` so it does not shadow the `rules` state of the component.
    const detailRules = documentDetail.dataset_process_rule.rules
    const { segmentation, subchunk_segmentation: subchunkSegmentation } = detailRules

    setSegmentIdentifier(segmentation.separator)
    setMaxChunkLength(segmentation.max_tokens)
    // `chunk_overlap` is optional; never push `undefined` into the overlap state.
    setOverlap(segmentation.chunk_overlap ?? DEFAULT_OVERLAP)
    setRules(detailRules.pre_processing_rules)
    setDefaultConfig(detailRules)

    const isHierarchicalDocument = documentDetail.doc_form === ChunkingMode.parentChild
      || Boolean(detailRules.parent_mode && subchunkSegmentation)
    if (!isHierarchicalDocument)
      return

    setParentChildConfig({
      chunkForContext: detailRules.parent_mode || 'paragraph',
      parent: {
        delimiter: escape(segmentation.separator),
        maxLength: segmentation.max_tokens,
      },
      // A document can be flagged parent-child via `doc_form` alone, so the child
      // rule is not guaranteed to be present; fall back to the defaults in that case.
      child: {
        delimiter: subchunkSegmentation
          ? escape(subchunkSegmentation.separator)
          : defaultParentChildConfig.child.delimiter,
        maxLength: subchunkSegmentation?.max_tokens ?? defaultParentChildConfig.child.maxLength,
      },
    })
  }
  493. const getDefaultMode = () => {
  494. if (documentDetail)
  495. setSegmentationType(documentDetail.dataset_process_rule.mode)
  496. }
  497. const createFirstDocumentMutation = useCreateFirstDocument({
  498. onError(error) {
  499. Toast.notify({
  500. type: 'error',
  501. message: `${error}`,
  502. })
  503. },
  504. })
  505. const createDocumentMutation = useCreateDocument(datasetId!, {
  506. onError(error) {
  507. Toast.notify({
  508. type: 'error',
  509. message: `${error}`,
  510. })
  511. },
  512. })
  513. const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
  // Submit the form: create the first document of a new dataset, or add/update a
  // document in an existing one.
  // Resolves to `false` when validation fails or the request is rejected, and to
  // `undefined` after a successful submit.
  const createHandle = async () => {
    const params = getCreationParams()
    if (!params)
      return false

    // Both mutations report success the same way, so the callbacks are shared.
    const mutationOptions = {
      onSuccess(data: createDocumentResponse) {
        updateIndexingTypeCache?.(indexType as string)
        updateResultCache?.(data)
        updateRetrievalMethodCache?.(retrievalConfig.search_method as string)
      },
    }

    try {
      if (!datasetId)
        await createFirstDocumentMutation.mutateAsync(params, mutationOptions)
      else
        await createDocumentMutation.mutateAsync(params, mutationOptions)
    }
    catch {
      // `mutateAsync` rejects on failure. Both mutations are configured with an
      // `onError` that shows a toast, so the error is already reported; stop here
      // instead of letting the rejection escape and advancing the step.
      return false
    }

    mutateDatasetRes?.()
    onStepChange?.(+1)
    if (isSetting)
      onSave?.()
  }
  544. useEffect(() => {
  545. // fetch rules
  546. if (!isSetting) {
  547. fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  548. }
  549. else {
  550. getRulesFromDetail()
  551. getDefaultMode()
  552. }
  553. }, [])
  554. useEffect(() => {
  555. // get indexing type by props
  556. if (indexingType)
  557. setIndexType(indexingType as IndexingType)
  558. else
  559. setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
  560. }, [isAPIKeySet, indexingType, datasetId])
  561. const economyDomRef = useRef<HTMLDivElement>(null)
  562. const isHoveringEconomy = useHover(economyDomRef)
  563. const isModelAndRetrievalConfigDisabled = !!datasetId && !!currentDataset?.data_source_type
  564. return (
  565. <div className='flex h-full w-full'>
  566. <div className={cn('relative h-full w-1/2 overflow-y-auto py-6', isMobile ? 'px-4' : 'px-12')}>
  567. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.segmentation')}</div>
  568. {((isInUpload && [ChunkingMode.text, ChunkingMode.qa].includes(currentDataset!.doc_form))
  569. || isUploadInEmptyDataset
  570. || isInInit)
  571. && <OptionCard
  572. className='mb-2 bg-background-section'
  573. title={t('datasetCreation.stepTwo.general')}
  574. icon={<Image width={20} height={20} src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
  575. activeHeaderClassName='bg-dataset-option-card-blue-gradient'
  576. description={t('datasetCreation.stepTwo.generalTip')}
  577. isActive={
  578. [ChunkingMode.text, ChunkingMode.qa].includes(currentDocForm)
  579. }
  580. onSwitched={() =>
  581. handleChangeDocform(ChunkingMode.text)
  582. }
  583. actions={
  584. <>
  585. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  586. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  587. {t('datasetCreation.stepTwo.previewChunk')}
  588. </Button>
  589. <Button variant={'ghost'} onClick={resetRules}>
  590. {t('datasetCreation.stepTwo.reset')}
  591. </Button>
  592. </>
  593. }
  594. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  595. >
  596. <div className='flex flex-col gap-y-4'>
  597. <div className='flex gap-3'>
  598. <DelimiterInput
  599. value={segmentIdentifier}
  600. onChange={e => setSegmentIdentifier(e.target.value, true)}
  601. />
  602. <MaxLengthInput
  603. unit='characters'
  604. value={maxChunkLength}
  605. onChange={setMaxChunkLength}
  606. />
  607. <OverlapInput
  608. unit='characters'
  609. value={overlap}
  610. min={1}
  611. onChange={setOverlap}
  612. />
  613. </div>
  614. <div className='flex w-full flex-col'>
  615. <div className='flex items-center gap-x-2'>
  616. <div className='inline-flex shrink-0'>
  617. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  618. </div>
  619. <Divider className='grow' bgStyle='gradient' />
  620. </div>
  621. <div className='mt-1'>
  622. {rules.map(rule => (
  623. <div key={rule.id} className={s.ruleItem} onClick={() => {
  624. ruleChangeHandle(rule.id)
  625. }}>
  626. <Checkbox
  627. checked={rule.enabled}
  628. />
  629. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  630. </div>
  631. ))}
  632. {IS_CE_EDITION && <>
  633. <Divider type='horizontal' className='my-4 bg-divider-subtle' />
  634. <div className='flex items-center py-0.5'>
  635. <div className='flex items-center' onClick={() => {
  636. if (currentDataset?.doc_form)
  637. return
  638. if (docForm === ChunkingMode.qa)
  639. handleChangeDocform(ChunkingMode.text)
  640. else
  641. handleChangeDocform(ChunkingMode.qa)
  642. }}>
  643. <Checkbox
  644. checked={currentDocForm === ChunkingMode.qa}
  645. disabled={!!currentDataset?.doc_form}
  646. />
  647. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">
  648. {t('datasetCreation.stepTwo.useQALanguage')}
  649. </label>
  650. </div>
  651. <LanguageSelect
  652. currentLanguage={docLanguage || locale}
  653. onSelect={setDocLanguage}
  654. disabled={currentDocForm !== ChunkingMode.qa}
  655. />
  656. <Tooltip popupContent={t('datasetCreation.stepTwo.QATip')} />
  657. </div>
  658. {currentDocForm === ChunkingMode.qa && (
  659. <div
  660. style={{
  661. background: 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)',
  662. }}
  663. className='mt-2 flex h-10 items-center gap-2 rounded-xl border border-components-panel-border px-3 text-xs shadow-xs backdrop-blur-[5px]'
  664. >
  665. <RiAlertFill className='size-4 text-text-warning-secondary' />
  666. <span className='system-xs-medium text-text-primary'>
  667. {t('datasetCreation.stepTwo.QATip')}
  668. </span>
  669. </div>
  670. )}
  671. </>}
  672. </div>
  673. </div>
  674. </div>
  675. </OptionCard>}
  676. {
  677. (
  678. (isInUpload && currentDataset!.doc_form === ChunkingMode.parentChild)
  679. || isUploadInEmptyDataset
  680. || isInInit
  681. )
  682. && <OptionCard
  683. title={t('datasetCreation.stepTwo.parentChild')}
  684. icon={<Image width={20} height={20} src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
  685. effectImg={OrangeEffect.src}
  686. activeHeaderClassName='bg-dataset-option-card-orange-gradient'
  687. description={t('datasetCreation.stepTwo.parentChildTip')}
  688. isActive={currentDocForm === ChunkingMode.parentChild}
  689. onSwitched={() => handleChangeDocform(ChunkingMode.parentChild)}
  690. actions={
  691. <>
  692. <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
  693. <RiSearchEyeLine className='mr-0.5 h-4 w-4' />
  694. {t('datasetCreation.stepTwo.previewChunk')}
  695. </Button>
  696. <Button variant={'ghost'} onClick={resetRules}>
  697. {t('datasetCreation.stepTwo.reset')}
  698. </Button>
  699. </>
  700. }
  701. noHighlight={isInUpload && isNotUploadInEmptyDataset}
  702. >
  703. <div className='flex flex-col gap-4'>
  704. <div>
  705. <div className='flex items-center gap-x-2'>
  706. <div className='inline-flex shrink-0'>
  707. <TextLabel>{t('datasetCreation.stepTwo.parentChunkForContext')}</TextLabel>
  708. </div>
  709. <Divider className='grow' bgStyle='gradient' />
  710. </div>
  711. <RadioCard className='mt-1'
  712. icon={<Image src={Note} alt='' />}
  713. title={t('datasetCreation.stepTwo.paragraph')}
  714. description={t('datasetCreation.stepTwo.paragraphTip')}
  715. isChosen={parentChildConfig.chunkForContext === 'paragraph'}
  716. onChosen={() => setParentChildConfig(
  717. {
  718. ...parentChildConfig,
  719. chunkForContext: 'paragraph',
  720. },
  721. )}
  722. chosenConfig={
  723. <div className='flex gap-3'>
  724. <DelimiterInput
  725. value={parentChildConfig.parent.delimiter}
  726. tooltip={t('datasetCreation.stepTwo.parentChildDelimiterTip')!}
  727. onChange={e => setParentChildConfig({
  728. ...parentChildConfig,
  729. parent: {
  730. ...parentChildConfig.parent,
  731. delimiter: e.target.value ? escape(e.target.value) : '',
  732. },
  733. })}
  734. />
  735. <MaxLengthInput
  736. unit='characters'
  737. value={parentChildConfig.parent.maxLength}
  738. onChange={value => setParentChildConfig({
  739. ...parentChildConfig,
  740. parent: {
  741. ...parentChildConfig.parent,
  742. maxLength: value,
  743. },
  744. })}
  745. />
  746. </div>
  747. }
  748. />
  749. <RadioCard className='mt-2'
  750. icon={<Image src={FileList} alt='' />}
  751. title={t('datasetCreation.stepTwo.fullDoc')}
  752. description={t('datasetCreation.stepTwo.fullDocTip')}
  753. onChosen={() => setParentChildConfig(
  754. {
  755. ...parentChildConfig,
  756. chunkForContext: 'full-doc',
  757. },
  758. )}
  759. isChosen={parentChildConfig.chunkForContext === 'full-doc'}
  760. />
  761. </div>
  762. <div>
  763. <div className='flex items-center gap-x-2'>
  764. <div className='inline-flex shrink-0'>
  765. <TextLabel>{t('datasetCreation.stepTwo.childChunkForRetrieval')}</TextLabel>
  766. </div>
  767. <Divider className='grow' bgStyle='gradient' />
  768. </div>
  769. <div className='mt-1 flex gap-3'>
  770. <DelimiterInput
  771. value={parentChildConfig.child.delimiter}
  772. tooltip={t('datasetCreation.stepTwo.parentChildChunkDelimiterTip')!}
  773. onChange={e => setParentChildConfig({
  774. ...parentChildConfig,
  775. child: {
  776. ...parentChildConfig.child,
  777. delimiter: e.target.value ? escape(e.target.value) : '',
  778. },
  779. })}
  780. />
  781. <MaxLengthInput
  782. unit='characters'
  783. value={parentChildConfig.child.maxLength}
  784. onChange={value => setParentChildConfig({
  785. ...parentChildConfig,
  786. child: {
  787. ...parentChildConfig.child,
  788. maxLength: value,
  789. },
  790. })}
  791. />
  792. </div>
  793. </div>
  794. <div>
  795. <div className='flex items-center gap-x-2'>
  796. <div className='inline-flex shrink-0'>
  797. <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
  798. </div>
  799. <Divider className='grow' bgStyle='gradient' />
  800. </div>
  801. <div className='mt-1'>
  802. {rules.map(rule => (
  803. <div key={rule.id} className={s.ruleItem} onClick={() => {
  804. ruleChangeHandle(rule.id)
  805. }}>
  806. <Checkbox
  807. checked={rule.enabled}
  808. />
  809. <label className="system-sm-regular ml-2 cursor-pointer text-text-secondary">{getRuleName(rule.id)}</label>
  810. </div>
  811. ))}
  812. </div>
  813. </div>
  814. </div>
  815. </OptionCard>}
  816. <Divider className='my-5' />
  817. <div className={'system-md-semibold mb-1 text-text-secondary'}>{t('datasetCreation.stepTwo.indexMode')}</div>
  818. <div className='flex items-center gap-2'>
  819. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
  820. <OptionCard className='flex-1 self-stretch'
  821. title={<div className='flex items-center'>
  822. {t('datasetCreation.stepTwo.qualified')}
  823. <Badge className={cn('ml-1 h-[18px]', (!hasSetIndexType && indexType === IndexingType.QUALIFIED) ? 'border-text-accent-secondary text-text-accent-secondary' : '')} uppercase>
  824. {t('datasetCreation.stepTwo.recommend')}
  825. </Badge>
  826. <span className='ml-auto'>
  827. {!hasSetIndexType && <span className={cn(s.radio)} />}
  828. </span>
  829. </div>}
  830. description={t('datasetCreation.stepTwo.qualifiedTip')}
  831. icon={<Image src={indexMethodIcon.high_quality} alt='' />}
  832. isActive={!hasSetIndexType && indexType === IndexingType.QUALIFIED}
  833. disabled={hasSetIndexType}
  834. onSwitched={() => {
  835. setIndexType(IndexingType.QUALIFIED)
  836. }}
  837. />
  838. )}
  839. {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
  840. <>
  841. <CustomDialog show={isQAConfirmDialogOpen} onClose={() => setIsQAConfirmDialogOpen(false)} className='w-[432px]'>
  842. <header className='mb-4 pt-6'>
  843. <h2 className='text-lg font-semibold text-text-primary'>
  844. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipTitle')}
  845. </h2>
  846. <p className='mt-2 text-sm font-normal text-text-secondary'>
  847. {t('datasetCreation.stepTwo.qaSwitchHighQualityTipContent')}
  848. </p>
  849. </header>
  850. <div className='flex gap-2 pb-6'>
  851. <Button className='ml-auto' onClick={() => {
  852. setIsQAConfirmDialogOpen(false)
  853. }}>
  854. {t('datasetCreation.stepTwo.cancel')}
  855. </Button>
  856. <Button variant={'primary'} onClick={() => {
  857. setIsQAConfirmDialogOpen(false)
  858. setIndexType(IndexingType.QUALIFIED)
  859. setDocForm(ChunkingMode.qa)
  860. }}>
  861. {t('datasetCreation.stepTwo.switch')}
  862. </Button>
  863. </div>
  864. </CustomDialog>
  865. <PortalToFollowElem
  866. open={
  867. isHoveringEconomy && docForm !== ChunkingMode.text
  868. }
  869. placement={'top'}
  870. >
  871. <PortalToFollowElemTrigger asChild>
  872. <OptionCard className='flex-1 self-stretch'
  873. title={t('datasetCreation.stepTwo.economical')}
  874. description={t('datasetCreation.stepTwo.economicalTip')}
  875. icon={<Image src={indexMethodIcon.economical} alt='' />}
  876. isActive={!hasSetIndexType && indexType === IndexingType.ECONOMICAL}
  877. disabled={hasSetIndexType || docForm !== ChunkingMode.text}
  878. ref={economyDomRef}
  879. onSwitched={() => {
  880. setIndexType(IndexingType.ECONOMICAL)
  881. }}
  882. />
  883. </PortalToFollowElemTrigger>
  884. <PortalToFollowElemContent>
  885. <div className='rounded-lg border-components-panel-border bg-components-tooltip-bg p-3 text-xs font-medium text-text-secondary shadow-lg'>
  886. {
  887. docForm === ChunkingMode.qa
  888. ? t('datasetCreation.stepTwo.notAvailableForQA')
  889. : t('datasetCreation.stepTwo.notAvailableForParentChild')
  890. }
  891. </div>
  892. </PortalToFollowElemContent>
  893. </PortalToFollowElem>
  894. </>)}
  895. </div>
  896. {!hasSetIndexType && indexType === IndexingType.QUALIFIED && (
  897. <div className='mt-2 flex h-10 items-center gap-x-0.5 overflow-hidden rounded-xl border-[0.5px] border-components-panel-border bg-components-panel-bg-blur p-2 shadow-xs backdrop-blur-[5px]'>
  898. <div className='absolute bottom-0 left-0 right-0 top-0 bg-dataset-warning-message-bg opacity-40'></div>
  899. <div className='p-1'>
  900. <AlertTriangle className='size-4 text-text-warning-secondary' />
  901. </div>
  902. <span className='system-xs-medium text-text-primary'>{t('datasetCreation.stepTwo.highQualityTip')}</span>
  903. </div>
  904. )}
  905. {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
  906. <div className='system-xs-medium mt-2 text-text-tertiary'>
  907. {t('datasetCreation.stepTwo.indexSettingTip')}
  908. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  909. </div>
  910. )}
  911. {/* Embedding model */}
  912. {indexType === IndexingType.QUALIFIED && (
  913. <div className='mt-5'>
  914. <div className={cn('system-md-semibold mb-1 text-text-secondary', datasetId && 'flex items-center justify-between')}>{t('datasetSettings.form.embeddingModel')}</div>
  915. <ModelSelector
  916. readonly={isModelAndRetrievalConfigDisabled}
  917. triggerClassName={isModelAndRetrievalConfigDisabled ? 'opacity-50' : ''}
  918. defaultModel={embeddingModel}
  919. modelList={embeddingModelList}
  920. onSelect={(model: DefaultModel) => {
  921. setEmbeddingModel(model)
  922. }}
  923. />
  924. {isModelAndRetrievalConfigDisabled && (
  925. <div className='system-xs-medium mt-2 text-text-tertiary'>
  926. {t('datasetCreation.stepTwo.indexSettingTip')}
  927. <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
  928. </div>
  929. )}
  930. </div>
  931. )}
  932. <Divider className='my-5' />
  933. {/* Retrieval Method Config */}
  934. <div>
  935. {!isModelAndRetrievalConfigDisabled
  936. ? (
  937. <div className={'mb-1'}>
  938. <div className='system-md-semibold mb-0.5 text-text-secondary'>{t('datasetSettings.form.retrievalSetting.title')}</div>
  939. <div className='body-xs-regular text-text-tertiary'>
  940. <a target='_blank' rel='noopener noreferrer'
  941. href={docLink('/guides/knowledge-base/create-knowledge-and-upload-documents')}
  942. className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
  943. {t('datasetSettings.form.retrievalSetting.longDescription')}
  944. </div>
  945. </div>
  946. )
  947. : (
  948. <div className={cn('system-md-semibold mb-0.5 text-text-secondary', 'flex items-center justify-between')}>
  949. <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
  950. </div>
  951. )}
  952. <div className=''>
  953. {
  954. getIndexing_technique() === IndexingType.QUALIFIED
  955. ? (
  956. <RetrievalMethodConfig
  957. disabled={isModelAndRetrievalConfigDisabled}
  958. value={retrievalConfig}
  959. onChange={setRetrievalConfig}
  960. />
  961. )
  962. : (
  963. <EconomicalRetrievalMethodConfig
  964. disabled={isModelAndRetrievalConfigDisabled}
  965. value={retrievalConfig}
  966. onChange={setRetrievalConfig}
  967. />
  968. )
  969. }
  970. </div>
  971. </div>
  972. {!isSetting
  973. ? (
  974. <div className='mt-8 flex items-center py-2'>
  975. <Button onClick={() => onStepChange && onStepChange(-1)}>
  976. <RiArrowLeftLine className='mr-1 h-4 w-4' />
  977. {t('datasetCreation.stepTwo.previousStep')}
  978. </Button>
  979. <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
  980. </div>
  981. )
  982. : (
  983. <div className='mt-8 flex items-center py-2'>
  984. <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
  985. <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
  986. </div>
  987. )}
  988. </div>
  989. <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={noop} footer={null}>
  990. <PreviewContainer
  991. header={<PreviewHeader
  992. title={t('datasetCreation.stepTwo.preview')}
  993. >
  994. <div className='flex items-center gap-1'>
  995. {dataSourceType === DataSourceType.FILE
  996. && <PreviewDocumentPicker
  997. files={files as Array<Required<CustomFile>>}
  998. onChange={(selected) => {
  999. currentEstimateMutation.reset()
  1000. setPreviewFile(selected)
  1001. currentEstimateMutation.mutate()
  1002. }}
  1003. // In settings mode (isSetting) there is only one file, so files[0] is always shown
  1004. value={isSetting ? (files[0]! as Required<CustomFile>) : previewFile}
  1005. />
  1006. }
  1007. {dataSourceType === DataSourceType.NOTION
  1008. && <PreviewDocumentPicker
  1009. files={
  1010. notionPages.map(page => ({
  1011. id: page.page_id,
  1012. name: page.page_name,
  1013. extension: 'md',
  1014. }))
  1015. }
  1016. onChange={(selected) => {
  1017. currentEstimateMutation.reset()
  1018. const selectedPage = notionPages.find(page => page.page_id === selected.id)
  1019. setPreviewNotionPage(selectedPage!)
  1020. currentEstimateMutation.mutate()
  1021. }}
  1022. value={{
  1023. id: previewNotionPage?.page_id || '',
  1024. name: previewNotionPage?.page_name || '',
  1025. extension: 'md',
  1026. }}
  1027. />
  1028. }
  1029. {dataSourceType === DataSourceType.WEB
  1030. && <PreviewDocumentPicker
  1031. files={
  1032. websitePages.map(page => ({
  1033. id: page.source_url,
  1034. name: page.title,
  1035. extension: 'md',
  1036. }))
  1037. }
  1038. onChange={(selected) => {
  1039. currentEstimateMutation.reset()
  1040. const selectedPage = websitePages.find(page => page.source_url === selected.id)
  1041. setPreviewWebsitePage(selectedPage!)
  1042. currentEstimateMutation.mutate()
  1043. }}
  1044. value={
  1045. {
  1046. id: previewWebsitePage?.source_url || '',
  1047. name: previewWebsitePage?.title || '',
  1048. extension: 'md',
  1049. }
  1050. }
  1051. />
  1052. }
  1053. {
  1054. currentDocForm !== ChunkingMode.qa
  1055. && <Badge text={t('datasetCreation.stepTwo.previewChunkCount', {
  1056. count: estimate?.total_segments || 0,
  1057. }) as string}
  1058. />
  1059. }
  1060. </div>
  1061. </PreviewHeader>}
  1062. className={cn('relative flex h-full w-1/2 shrink-0 p-4 pr-0', isMobile && 'w-full max-w-[524px]')}
  1063. mainClassName='space-y-6'
  1064. >
  1065. {currentDocForm === ChunkingMode.qa && estimate?.qa_preview && (
  1066. estimate?.qa_preview.map((item, index) => (
  1067. <ChunkContainer
  1068. key={item.question}
  1069. label={`Chunk-${index + 1}`}
  1070. characterCount={item.question.length + item.answer.length}
  1071. >
  1072. <QAPreview qa={item} />
  1073. </ChunkContainer>
  1074. ))
  1075. )}
  1076. {currentDocForm === ChunkingMode.text && estimate?.preview && (
  1077. estimate?.preview.map((item, index) => (
  1078. <ChunkContainer
  1079. key={item.content}
  1080. label={`Chunk-${index + 1}`}
  1081. characterCount={item.content.length}
  1082. >
  1083. {item.content}
  1084. </ChunkContainer>
  1085. ))
  1086. )}
  1087. {currentDocForm === ChunkingMode.parentChild && currentEstimateMutation.data?.preview && (
  1088. estimate?.preview?.map((item, index) => {
  1089. const indexForLabel = index + 1
  1090. const childChunks = parentChildConfig.chunkForContext === 'full-doc'
  1091. ? item.child_chunks.slice(0, FULL_DOC_PREVIEW_LENGTH)
  1092. : item.child_chunks
  1093. return (
  1094. <ChunkContainer
  1095. key={item.content}
  1096. label={`Chunk-${indexForLabel}`}
  1097. characterCount={item.content.length}
  1098. >
  1099. <FormattedText>
  1100. {childChunks.map((child, index) => {
  1101. const indexForLabel = index + 1
  1102. return (
  1103. <PreviewSlice
  1104. key={`C-${indexForLabel}-${child}`}
  1105. label={`C-${indexForLabel}`}
  1106. text={child}
  1107. tooltip={`Child-chunk-${indexForLabel} · ${child.length} Characters`}
  1108. labelInnerClassName='text-[10px] font-semibold align-bottom leading-7'
  1109. dividerClassName='leading-7'
  1110. />
  1111. )
  1112. })}
  1113. </FormattedText>
  1114. </ChunkContainer>
  1115. )
  1116. })
  1117. )}
  1118. {currentEstimateMutation.isIdle && (
  1119. <div className='flex h-full w-full items-center justify-center'>
  1120. <div className='flex flex-col items-center justify-center gap-3'>
  1121. <RiSearchEyeLine className='size-10 text-text-empty-state-icon' />
  1122. <p className='text-sm text-text-tertiary'>
  1123. {t('datasetCreation.stepTwo.previewChunkTip')}
  1124. </p>
  1125. </div>
  1126. </div>
  1127. )}
  1128. {currentEstimateMutation.isPending && (
  1129. <div className='space-y-6'>
  1130. {Array.from({ length: 10 }, (_, i) => (
  1131. <SkeletonContainer key={i}>
  1132. <SkeletonRow>
  1133. <SkeletonRectangle className="w-20" />
  1134. <SkeletonPoint />
  1135. <SkeletonRectangle className="w-24" />
  1136. </SkeletonRow>
  1137. <SkeletonRectangle className="w-full" />
  1138. <SkeletonRectangle className="w-full" />
  1139. <SkeletonRectangle className="w-[422px]" />
  1140. </SkeletonContainer>
  1141. ))}
  1142. </div>
  1143. )}
  1144. </PreviewContainer>
  1145. </FloatRightContainer>
  1146. </div>
  1147. )
  1148. }
  // Expose StepTwo as this module's default export.
  1149. export default StepTwo