You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

datasets.ts 17KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig, TransferMethod } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  7. import { ExternalKnowledgeBase, General, ParentChild, Qa } from '@/app/components/base/icons/src/public/knowledge/dataset-card'
  8. import { GeneralChunk, ParentChildChunk, QuestionAndAnswer } from '@/app/components/base/icons/src/vender/knowledge'
  9. import type { DatasourceType } from './pipeline'
  /** String identifiers for where a dataset's documents come from. */
  export enum DataSourceType {
    /** Uploaded file. */
    FILE = 'upload_file',
    /** Notion import. */
    NOTION = 'notion_import',
    /** Website crawl. */
    WEB = 'website_crawl',
  }
  /** Visibility levels that can be assigned to a dataset. */
  export enum DatasetPermission {
    /** Value `only_me`. */
    onlyMe = 'only_me',
    /** Value `all_team_members`. */
    allTeamMembers = 'all_team_members',
    /** Value `partial_members`. */
    partialMembers = 'partial_members',
  }
  /** Document chunking modes (stored in `doc_form` fields). */
  export enum ChunkingMode {
    /** General text. */
    text = 'text_model',
    /** General QA. */
    qa = 'qa_model',
    /** Parent-Child. */
    parentChild = 'hierarchical_model',
    // graph = 'graph', // todo: Graph RAG
  }
  /** A single metadata entry attached to a document. */
  export type MetadataInDoc = {
    id: string
    name: string
    type: MetadataFilteringVariableType
    value: string
  }
  /** Icon descriptor; the background and URL are optional. */
  export type IconInfo = {
    icon_type: AppIconType
    icon: string
    icon_background?: string
    icon_url?: string
  }
  /** Full description of a dataset (knowledge base) record. */
  export type DataSet = {
    // --- identity & presentation ---
    id: string
    name: string
    indexing_status: DocumentIndexingStatus
    icon_info: IconInfo
    description: string
    permission: DatasetPermission

    // --- source & indexing ---
    data_source_type: DataSourceType
    indexing_technique: IndexingType

    // --- audit ---
    created_by: string
    updated_by: string
    updated_at: number

    // --- usage counters & chunking ---
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    total_document_count: number
    total_available_documents?: number
    word_count: number

    // --- provider & embedding ---
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean

    // --- retrieval ---
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig

    tags: Tag[]
    partial_member_list?: string[]

    // --- external knowledge base ---
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }

    // --- metadata ---
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
    keyword_number?: number

    // --- pipeline ---
    pipeline_id?: string
    is_published?: boolean // Indicates if the pipeline is published
    runtime_mode: 'rag_pipeline' | 'general'
  }
  /** An external knowledge API entry, its connection settings and the datasets bound to it. */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    /** Connection settings: endpoint plus API key. */
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    created_at: string
  }
  /**
   * An external knowledge base entry. `provider` is fixed to `'external'`, and
   * `data_source_type` / `indexing_technique` are always `null`.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null

    app_count: number
    document_count: number
    word_count: number

    created_by: string
    created_at: string
    updated_by: string
    updated_at: string

    tags: Tag[]
  }
  /** Outcome of deleting an external API: `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage information for an external API: an in-use flag and a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /** A `File` extended with optional extra fields (id, extension, MIME type, creator, creation time). */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /** Options for a website crawl. `limit` and `max_depth` accept a number or a string. */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    use_sitemap: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
  }
  /** One crawled page: source URL, title, description and content. */
  export type CrawlResultItem = {
    source_url: string
    title: string
    description: string
    content: string
  }
  /** Result of a crawl: the crawled items plus the time consumed (number or string). */
  export type CrawlResult = {
    data: Array<CrawlResultItem>
    time_consuming: number | string
  }
  /** Steps of a crawl: init, running, finished. */
  export enum CrawlStep {
    /** Value `init`. */
    init = 'init',
    /** Value `running`. */
    running = 'running',
    /** Value `finished`. */
    finished = 'finished',
  }
  /** A file paired with its id and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Arguments for fetching a dataset list: the URL plus query parameters. */
  export type FetchDatasetsParams = {
    url: string
    params: {
      /** Only `page` is required; every other filter is optional. */
      page: number
      limit?: number
      ids?: string[]
      tag_ids?: string[]
      keyword?: string
      include_all?: boolean
    }
  }
  /** Dataset list request: initial page and page size are required, filters are optional. */
  export type DatasetListRequest = {
    initialPage: number
    limit: number
    tag_ids?: string[]
    keyword?: string
    include_all?: boolean
  }
  /** One page of datasets with paging fields. */
  export type DataSetListResponse = {
    data: Array<DataSet>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** One page of external API items with paging fields. */
  export type ExternalAPIListResponse = {
    data: Array<ExternalAPIItem>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /** Indexing estimate: token, price and segment totals plus preview chunks. */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    /** Preview entries, each with its content and child chunk strings. */
    preview: { content: string; child_chunks: string[] }[]
    /** Optional list of question/answer pairs. */
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a `total_nodes` count. */
  export type FileIndexingEstimateResponse = IndexingEstimateResponse & {
    total_nodes: number
  }
  /** Indexing progress of one document: status, stage timestamps and segment counters. */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus

    // Stage timestamps
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    // NOTE(review): the next four fields are typed `any`; their real shapes
    // are not visible here. Tighten once the payload is confirmed.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any

    // Segment progress
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of indexing status entries. */
  export type IndexingStatusBatchResponse = {
    data: Array<IndexingStatusResponse>
  }
  /** Processing rule modes. Note that `general` maps to the string `'custom'`. */
  export enum ProcessMode {
    /** Value `custom`. */
    general = 'custom',
    /** Value `hierarchical`. */
    parentChild = 'hierarchical',
  }
  /** Parent mode of a chunking rule: `'full-doc'` or `'paragraph'`. */
  export type ParentMode =
    | 'full-doc'
    | 'paragraph'
  /** A process rule (mode + rules) together with its limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /** Rule set: pre-processing rules, segmentation, parent mode and sub-chunk segmentation. */
  export type Rules = {
    pre_processing_rules: Array<PreProcessingRule>
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits attached to a process rule. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator, max tokens and an optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /** Document indexing status values, as a readonly tuple. */
  export const DocumentIndexingStatusList = [
    'waiting', 'parsing', 'cleaning', 'splitting',
    'indexing', 'paused', 'error', 'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /** Document display status values, as a readonly tuple. */
  export const DisplayStatusList = [
    'queuing', 'indexing', 'paused', 'error',
    'available', 'enabled', 'disabled', 'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Data source info in its legacy shape: upload-file details alongside
   * optional Notion fields, an optional provider, and `job_id` / `url`.
   */
  export type LegacyDataSourceInfo = {
    upload_file: {
      id: string
      name: string
      extension: string
      mime_type: string
      size: number
      created_at: number
      created_by: string
    }
    notion_workspace_id?: string
    notion_page_id?: string
    notion_page_icon?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
    credential_id?: string
  }
  /** Data source info for a local file. */
  export type LocalFileInfo = {
    name: string
    extension: string
    mime_type: string
    size: number
    related_id: string
    transfer_method: TransferMethod
    url: string
  }
  /** Data source info for a crawled web page. */
  export type WebsiteCrawlInfo = {
    credential_id: string
    source_url: string
    title: string
    description: string
    content: string
  }
  /** Data source info for an online document: credential, workspace and page details. */
  export type OnlineDocumentInfo = {
    credential_id: string
    workspace_id: string
    page: {
      page_id: string
      page_name: string
      page_icon: DataSourceNotionPage['page_icon']
      parent_id: string
      type: string
      last_edited_time: string
    }
  }
  /** Data source info for an online drive entry, which is either a file or a folder. */
  export type OnlineDriveInfo = {
    id: string
    name: string
    type: 'file' | 'folder'
    bucket: string
    credential_id: string
  }
  /**
   * Any of the supported data source info shapes.
   *
   * NOTE(review): `OnlineDriveInfo` is declared in this file but is not part
   * of this union — confirm whether that is intentional.
   */
  export type DataSourceInfo =
    | LegacyDataSourceInfo
    | LocalFileInfo
    | OnlineDocumentInfo
    | WebsiteCrawlInfo
  /** Base document detail; `SimpleDocumentDetail` builds on this shape. */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string

    /** Accepts both the local `DataSourceType` and the pipeline `DatasourceType`. */
    data_source_type: DataSourceType | DatasourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string

    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number

    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number

    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` extended with state, counters and metadata.
   *
   * NOTE(review): `dataset_process_rule_id` and `created_from` are declared
   * again here with looser types than on `InitialDocumentDetail`. Because
   * this is an intersection, the narrower base declarations still apply.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    archived: boolean
    word_count: number
    hit_count: number
    updated_at: number
    error?: string | null
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: Array<MetadataItemWithValue>
    created_from: string
  }
  /** One page of documents with paging fields. */
  export type DocumentListResponse = {
    data: Array<SimpleDocumentDetail>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** Fields shared by document requests: form, language, process rule and optional extras. */
  export type DocumentReq = {
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
    original_document_id?: string
    indexing_technique?: IndexingType
  }
  /** `DocumentReq` plus the data source, retrieval model and embedding model fields. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model_provider: string
    embedding_model: string
  }
  /** `DocumentReq` plus an optional (partial) `DataSource` and the dataset id. */
  export type IndexingEstimateParams =
    & DocumentReq
    & Partial<DataSource>
    & { dataset_id: string }
  /** Data source descriptor: a type plus per-source info lists (Notion, file, website). */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      /** Optional Notion entries. */
      notion_info_list?: Array<NotionInfo>
      /** Optional uploaded file ids. */
      file_info_list?: {
        file_ids: string[]
      }
      /** Optional website crawl job: provider, job id and URLs. */
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** Notion pages grouped with their workspace id and credential id. */
  export type NotionInfo = {
    credential_id: string
    workspace_id: string
    pages: Array<DataSourceNotionPage>
  }
  /** A Notion page reference: page id and type. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A processing rule: a mode plus its rule set. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /** Result of creating documents: the batch id, the documents, and optionally the dataset. */
  export type createDocumentResponse = {
    batch: string
    documents: Array<InitialDocumentDetail>
    dataset?: DataSet
  }
  /**
   * Misspelled twin of `ProcessRule`, kept so existing references keep
   * compiling. It is now an alias, so the two shapes can no longer drift apart.
   *
   * @deprecated Use `ProcessRule` instead.
   */
  export type PrecessRule = ProcessRule
  /**
   * `SimpleDocumentDetail` extended with processing timestamps, pause /
   * disable / archive bookkeeping, document type, and process rules.
   *
   * NOTE(review): `doc_metadata` is also declared on `SimpleDocumentDetail`
   * as `MetadataItemWithValue[]`, so the intersection combines both
   * declarations — confirm which shape is intended. The trailing index
   * signature also allows any additional key typed as `any`.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string

    // Processing timeline
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number

    // Pause / stop
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string

    // Disable / archive
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number

    // Document typing & metadata
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number

    // Process rules
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule

    [key: string]: any
  }
  /** String-valued document metadata: known bibliographic keys plus any extra string key. */
  export type DocMetadata = {
    title: string
    author: string
    language: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    /** Any other key maps to a string value. */
    [key: string]: string
  }
  /** The customizable document type identifiers. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book', 'web_page', 'paper', 'social_media_post',
    'personal_document', 'business_document', 'im_chat_log',
  ] as const

  /** The fixed document type identifiers. */
  export const FIXED_DOC_TYPES = [
    'synced_from_github',
    'synced_from_notion',
    'wikipedia_entry',
  ] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]

  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Segment status values, as a readonly tuple.
   *
   * Declared `as const` (like `DocumentIndexingStatusList` and
   * `DisplayStatusList`) so that `SegmentStatus` is the union of these
   * literals rather than plain `string`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /** Query parameters for listing segments; only `limit` is required. */
  export type SegmentsQuery = {
    limit: number
    /** Note: typed as a string, unlike `limit`. */
    page?: string
    // status?: SegmentStatus
    keyword?: string
    hit_count_gte?: number
    enabled?: boolean | 'all'
  }
  /** Full detail of one segment: content, index node fields, state and timestamps. */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string

    // Content
    content: string
    sign_content: string
    answer?: string
    child_chunks?: Array<ChildChunkDetail>
    keywords: string[]
    word_count: number
    tokens: number

    // Index node
    index_node_id: string
    index_node_hash: string

    // State
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    error: string | null

    // Audit & timeline
    created_by: string
    created_at: number
    updated_at: number
    indexing_at: number
    completed_at: number
    stopped_at: number
  }
  /** One page of segments with paging fields, including the total page count. */
  export type SegmentsResponse = {
    data: Array<SegmentDetailModel>
    page: number
    limit: number
    total: number
    total_pages: number
    has_more: boolean
  }
  /** A hit testing record: its content, source, and creator. */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk in a hit testing result, with its position and score. */
  export type HitTestingChildChunk = {
    id: string
    position: number
    content: string
    score: number
  }
  /**
   * One hit testing result: a segment, its score and t-SNE position, and
   * optional child chunks. `segment` and `content` are both typed `Segment`.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: Array<HitTestingChildChunk> | null
  }
  /** One external knowledge base hit: title, content, score and source metadata. */
  export type ExternalKnowledgeBaseHitTesting = {
    title: string
    content: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** A segment as it appears in hit testing results, including its owning document. */
  export type Segment = {
    id: string
    document: Document
    position: number

    content: string
    sign_content: string
    answer: string
    keywords: string[]

    word_count: number
    tokens: number
    hit_count: number
    index_node_hash: string
  }
  /** Short document summary; `Segment.document` uses this type. */
  export type Document = {
    id: string
    name: string
    data_source_type: string
    doc_type: DocType
  }
  /** One page of hit testing records with paging fields. */
  export type HitTestingRecordsResponse = {
    data: Array<HitTestingRecord>
    page: number
    limit: number
    total: number
    has_more: boolean
  }
  /** A 2D coordinate pair. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit testing result: the query (content and t-SNE position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /** External knowledge base hit testing result: the query content and the matching records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /** An app entry (see `RelatedAppResponse`): identity, mode and icon fields. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    /** May be `null`. */
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of related apps together with a total count. */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /** Fields for updating a segment; only `content` is required. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of indexing status entries together with a total count. */
  export type ErrorDocsResponse = {
    data: Array<IndexingStatusResponse>
    total: number
  }
  /** Boolean flags describing the makeup of a set of selected datasets. */
  export type SelectedDatasetsMode = {
    // Indexing quality
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean

    // Internal vs external
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean

    // Embedding model
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted score presets. */
  export enum WeightedScoreEnum {
    /** Value `semantic_first`. */
    SemanticFirst = 'semantic_first',
    /** Value `keyword_first`. */
    KeywordFirst = 'keyword_first',
    /** Value `customized`. */
    Customized = 'customized',
  }
  /** Reranking modes: a reranking model or a weighted score. */
  export enum RerankingModeEnum {
    /** Value `reranking_model`. */
    RerankingModel = 'reranking_model',
    /** Value `weighted_score`. */
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights, keyed by dataset selection case.
   * Each pair of weights sums to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    // Fully semantic.
    allHighQualityVectorSearch: { semantic: 1.0, keyword: 0 },
    // Fully keyword.
    allHighQualityFullTextSearch: { semantic: 0, keyword: 1.0 },
    // Every other case: 0.7 semantic / 0.3 keyword.
    other: { semantic: 0.7, keyword: 0.3 },
  }
  /** Kind of a child chunk: `'automatic'` or `'customized'`. */
  export type ChildChunkType =
    | 'automatic'
    | 'customized'
  /** Detail of a child chunk, including the id of its segment. */
  export type ChildChunkDetail = {
    id: string
    segment_id: string
    position: number
    type: ChildChunkType
    content: string
    word_count: number
    created_at: number
    updated_at: number
  }
  /** One page of child chunks with paging fields. */
  export type ChildSegmentsResponse = {
    data: Array<ChildChunkDetail>
    page: number
    limit: number
    total: number
    total_pages: number
  }
  /** Identifies one document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /** Document actions. Used in api url. */
  export enum DocumentActionType {
    /** Value `enable`. */
    enable = 'enable',
    /** Value `disable`. */
    disable = 'disable',
    /** Value `archive`. */
    archive = 'archive',
    /** Value `un_archive`. */
    unArchive = 'un_archive',
    /** Value `delete`. */
    delete = 'delete',
  }
  /**
   * Identifies one or many documents within a dataset. `documentIds` accepts
   * either an array of ids or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string | string[]
  }
  /** A batch import job: its id and status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }
  /**
   * Icon component for each chunking mode, plus an `external` entry.
   * Components come from the `dataset-card` icon set.
   */
  export const DOC_FORM_ICON_WITH_BG: Record<
    ChunkingMode | 'external',
    React.ComponentType<{ className: string }>
  > = {
    [ChunkingMode.text]: General,
    [ChunkingMode.qa]: Qa,
    [ChunkingMode.parentChild]: ParentChild,
    // [ChunkingMode.graph]: Graph, // todo: Graph RAG
    external: ExternalKnowledgeBase,
  }
  /**
   * Icon component for the text, QA and parent-child chunking modes.
   * Components come from the `vender/knowledge` icon set.
   */
  export const DOC_FORM_ICON: Record<
    ChunkingMode.text | ChunkingMode.qa | ChunkingMode.parentChild,
    React.ComponentType<{ className: string }>
  > = {
    [ChunkingMode.text]: GeneralChunk,
    [ChunkingMode.qa]: QuestionAndAnswer,
    [ChunkingMode.parentChild]: ParentChildChunk,
  }
  /** Short text key for each chunking mode. */
  export const DOC_FORM_TEXT: Record<ChunkingMode, string> = {
    [ChunkingMode.text]: 'general',
    [ChunkingMode.qa]: 'qa',
    [ChunkingMode.parentChild]: 'parentChild',
    // [ChunkingMode.graph]: 'graph', // todo: Graph RAG
  }
  /** Dataset creation request; the YAML content is optional. */
  export type CreateDatasetReq = {
    yaml_content?: string
  }
  /** Result of creating a dataset. Carries `id`, `dataset_id` and `pipeline_id`. */
  export type CreateDatasetResponse = {
    id: string
    dataset_id: string
    pipeline_id: string

    name: string
    description: string
    permission: DatasetPermission
    indexing_technique: IndexingType

    created_by: string
    created_at: number
    updated_by: string
    updated_at: number
  }
  /** Identifies a batch within a dataset. */
  export type IndexingStatusBatchRequest = {
    datasetId: string
    batchId: string
  }
  725. }