Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

datasets.ts 16KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
  6. import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  7. import { ExternalKnowledgeBase, General, ParentChild, Qa } from '@/app/components/base/icons/src/public/knowledge/dataset-card'
  8. import { GeneralChunk, ParentChildChunk, QuestionAndAnswer } from '@/app/components/base/icons/src/vender/knowledge'
  9. import type { DatasourceType } from './pipeline'
  /** Source kinds a dataset/document can come from: an uploaded file, a Notion import, or a website crawl. */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Access levels for a dataset: only me, all team members, or a partial list of members. */
  export enum DatasetPermission {
    onlyMe = 'only_me',
    allTeamMembers = 'all_team_members',
    partialMembers = 'partial_members',
  }
  /** Chunking modes a document can use; used as the type of `doc_form` fields in this file. */
  export enum ChunkingMode {
    text = 'text_model', // General text
    qa = 'qa_model', // General QA
    parentChild = 'hierarchical_model', // Parent-Child
    // graph = 'graph', // todo: Graph RAG
  }
  /** A single metadata field definition (id, name, type) together with its value. */
  export type MetadataInDoc = {
    value: string
    id: string
    type: MetadataFilteringVariableType
    name: string
  }
  /** Icon descriptor; `icon` and `icon_type` are required, background and URL are optional. */
  export type IconInfo = {
    icon: string
    icon_background?: string
    icon_type: AppIconType
    icon_url?: string
  }
  /**
   * Full dataset (knowledge base) record.
   *
   * Combines identity/display fields, indexing and embedding configuration,
   * retrieval configuration, external-knowledge settings and optional
   * pipeline information.
   */
  export type DataSet = {
    id: string
    name: string
    indexing_status: DocumentIndexingStatus
    icon_info: IconInfo
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    total_document_count: number
    total_available_documents?: number
    word_count: number
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    // NOTE(review): both fields below carry the same RetrievalConfig type; which one is authoritative is not visible here.
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    // presumably member/user ids used with DatasetPermission.partialMembers — TODO confirm
    partial_member_list?: string[]
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
    built_in_field_enabled: boolean
    doc_metadata?: MetadataInDoc[]
    keyword_number?: number
    pipeline_id?: string
    is_published?: boolean // Indicates if the pipeline is published
  }
  /** An external knowledge API entry, including its connection settings and the datasets bound to it. */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: { id: string; name: string }[]
    created_by: string
    // Typed as string here, unlike the numeric `created_at` on document types in this file.
    created_at: string
  }
  /**
   * A knowledge base backed by an external provider.
   *
   * `provider` is fixed to the literal `'external'`, and the
   * `data_source_type` / `indexing_technique` fields are always `null`.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null
    app_count: number
    document_count: number
    word_count: number
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
    tags: Tag[]
  }
  /** Result of deleting an external API: either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage information for an external API: an in-use flag and a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /** The global `File` type extended with optional extra fields (id, extension, MIME type, creator, creation time). */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /** Options for a website crawl. */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    // `limit` and `max_depth` accept either a number or a string
    // (presumably raw form-input values — TODO confirm).
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawled page: title, markdown content, description and source URL. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /** A file entry paired with an id and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    // presumably upload progress; the value range is not visible here — TODO confirm
    progress: number
  }
  /** Arguments for fetching a dataset list: the request URL plus query parameters (only `page` is required). */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      ids?: string[]
      tag_ids?: string[]
      limit?: number
      include_all?: boolean
      keyword?: string
    }
  }
  /** Dataset list request in which every filter is required; the page field is named `initialPage`. */
  export type DatasetListRequest = {
    initialPage: number
    tag_ids: string[]
    limit: number
    include_all: boolean
    keyword: string
  }
  /** Paginated list of datasets. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** Paginated list of external knowledge APIs. */
  export type ExternalAPIListResponse = {
    data: ExternalAPIItem[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /** Indexing estimate: token count, price and currency, segment count, and preview chunks. */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: Array<{ content: string; child_chunks: string[] }>
    // Optional question/answer preview (presumably only for QA chunking — TODO confirm).
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` with an additional `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /** Indexing progress for a single document: current status, per-stage timestamps and segment counters. */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    // NOTE(review): the four fields below are typed `any`, so consumers get no type checking on them.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** Wrapper holding a list of `IndexingStatusResponse` entries. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** Processing modes. Note that the member names differ from their wire values (`general` → `'custom'`, `parentChild` → `'hierarchical'`). */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Parent mode: `'full-doc'` or `'paragraph'`; used by `Rules.parent_mode`. */
  export type ParentMode = 'full-doc' | 'paragraph'
  /** A process rule (mode + rules) together with the applicable limits. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /** Processing rules: pre-processing toggles, segmentation settings, parent mode, and a second segmentation for sub-chunks. */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits attached to a process rule response; currently only the maximum segmentation token length. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator string, maximum tokens, and an optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /**
   * All document indexing status values.
   * Declared `as const` so that `DocumentIndexingStatus` below is a union of
   * these literals rather than `string`.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const
  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /**
   * All document display status values.
   * Declared `as const` so that `DocumentDisplayStatus` below is a union of
   * these literals rather than `string`.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const
  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document.
   *
   * One shape carries fields for several source kinds: `upload_file` for
   * uploads, the `notion_*` fields, and `provider` / `job_id` / `url`
   * (presumably for website crawls — TODO confirm).
   */
  export type DataSourceInfo = {
    // NOTE(review): `upload_file`, `job_id` and `url` are required even though
    // they look source-specific — confirm against the API payload.
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /** Base document record; `SimpleDocumentDetail` (and through it `FullDocumentDetail`) extend this shape. */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    // Accepts both this file's `DataSourceType` enum and `DatasourceType` imported from './pipeline'.
    data_source_type: DataSourceType | DatasourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: ChunkingMode
    doc_language: string
  }
  /** `InitialDocumentDetail` extended with enable/archive state, usage counters and optional metadata. */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    // Declared optional here but required on InitialDocumentDetail, so the
    // intersection keeps it required.
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
    doc_metadata?: MetadataItemWithValue[]
  }
  /** Paginated list of documents. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /** Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`. */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: IndexingType
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /** Document creation request: `DocumentReq` plus data source, retrieval config and embedding model selection. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /** Indexing-estimate parameters: `DocumentReq`, every `DataSource` field made optional, and a required `dataset_id`. */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Data source description for a document request.
   *
   * `info_list` repeats the source type and carries one optional sub-list per
   * source kind (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** Notion pages grouped under a workspace id. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Minimal Notion page reference: page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A processing rule: the mode plus the rules applied in that mode. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Response of a document creation call: the batch id, the created
   * documents, and optionally a dataset.
   *
   * NOTE(review): the type name is camelCase, unlike the PascalCase used by the
   * other types in this file; kept as-is because it is exported.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * Misspelled duplicate of `ProcessRule`, kept only so existing imports keep
   * compiling. Aliasing it guarantees the two shapes can never drift apart.
   *
   * @deprecated Use `ProcessRule` instead.
   */
  export type PrecessRule = ProcessRule
  /**
   * Complete document record: `SimpleDocumentDetail` plus processing
   * timestamps, pause/disable/archive bookkeeping, document type metadata and
   * process rules.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    // Already declared with the same type on InitialDocumentDetail.
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    // InitialDocumentDetail declares this as DocumentIndexingStatus; intersecting
    // with `string` leaves the narrower union in effect.
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    // NOTE(review): SimpleDocumentDetail declares `doc_metadata?: MetadataItemWithValue[]`;
    // the intersection requires a value to satisfy both declarations at once.
    doc_metadata?: DocMetadata | null
    segment_count: number
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule
    // NOTE(review): this index signature allows any additional property with
    // any type, which weakens checking on misspelled field names.
    [key: string]: any
  }
  /**
   * Document metadata fields.
   * The string index signature permits additional keys, all of which must
   * hold string values.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Document types in the "customizable" group; `as const` keeps each entry a string literal. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const
  /** Document types in the "fixed" group. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]
  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * All segment status values.
   *
   * Declared `as const`, matching `DocumentIndexingStatusList` and
   * `DisplayStatusList`. Without the const assertion the array is inferred as
   * `string[]` and `SegmentStatus` degrades to plain `string`.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const
  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /** Query parameters for listing segments; only `limit` is required. */
  export type SegmentsQuery = {
    // NOTE(review): typed as string here, while page fields elsewhere in this file are numbers.
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    // Either a boolean filter or the literal 'all'.
    enabled?: boolean | 'all'
  }
  /** Full segment record: content, counters, index node references, status/timestamps, and optional answer and child chunks. */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    // presumably set for QA-mode segments — TODO confirm
    answer?: string
    // presumably set for parent-child segments — TODO confirm
    child_chunks?: ChildChunkDetail[]
    updated_at: number
  }
  /** Paginated list of segments, including a `total_pages` count. */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /** A hit-testing history record: the content, where it originated, and who created it. */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk in a hit-testing result, with its position and score. */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /** One hit-testing result: the matched segment, its score, its t-SNE position, and optional child chunks. */
  export type HitTesting = {
    // NOTE(review): `segment` and `content` are both typed `Segment`; confirm
    // whether both are populated by the API or one is a legacy alias.
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: HitTestingChildChunk[] | null
  }
  /** One hit-testing result from an external knowledge base. */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    // The metadata keys are hard-coded to `x-amz-bedrock-kb-*` names.
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** Segment as returned in hit-testing results, embedding a summary of its parent document. */
  export type Segment = {
    id: string
    // Refers to the `Document` type declared in this file.
    document: Document
    content: string
    sign_content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Document summary embedded in `Segment`.
   *
   * Inside this module (and any module importing it under this name) the
   * declaration shadows the global DOM `Document` type.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of hit-testing records. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2D point (x, y) used for `tsne_position` fields. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing response: the query (with its t-SNE position) and the matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /** Hit-testing response for an external knowledge base: the query content and the matching records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: Array<ExternalKnowledgeBaseHitTesting>
  }
  /** An app entry in a related-apps list, with its mode and icon fields. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    // Nullable here, unlike `IconInfo.icon_type`.
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of related apps with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Payload for updating a segment; only `content` is required. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** Documents in an error state, reported as indexing-status entries plus a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /**
   * Boolean flags summarising a set of selected datasets: indexing quality
   * mix, internal vs external mix, and whether embedding models are
   * inconsistent.
   */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted-score presets: semantic first, keyword first, or customized. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** Reranking modes: a reranking model or a weighted score. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights.
   *
   * Two keys match flags of `SelectedDatasetsMode`
   * (`allHighQualityVectorSearch`, `allHighQualityFullTextSearch`); `other` is
   * the remaining entry. Each pair of weights sums to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }
  /** Child chunk type: `'automatic'` or `'customized'`. */
  export type ChildChunkType = 'automatic' | 'customized'
  /** A child chunk belonging to a segment (`segment_id`), with its content, counters, timestamps and type. */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /** Paginated list of child chunks. */
  export type ChildSegmentsResponse = {
    data: ChildChunkDetail[]
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /** Identifies a single document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * Document actions.
   * Used in api url, so the string values are part of the API contract.
   * Note `unArchive` maps to `'un_archive'`.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
  }
  /** Parameters for a batch document update: the dataset id plus either a single `documentId` or `documentIds`. */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    // Accepts an array of ids or a single string.
    documentIds?: string[] | string
  }
  /** Response of a batch import: the job id and its status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }
  /**
   * Icon component for each chunking mode, plus an `external` entry, taken
   * from the dataset-card icon set.
   *
   * NOTE(review): `React` is not among the imports visible in this file; the
   * annotation presumably relies on the global `React` type namespace.
   */
  export const DOC_FORM_ICON_WITH_BG: Record<ChunkingMode | 'external', React.ComponentType<{ className: string }>> = {
    [ChunkingMode.text]: General,
    [ChunkingMode.qa]: Qa,
    [ChunkingMode.parentChild]: ParentChild,
    // [ChunkingMode.graph]: Graph, // todo: Graph RAG
    external: ExternalKnowledgeBase,
  }
  /**
   * Icon component for each chunking mode, taken from the `vender/knowledge`
   * icon set. The key type lists the three modes explicitly instead of using
   * `ChunkingMode` as a whole.
   */
  export const DOC_FORM_ICON: Record<ChunkingMode.text | ChunkingMode.qa | ChunkingMode.parentChild, React.ComponentType<{ className: string }>> = {
    [ChunkingMode.text]: GeneralChunk,
    [ChunkingMode.qa]: QuestionAndAnswer,
    [ChunkingMode.parentChild]: ParentChildChunk,
  }
  /** Short text key for each chunking mode (presumably used as an i18n key suffix — TODO confirm). */
  export const DOC_FORM_TEXT: Record<ChunkingMode, string> = {
    [ChunkingMode.text]: 'general',
    [ChunkingMode.qa]: 'qa',
    [ChunkingMode.parentChild]: 'parentChild',
    // [ChunkingMode.graph]: 'graph', // todo: Graph RAG
  }
  /** Dataset creation request: name, description, icon and permission are required; the rest is optional. */
  export type CreateDatasetReq = {
    name: string
    description: string
    icon_info: IconInfo
    doc_form?: ChunkingMode
    permission: DatasetPermission
    // Here the list holds objects with a user id and optional role, whereas
    // `DataSet.partial_member_list` is a plain string array.
    partial_member_list?: {
      user_id: string
      role?: 'owner' | 'admin' | 'editor' | 'normal' | 'dataset_operator'
    }[]
    yaml_content?: string
  }
  /** Dataset creation response: the created dataset's core fields, including a `pipeline_id`. */
  export type CreateDatasetResponse = {
    id: string
    name: string
    description: string
    permission: DatasetPermission
    indexing_technique: IndexingType
    created_by: string
    created_at: number
    updated_by: string
    updated_at: number
    pipeline_id: string
  }