[Backend: console API segments controller]

 import uuid
-import pandas as pd
 from flask import request
 from flask_login import current_user
 from flask_restful import Resource, marshal, reqparse
@@
     ChildChunkDeleteIndexError,
     ChildChunkIndexingError,
     InvalidActionError,
-    NoFileUploadedError,
-    TooManyFilesError,
 )
 from controllers.console.wraps import (
     account_initialization_required,
@@
 from fields.segment_fields import child_chunk_fields, segment_fields
 from libs.login import login_required
 from models.dataset import ChildChunk, DocumentSegment
+from models.model import UploadFile
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
 from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
@@
         document = DocumentService.get_document(dataset_id, document_id)
         if not document:
             raise NotFound("Document not found.")
-        # get file from request
-        file = request.files["file"]
-        # check file
-        if "file" not in request.files:
-            raise NoFileUploadedError()
-        if len(request.files) > 1:
-            raise TooManyFilesError()
+        parser = reqparse.RequestParser()
+        parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+        upload_file_id = args["upload_file_id"]
+        upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
+        if not upload_file:
+            raise NotFound("UploadFile not found.")
         # check file type
-        if not file.filename or not file.filename.lower().endswith(".csv"):
+        if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
             raise ValueError("Invalid file type. Only CSV files are allowed")
         try:
-            # Skip the first row
-            df = pd.read_csv(file)
-            result = []
-            for index, row in df.iterrows():
-                if document.doc_form == "qa_model":
-                    data = {"content": row.iloc[0], "answer": row.iloc[1]}
-                else:
-                    data = {"content": row.iloc[0]}
-                result.append(data)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = f"segment_batch_import_{str(job_id)}"
             # send batch add segments task
             redis_client.setnx(indexing_cache_key, "waiting")
             batch_create_segment_to_index_task.delay(
-                str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
+                str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
             )
         except Exception as e:
             return {"error": str(e)}, 500
[Backend: Celery task batch_create_segment_to_index_task]

 import datetime
 import logging
+import tempfile
 import time
 import uuid
+from pathlib import Path

 import click
+import pandas as pd
 from celery import shared_task  # type: ignore
 from sqlalchemy import func
 from sqlalchemy.orm import Session

 from core.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
+from extensions.ext_storage import storage
 from libs import helper
 from models.dataset import Dataset, Document, DocumentSegment
+from models.model import UploadFile
 from services.vector_service import VectorService
@@
 @shared_task(queue="dataset")
 def batch_create_segment_to_index_task(
     job_id: str,
-    content: list,
+    upload_file_id: str,
     dataset_id: str,
     document_id: str,
     tenant_id: str,
@@
     """
     Async batch create segment to index
     :param job_id:
-    :param content:
+    :param upload_file_id:
     :param dataset_id:
     :param document_id:
     :param tenant_id:
     :param user_id:

-    Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id, tenant_id, user_id)
+    Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)
     """
     logging.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))
     start_at = time.perf_counter()
@@
                 or dataset_document.indexing_status != "completed"
             ):
                 raise ValueError("Document is not available.")
+            upload_file = session.get(UploadFile, upload_file_id)
+            if not upload_file:
+                raise ValueError("UploadFile not found.")
+
+            with tempfile.TemporaryDirectory() as temp_dir:
+                suffix = Path(upload_file.key).suffix
+                # FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
+                file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
+                storage.download(upload_file.key, file_path)
+                # Skip the first row
+                df = pd.read_csv(file_path)
+                content = []
+                for index, row in df.iterrows():
+                    if dataset_document.doc_form == "qa_model":
+                        data = {"content": row.iloc[0], "answer": row.iloc[1]}
+                    else:
+                        data = {"content": row.iloc[0]}
+                    content.append(data)
+                if len(content) == 0:
+                    raise ValueError("The CSV file is empty.")
         document_segments = []
         embedding_model = None
         if dataset.indexing_technique == "high_quality":
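A note on the parsing step: the task's own FIXME flags tempfile._get_candidate_names as a private CPython API. A minimal alternative sketch of the same download-and-parse step using only public stdlib calls; it assumes the storage.download and UploadFile.key semantics shown above, and the helper name download_and_parse is hypothetical:

import tempfile
import uuid
from pathlib import Path

import pandas as pd


def download_and_parse(storage, upload_file, doc_form: str) -> list[dict]:
    """Fetch the uploaded CSV into a temp dir and map rows to segment dicts."""
    with tempfile.TemporaryDirectory() as temp_dir:
        suffix = Path(upload_file.key).suffix
        # uuid4 yields a collision-safe name without touching tempfile internals
        file_path = Path(temp_dir) / f"{uuid.uuid4().hex}{suffix}"
        storage.download(upload_file.key, str(file_path))

        # read_csv consumes the first row as the header, which is what the
        # "Skip the first row" comment in the task refers to
        df = pd.read_csv(file_path)
        content = []
        for _, row in df.iterrows():
            if doc_form == "qa_model":
                # qa_model sheets carry two columns: content (question), answer
                content.append({"content": row.iloc[0], "answer": row.iloc[1]})
            else:
                content.append({"content": row.iloc[0]})
        if not content:
            raise ValueError("The CSV file is empty.")
        return content

Note that pd.read_csv raises pandas.errors.EmptyDataError on a zero-byte file before the length check is reached, so callers that want the friendlier "CSV file is empty" message may need to catch that as well.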
[Frontend: csv-uploader component]

 'use client'
 import type { FC } from 'react'
-import React, { useEffect, useRef, useState } from 'react'
+import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
 import {
   RiDeleteBinLine,
 } from '@remixicon/react'
 import { Csv as CSVIcon } from '@/app/components/base/icons/src/public/files'
 import { ToastContext } from '@/app/components/base/toast'
 import Button from '@/app/components/base/button'
+import type { FileItem } from '@/models/datasets'
+import { upload } from '@/service/base'
+import useSWR from 'swr'
+import { fetchFileUploadConfig } from '@/service/common'
+import SimplePieChart from '@/app/components/base/simple-pie-chart'
+import { Theme } from '@/types/app'
+import useTheme from '@/hooks/use-theme'

 export type Props = {
-  file: File | undefined
-  updateFile: (file?: File) => void
+  file: FileItem | undefined
+  updateFile: (file?: FileItem) => void
 }

 const CSVUploader: FC<Props> = ({
@@
   const dropRef = useRef<HTMLDivElement>(null)
   const dragRef = useRef<HTMLDivElement>(null)
   const fileUploader = useRef<HTMLInputElement>(null)

+  const { data: fileUploadConfigResponse } = useSWR({ url: '/files/upload' }, fetchFileUploadConfig)
+  const fileUploadConfig = useMemo(() => fileUploadConfigResponse ?? {
+    file_size_limit: 15,
+  }, [fileUploadConfigResponse])
+
+  const fileUpload = useCallback(async (fileItem: FileItem): Promise<FileItem> => {
+    fileItem.progress = 0
+    const formData = new FormData()
+    formData.append('file', fileItem.file)
+    const onProgress = (e: ProgressEvent) => {
+      if (e.lengthComputable) {
+        const progress = Math.floor(e.loaded / e.total * 100)
+        updateFile({
+          ...fileItem,
+          progress,
+        })
+      }
+    }
+
+    return upload({
+      xhr: new XMLHttpRequest(),
+      data: formData,
+      onprogress: onProgress,
+    }, false, undefined, '?source=datasets')
+      .then((res: File) => {
+        const completeFile = {
+          fileID: fileItem.fileID,
+          file: res,
+          progress: 100,
+        }
+        updateFile(completeFile)
+        return Promise.resolve({ ...completeFile })
+      })
+      .catch((e) => {
+        notify({ type: 'error', message: e?.response?.code === 'forbidden' ? e?.response?.message : t('datasetCreation.stepOne.uploader.failed') })
+        const errorFile = {
+          ...fileItem,
+          progress: -2,
+        }
+        updateFile(errorFile)
+        return Promise.resolve({ ...errorFile })
+      })
+      .finally()
+  }, [notify, t, updateFile])
+
+  const uploadFile = useCallback(async (fileItem: FileItem) => {
+    await fileUpload(fileItem)
+  }, [fileUpload])
+
+  const initialUpload = useCallback((file?: File) => {
+    if (!file)
+      return false
+
+    const newFile: FileItem = {
+      fileID: `file0-${Date.now()}`,
+      file,
+      progress: -1,
+    }
+    updateFile(newFile)
+    uploadFile(newFile)
+  }, [updateFile, uploadFile])
+
   const handleDragEnter = (e: DragEvent) => {
     e.preventDefault()
@@
       notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.count') })
       return
     }
-    updateFile(files[0])
+    initialUpload(files[0])
   }
@@
   const selectHandle = () => {
     if (fileUploader.current)
       fileUploader.current.value = ''
     updateFile()
   }

+  const getFileType = (currentFile: File) => {
+    if (!currentFile)
+      return ''
+
+    const arr = currentFile.name.split('.')
+    return arr[arr.length - 1]
+  }
+
+  const isValid = useCallback((file?: File) => {
+    if (!file)
+      return false
+
+    const { size } = file
+    const ext = `.${getFileType(file)}`
+    const isValidType = ext.toLowerCase() === '.csv'
+    if (!isValidType)
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })
+
+    const isValidSize = size <= fileUploadConfig.file_size_limit * 1024 * 1024
+    if (!isValidSize)
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.size', { size: fileUploadConfig.file_size_limit }) })
+
+    return isValidType && isValidSize
+  }, [fileUploadConfig, notify, t])
+
   const fileChangeHandle = (e: React.ChangeEvent<HTMLInputElement>) => {
     const currentFile = e.target.files?.[0]
-    updateFile(currentFile)
+    if (!isValid(currentFile))
+      return
+    initialUpload(currentFile)
   }
+
+  const { theme } = useTheme()
+  const chartColor = useMemo(() => theme === Theme.dark ? '#5289ff' : '#296dff', [theme])

   useEffect(() => {
     dropRef.current?.addEventListener('dragenter', handleDragEnter)
     dropRef.current?.addEventListener('dragover', handleDragOver)
@@
           <div className={cn('group flex h-20 items-center rounded-xl border border-components-panel-border bg-components-panel-bg-blur px-6 text-sm font-normal', 'hover:border-divider-subtle hover:bg-components-panel-on-panel-item-bg-hover')}>
             <CSVIcon className="shrink-0" />
             <div className='ml-2 flex w-0 grow'>
-              <span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.name.replace(/.csv$/, '')}</span>
+              <span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.file.name.replace(/.csv$/, '')}</span>
               <span className='shrink-0 text-text-secondary'>.csv</span>
             </div>
             <div className='hidden items-center group-hover:flex'>
+              {(file.progress < 100 && file.progress >= 0) && (
+                <>
+                  <SimplePieChart percentage={file.progress} stroke={chartColor} fill={chartColor} animationDuration={0}/>
+                  <div className='mx-2 h-4 w-px bg-text-secondary'/>
+                </>
+              )}
               <Button onClick={selectHandle}>{t('datasetCreation.stepOne.uploader.change')}</Button>
               <div className='mx-2 h-4 w-px bg-text-secondary' />
               <div className='cursor-pointer p-2' onClick={removeFile}>
[Frontend: batch import modal]

 import CSVDownloader from './csv-downloader'
 import Button from '@/app/components/base/button'
 import Modal from '@/app/components/base/modal'
-import type { ChunkingMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem } from '@/models/datasets'
 import { noop } from 'lodash-es'

 export type IBatchModalProps = {
   isShow: boolean
   docForm: ChunkingMode
   onCancel: () => void
-  onConfirm: (file: File) => void
+  onConfirm: (file: FileItem) => void
 }

 const BatchModal: FC<IBatchModalProps> = ({
   onConfirm,
 }) => {
   const { t } = useTranslation()
-  const [currentCSV, setCurrentCSV] = useState<File>()
-  const handleFile = (file?: File) => setCurrentCSV(file)
+  const [currentCSV, setCurrentCSV] = useState<FileItem>()
+  const handleFile = (file?: FileItem) => setCurrentCSV(file)

   const handleSend = () => {
     if (!currentCSV)
@@
         <Button className='mr-2' onClick={onCancel}>
           {t('datasetDocuments.list.batchModal.cancel')}
         </Button>
-        <Button variant="primary" onClick={handleSend} disabled={!currentCSV}>
+        <Button variant="primary" onClick={handleSend} disabled={!currentCSV || !currentCSV.file || !currentCSV.file.id}>
           {t('datasetDocuments.list.batchModal.run')}
         </Button>
       </div>
[Frontend: segment list, batch import runner]

 import Divider from '@/app/components/base/divider'
 import Loading from '@/app/components/base/loading'
 import { ToastContext } from '@/app/components/base/toast'
-import type { ChunkingMode, ParentMode, ProcessMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem, ParentMode, ProcessMode } from '@/models/datasets'
 import { useDatasetDetailContext } from '@/context/dataset-detail'
 import FloatRightContainer from '@/app/components/base/float-right-container'
 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
@@
   }

   const { mutateAsync: segmentBatchImport } = useSegmentBatchImport()
-  const runBatch = async (csv: File) => {
-    const formData = new FormData()
-    formData.append('file', csv)
+  const runBatch = async (csv: FileItem) => {
     await segmentBatchImport({
       url: `/datasets/${datasetId}/documents/${documentId}/segments/batch_import`,
-      body: formData,
+      body: { upload_file_id: csv.file.id! },
     }, {
       onSuccess: (res) => {
         setImportStatus(res.job_status)
[Frontend: segment service hooks]

 export const useSegmentBatchImport = () => {
   return useMutation({
     mutationKey: [NAME_SPACE, 'batchImport'],
-    mutationFn: (payload: { url: string; body: FormData }) => {
+    mutationFn: (payload: { url: string; body: { upload_file_id: string } }) => {
       const { url, body } = payload
-      return post<BatchImportResponse>(url, { body }, { bodyStringify: false, deleteContentType: true })
+      return post<BatchImportResponse>(url, { body })
     },
   })
 }
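With a FormData body the hook had to disable JSON serialization (bodyStringify: false) and drop the preset content type (deleteContentType: true) so the browser could supply the multipart boundary; a plain object body restores the defaults. For comparison, the wire-level difference, sketched with requests (placeholder URL and id as in the earlier example):

import requests

url = "https://example.com/console/api/datasets/<dataset_id>/documents/<document_id>/segments/batch_import"

# Before: the CSV bytes travelled in the request itself
# (multipart/form-data with a browser-chosen boundary).
# requests.post(url, files={"file": open("segments.csv", "rb")})

# After: only a reference travels; Content-Type is application/json.
requests.post(url, json={"upload_file_id": "<id returned by /files/upload>"})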