
fix(dataset): when CELERY_BROKER uses amqp (RabbitMQ), adding document segments in batches from a large uploaded file leaves the status permanently stuck at "In batch processing" #22709 (#23038)

tags/1.7.1
zhaobingshuang, 3 months ago
parent
commit
5c5f61b2aa
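
Context for the fix: the controller used to parse the whole CSV inside the request handler and pass the resulting row list as a Celery task argument. With an amqp (RabbitMQ) broker the enqueued message grows with the file, and for large files the task is never processed, so the Redis status key is never updated and the UI shows "In batch processing" indefinitely. This commit instead sends only the upload_file_id through the broker and lets the worker fetch and parse the file. A minimal sketch of the pattern, assuming a generic Celery app (the helper below is hypothetical, not Dify's actual API):

from celery import shared_task


def download_and_parse_csv(upload_file_id: str) -> list[dict]:
    # Hypothetical stand-in: look up the UploadFile record, download the
    # bytes from object storage, and parse the rows (the real version is
    # in the task diff below).
    raise NotImplementedError


@shared_task(queue="dataset")
def batch_create(job_id: str, upload_file_id: str) -> None:
    # Only this short string ID crosses the amqp broker, so the message
    # size no longer depends on how big the uploaded CSV is.
    rows = download_and_parse_csv(upload_file_id)
    print(f"job {job_id}: importing {len(rows)} segments")

The broker message now stays a few hundred bytes regardless of CSV size; the heavy I/O happens inside the worker, which already runs on the dataset queue.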

api/controllers/console/datasets/datasets_segments.py  +13 -24

@@ -1,6 +1,5 @@
 import uuid
 
-import pandas as pd
 from flask import request
 from flask_login import current_user
 from flask_restful import Resource, marshal, reqparse
@@ -14,8 +13,6 @@ from controllers.console.datasets.error import (
     ChildChunkDeleteIndexError,
     ChildChunkIndexingError,
     InvalidActionError,
-    NoFileUploadedError,
-    TooManyFilesError,
 )
 from controllers.console.wraps import (
     account_initialization_required,
@@ -32,6 +29,7 @@ from extensions.ext_redis import redis_client
 from fields.segment_fields import child_chunk_fields, segment_fields
 from libs.login import login_required
 from models.dataset import ChildChunk, DocumentSegment
+from models.model import UploadFile
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
 from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
@@ -365,37 +363,28 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
         document = DocumentService.get_document(dataset_id, document_id)
         if not document:
             raise NotFound("Document not found.")
-        # get file from request
-        file = request.files["file"]
-        # check file
-        if "file" not in request.files:
-            raise NoFileUploadedError()
-
-        if len(request.files) > 1:
-            raise TooManyFilesError()
+
+        parser = reqparse.RequestParser()
+        parser.add_argument("upload_file_id", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+        upload_file_id = args["upload_file_id"]
+
+        upload_file = db.session.query(UploadFile).where(UploadFile.id == upload_file_id).first()
+        if not upload_file:
+            raise NotFound("UploadFile not found.")
+
         # check file type
-        if not file.filename or not file.filename.lower().endswith(".csv"):
+        if not upload_file.name or not upload_file.name.lower().endswith(".csv"):
             raise ValueError("Invalid file type. Only CSV files are allowed")
 
         try:
-            # Skip the first row
-            df = pd.read_csv(file)
-            result = []
-            for index, row in df.iterrows():
-                if document.doc_form == "qa_model":
-                    data = {"content": row.iloc[0], "answer": row.iloc[1]}
-                else:
-                    data = {"content": row.iloc[0]}
-                result.append(data)
-            if len(result) == 0:
-                raise ValueError("The CSV file is empty.")
             # async job
             job_id = str(uuid.uuid4())
             indexing_cache_key = f"segment_batch_import_{str(job_id)}"
             # send batch add segments task
             redis_client.setnx(indexing_cache_key, "waiting")
             batch_create_segment_to_index_task.delay(
-                str(job_id), result, dataset_id, document_id, current_user.current_tenant_id, current_user.id
+                str(job_id), upload_file_id, dataset_id, document_id, current_user.current_tenant_id, current_user.id
             )
         except Exception as e:
             return {"error": str(e)}, 500

api/tasks/batch_create_segment_to_index_task.py  +31 -3

@@ -1,9 +1,12 @@
 import datetime
 import logging
+import tempfile
 import time
 import uuid
+from pathlib import Path
 
 import click
+import pandas as pd
 from celery import shared_task  # type: ignore
 from sqlalchemy import func
 from sqlalchemy.orm import Session
@@ -12,15 +15,17 @@ from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
+from extensions.ext_storage import storage
 from libs import helper
 from models.dataset import Dataset, Document, DocumentSegment
+from models.model import UploadFile
 from services.vector_service import VectorService
 
 
 @shared_task(queue="dataset")
 def batch_create_segment_to_index_task(
     job_id: str,
-    content: list,
+    upload_file_id: str,
     dataset_id: str,
     document_id: str,
     tenant_id: str,
@@ -29,13 +34,13 @@ def batch_create_segment_to_index_task(
     """
     Async batch create segment to index
     :param job_id:
-    :param content:
+    :param upload_file_id:
     :param dataset_id:
     :param document_id:
     :param tenant_id:
     :param user_id:
 
-    Usage: batch_create_segment_to_index_task.delay(job_id, content, dataset_id, document_id, tenant_id, user_id)
+    Usage: batch_create_segment_to_index_task.delay(job_id, upload_file_id, dataset_id, document_id, tenant_id, user_id)
     """
     logging.info(click.style(f"Start batch create segment jobId: {job_id}", fg="green"))
     start_at = time.perf_counter()
@@ -58,6 +63,29 @@ def batch_create_segment_to_index_task(
                 or dataset_document.indexing_status != "completed"
             ):
                 raise ValueError("Document is not available.")
+
+            upload_file = session.get(UploadFile, upload_file_id)
+            if not upload_file:
+                raise ValueError("UploadFile not found.")
+
+            with tempfile.TemporaryDirectory() as temp_dir:
+                suffix = Path(upload_file.key).suffix
+                # FIXME mypy: Cannot determine type of 'tempfile._get_candidate_names' better not use it here
+                file_path = f"{temp_dir}/{next(tempfile._get_candidate_names())}{suffix}"  # type: ignore
+                storage.download(upload_file.key, file_path)
+
+                # Skip the first row
+                df = pd.read_csv(file_path)
+                content = []
+                for index, row in df.iterrows():
+                    if dataset_document.doc_form == "qa_model":
+                        data = {"content": row.iloc[0], "answer": row.iloc[1]}
+                    else:
+                        data = {"content": row.iloc[0]}
+                    content.append(data)
+                if len(content) == 0:
+                    raise ValueError("The CSV file is empty.")
+
             document_segments = []
             embedding_model = None
             if dataset.indexing_technique == "high_quality":
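
The FIXME above exists because tempfile._get_candidate_names() is a private CPython detail that mypy cannot type. An equivalent scratch filename can be built from public APIs only; a small sketch under that assumption (uuid and Path are already imported at the top of this task):

import tempfile
import uuid
from pathlib import Path

with tempfile.TemporaryDirectory() as temp_dir:
    suffix = Path("upload_files/abc123.csv").suffix  # stands in for upload_file.key
    # uuid4().hex is unique enough for a scratch file and avoids tempfile internals
    file_path = f"{temp_dir}/{uuid.uuid4().hex}{suffix}"
    print(file_path)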

web/app/components/datasets/documents/detail/batch-modal/csv-uploader.tsx  +113 -6

@@ -1,6 +1,6 @@
'use client'
import type { FC } from 'react'
import React, { useEffect, useRef, useState } from 'react'
import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'
import {
RiDeleteBinLine,
} from '@remixicon/react'
@@ -10,10 +10,17 @@ import cn from '@/utils/classnames'
import { Csv as CSVIcon } from '@/app/components/base/icons/src/public/files'
import { ToastContext } from '@/app/components/base/toast'
import Button from '@/app/components/base/button'
import type { FileItem } from '@/models/datasets'
import { upload } from '@/service/base'
import useSWR from 'swr'
import { fetchFileUploadConfig } from '@/service/common'
import SimplePieChart from '@/app/components/base/simple-pie-chart'
import { Theme } from '@/types/app'
import useTheme from '@/hooks/use-theme'

export type Props = {
file: File | undefined
updateFile: (file?: File) => void
file: FileItem | undefined
updateFile: (file?: FileItem) => void
}

const CSVUploader: FC<Props> = ({
@@ -26,6 +33,68 @@ const CSVUploader: FC<Props> = ({
const dropRef = useRef<HTMLDivElement>(null)
const dragRef = useRef<HTMLDivElement>(null)
const fileUploader = useRef<HTMLInputElement>(null)
const { data: fileUploadConfigResponse } = useSWR({ url: '/files/upload' }, fetchFileUploadConfig)
const fileUploadConfig = useMemo(() => fileUploadConfigResponse ?? {
file_size_limit: 15,
}, [fileUploadConfigResponse])

const fileUpload = useCallback(async (fileItem: FileItem): Promise<FileItem> => {
fileItem.progress = 0

const formData = new FormData()
formData.append('file', fileItem.file)
const onProgress = (e: ProgressEvent) => {
if (e.lengthComputable) {
const progress = Math.floor(e.loaded / e.total * 100)
updateFile({
...fileItem,
progress,
})
}
}

return upload({
xhr: new XMLHttpRequest(),
data: formData,
onprogress: onProgress,
}, false, undefined, '?source=datasets')
.then((res: File) => {
const completeFile = {
fileID: fileItem.fileID,
file: res,
progress: 100,
}
updateFile(completeFile)
return Promise.resolve({ ...completeFile })
})
.catch((e) => {
notify({ type: 'error', message: e?.response?.code === 'forbidden' ? e?.response?.message : t('datasetCreation.stepOne.uploader.failed') })
const errorFile = {
...fileItem,
progress: -2,
}
updateFile(errorFile)
return Promise.resolve({ ...errorFile })
})
.finally()
}, [notify, t, updateFile])

const uploadFile = useCallback(async (fileItem: FileItem) => {
await fileUpload(fileItem)
}, [fileUpload])

const initialUpload = useCallback((file?: File) => {
if (!file)
return false

const newFile: FileItem = {
fileID: `file0-${Date.now()}`,
file,
progress: -1,
}
updateFile(newFile)
uploadFile(newFile)
}, [updateFile, uploadFile])

const handleDragEnter = (e: DragEvent) => {
e.preventDefault()
@@ -52,7 +121,7 @@ const CSVUploader: FC<Props> = ({
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.count') })
return
}
updateFile(files[0])
initialUpload(files[0])
}
const selectHandle = () => {
if (fileUploader.current)
@@ -63,11 +132,43 @@ const CSVUploader: FC<Props> = ({
fileUploader.current.value = ''
updateFile()
}

const getFileType = (currentFile: File) => {
if (!currentFile)
return ''

const arr = currentFile.name.split('.')
return arr[arr.length - 1]
}

const isValid = useCallback((file?: File) => {
if (!file)
return false

const { size } = file
const ext = `.${getFileType(file)}`
const isValidType = ext.toLowerCase() === '.csv'
if (!isValidType)
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.typeError') })

const isValidSize = size <= fileUploadConfig.file_size_limit * 1024 * 1024
if (!isValidSize)
notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.size', { size: fileUploadConfig.file_size_limit }) })

return isValidType && isValidSize
}, [fileUploadConfig, notify, t])

const fileChangeHandle = (e: React.ChangeEvent<HTMLInputElement>) => {
const currentFile = e.target.files?.[0]
updateFile(currentFile)
if (!isValid(currentFile))
return

initialUpload(currentFile)
}

const { theme } = useTheme()
const chartColor = useMemo(() => theme === Theme.dark ? '#5289ff' : '#296dff', [theme])

useEffect(() => {
dropRef.current?.addEventListener('dragenter', handleDragEnter)
dropRef.current?.addEventListener('dragover', handleDragOver)
@@ -108,10 +209,16 @@ const CSVUploader: FC<Props> = ({
<div className={cn('group flex h-20 items-center rounded-xl border border-components-panel-border bg-components-panel-bg-blur px-6 text-sm font-normal', 'hover:border-divider-subtle hover:bg-components-panel-on-panel-item-bg-hover')}>
<CSVIcon className="shrink-0" />
<div className='ml-2 flex w-0 grow'>
<span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.name.replace(/.csv$/, '')}</span>
<span className='max-w-[calc(100%_-_30px)] overflow-hidden text-ellipsis whitespace-nowrap text-text-primary'>{file.file.name.replace(/.csv$/, '')}</span>
<span className='shrink-0 text-text-secondary'>.csv</span>
</div>
<div className='hidden items-center group-hover:flex'>
{(file.progress < 100 && file.progress >= 0) && (
<>
<SimplePieChart percentage={file.progress} stroke={chartColor} fill={chartColor} animationDuration={0}/>
<div className='mx-2 h-4 w-px bg-text-secondary'/>
</>
)}
<Button onClick={selectHandle}>{t('datasetCreation.stepOne.uploader.change')}</Button>
<div className='mx-2 h-4 w-px bg-text-secondary' />
<div className='cursor-pointer p-2' onClick={removeFile}>

web/app/components/datasets/documents/detail/batch-modal/index.tsx  +5 -5

@@ -7,14 +7,14 @@ import CSVUploader from './csv-uploader'
 import CSVDownloader from './csv-downloader'
 import Button from '@/app/components/base/button'
 import Modal from '@/app/components/base/modal'
-import type { ChunkingMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem } from '@/models/datasets'
 import { noop } from 'lodash-es'
 
 export type IBatchModalProps = {
   isShow: boolean
   docForm: ChunkingMode
   onCancel: () => void
-  onConfirm: (file: File) => void
+  onConfirm: (file: FileItem) => void
 }
 
 const BatchModal: FC<IBatchModalProps> = ({
@@ -24,8 +24,8 @@ const BatchModal: FC<IBatchModalProps> = ({
   onConfirm,
 }) => {
   const { t } = useTranslation()
-  const [currentCSV, setCurrentCSV] = useState<File>()
-  const handleFile = (file?: File) => setCurrentCSV(file)
+  const [currentCSV, setCurrentCSV] = useState<FileItem>()
+  const handleFile = (file?: FileItem) => setCurrentCSV(file)
 
   const handleSend = () => {
     if (!currentCSV)
@@ -56,7 +56,7 @@ const BatchModal: FC<IBatchModalProps> = ({
           <Button className='mr-2' onClick={onCancel}>
             {t('datasetDocuments.list.batchModal.cancel')}
           </Button>
-          <Button variant="primary" onClick={handleSend} disabled={!currentCSV}>
+          <Button variant="primary" onClick={handleSend} disabled={!currentCSV || !currentCSV.file || !currentCSV.file.id}>
            {t('datasetDocuments.list.batchModal.run')}
          </Button>
        </div>

web/app/components/datasets/documents/detail/index.tsx  +3 -5

@@ -17,7 +17,7 @@ import cn from '@/utils/classnames'
 import Divider from '@/app/components/base/divider'
 import Loading from '@/app/components/base/loading'
 import { ToastContext } from '@/app/components/base/toast'
-import type { ChunkingMode, ParentMode, ProcessMode } from '@/models/datasets'
+import type { ChunkingMode, FileItem, ParentMode, ProcessMode } from '@/models/datasets'
 import { useDatasetDetailContext } from '@/context/dataset-detail'
 import FloatRightContainer from '@/app/components/base/float-right-container'
 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
@@ -111,12 +111,10 @@ const DocumentDetail: FC<Props> = ({ datasetId, documentId }) => {
   }
 
   const { mutateAsync: segmentBatchImport } = useSegmentBatchImport()
-  const runBatch = async (csv: File) => {
-    const formData = new FormData()
-    formData.append('file', csv)
+  const runBatch = async (csv: FileItem) => {
     await segmentBatchImport({
       url: `/datasets/${datasetId}/documents/${documentId}/segments/batch_import`,
-      body: formData,
+      body: { upload_file_id: csv.file.id! },
     }, {
       onSuccess: (res) => {
         setImportStatus(res.job_status)

web/service/knowledge/use-segment.ts  +2 -2

@@ -154,9 +154,9 @@ export const useUpdateChildSegment = () => {
 export const useSegmentBatchImport = () => {
   return useMutation({
     mutationKey: [NAME_SPACE, 'batchImport'],
-    mutationFn: (payload: { url: string; body: FormData }) => {
+    mutationFn: (payload: { url: string; body: { upload_file_id: string } }) => {
       const { url, body } = payload
-      return post<BatchImportResponse>(url, { body }, { bodyStringify: false, deleteContentType: true })
+      return post<BatchImportResponse>(url, { body })
     },
   })
 }
