
Fix/upload limit (#2521)

Co-authored-by: jyong <jyong@dify.ai>
Co-authored-by: StyleZhang <jasonapring2015@outlook.com>
tags/0.5.7
Jyong · 1 year ago
Commit 97fe817186

api/.env.example  (+2 / -0)

 SSRF_PROXY_HTTP_URL=
 SSRF_PROXY_HTTPS_URL=
+
+BATCH_UPLOAD_LIMIT=10

api/config.py  (+3 / -0)

     'BILLING_ENABLED': 'False',
     'CAN_REPLACE_LOGO': 'False',
     'ETL_TYPE': 'dify',
+    'BATCH_UPLOAD_LIMIT': 20
 }

         self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED')
         self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO')
+
+        self.BATCH_UPLOAD_LIMIT = get_env('BATCH_UPLOAD_LIMIT')


 class CloudEditionConfig(Config):
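The default of 20 in config.py applies when the BATCH_UPLOAD_LIMIT variable is not set, while .env.example documents an explicit override of 10. A minimal sketch of the env-with-default pattern this relies on; the get_env name comes from the diff, its body here is an assumption:

    import os

    # Fallback values used when the environment does not provide one
    # (mirrors the DEFAULTS dict shown in api/config.py above).
    DEFAULTS = {
        'BATCH_UPLOAD_LIMIT': 20,
    }

    def get_env(key):
        # Assumed behaviour: prefer the environment variable, otherwise
        # fall back to the DEFAULTS entry for that key.
        return os.environ.get(key, DEFAULTS.get(key))

    # Callers cast to int because the env value arrives as a string.
    batch_upload_limit = int(get_env('BATCH_UPLOAD_LIMIT'))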



api/core/indexing_runner.py  (+17 / -0)

 from models.dataset import Document as DatasetDocument
 from models.model import UploadFile
 from models.source import DataSourceBinding
+from services.feature_service import FeatureService


 class IndexingRunner:

         """
         Estimate the indexing for the document.
         """
+        # check document limit
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(file_details)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")

         embedding_model_instance = None
         if dataset_id:
             dataset = Dataset.query.filter_by(

         """
         Estimate the indexing for the document.
         """
+        # check document limit
+        features = FeatureService.get_features(tenant_id)
+        if features.billing.enabled:
+            count = len(notion_info_list)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")

         embedding_model_instance = None
         if dataset_id:
             dataset = Dataset.query.filter_by(
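All of the new checks read a features object returned by FeatureService.get_features(tenant_id). A hedged sketch of the fields they rely on, inferred only from how this diff uses them; the field names come from the diff, the dataclass layout is an assumption:

    from dataclasses import dataclass

    @dataclass
    class LimitationModel:
        # generic usage/ceiling pair, as used for vector_space and
        # annotation_quota_limit in the checks added by this commit
        size: int = 0
        limit: int = 0

    @dataclass
    class BillingModel:
        enabled: bool = False

    @dataclass
    class FeatureModel:
        billing: BillingModel
        vector_space: LimitationModel
        annotation_quota_limit: LimitationModel

    # Illustrative instance only; real values come from the billing service.
    features = FeatureModel(
        billing=BillingModel(enabled=True),
        vector_space=LimitationModel(size=10, limit=200),
        annotation_quota_limit=LimitationModel(size=0, limit=100),
    )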

api/services/annotation_service.py  (+7 / -0)

 from extensions.ext_database import db
 from extensions.ext_redis import redis_client
 from models.model import App, AppAnnotationHitHistory, AppAnnotationSetting, Message, MessageAnnotation
+from services.feature_service import FeatureService
 from tasks.annotation.add_annotation_to_index_task import add_annotation_to_index_task
 from tasks.annotation.batch_import_annotations_task import batch_import_annotations_task
 from tasks.annotation.delete_annotation_index_task import delete_annotation_index_task

             result.append(content)
         if len(result) == 0:
             raise ValueError("The CSV file is empty.")
+        # check annotation limit
+        features = FeatureService.get_features(current_user.current_tenant_id)
+        if features.billing.enabled:
+            annotation_quota_limit = features.annotation_quota_limit
+            if annotation_quota_limit.limit < len(result) + annotation_quota_limit.size:
+                raise ValueError("The number of annotations exceeds the limit of your subscription.")
         # async job
         job_id = str(uuid.uuid4())
         indexing_cache_key = 'app_annotation_batch_import_{}'.format(str(job_id))
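The annotation check rejects a CSV import when the annotations already stored plus the new rows would exceed the plan ceiling. A small example of that arithmetic; the numbers are invented for illustration:

    # Suppose the plan allows 100 annotations, 95 already exist, and the
    # CSV adds 10 more.
    limit, size, new_rows = 100, 95, 10

    # 100 < 95 + 10 -> True, so the service would raise before queueing
    # the async batch_import_annotations_task job.
    would_reject = limit < new_rows + size
    print(would_reject)  # True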

api/services/dataset_service.py  (+22 / -10)

 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.feature_service import FeatureService
 from services.vector_service import VectorService
 from tasks.clean_notion_document_task import clean_notion_document_task
 from tasks.deal_dataset_vector_index_task import deal_dataset_vector_index_task

                       created_from: str = 'web'):

         # check document limit
-        if current_app.config['EDITION'] == 'CLOUD':
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
             if 'original_document_id' not in document_data or not document_data['original_document_id']:
                 count = 0
                 if document_data["data_source"]["type"] == "upload_file":
                     notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
                     for notion_info in notion_info_list:
                         count = count + len(notion_info['pages'])
+                batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+                if count > batch_upload_limit:
+                    raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
         # if dataset is empty, update dataset data_source_type
         if not dataset.data_source_type:
             dataset.data_source_type = document_data["data_source"]["type"]

     @staticmethod
     def save_document_without_dataset_id(tenant_id: str, document_data: dict, account: Account):
-        count = 0
-        if document_data["data_source"]["type"] == "upload_file":
-            upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
-            count = len(upload_file_list)
-        elif document_data["data_source"]["type"] == "notion_import":
-            notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
-            for notion_info in notion_info_list:
-                count = count + len(notion_info['pages'])
+        features = FeatureService.get_features(current_user.current_tenant_id)
+
+        if features.billing.enabled:
+            count = 0
+            if document_data["data_source"]["type"] == "upload_file":
+                upload_file_list = document_data["data_source"]["info_list"]['file_info_list']['file_ids']
+                count = len(upload_file_list)
+            elif document_data["data_source"]["type"] == "notion_import":
+                notion_info_list = document_data["data_source"]['info_list']['notion_info_list']
+                for notion_info in notion_info_list:
+                    count = count + len(notion_info['pages'])
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")

         embedding_model = None
         dataset_collection_binding_id = None

             segment.answer = args['answer']
         if 'keywords' in args and args['keywords']:
             segment.keywords = args['keywords']
-        if'enabled' in args and args['enabled'] is not None:
+        if 'enabled' in args and args['enabled'] is not None:
             segment.enabled = args['enabled']
         db.session.add(segment)
         db.session.commit()
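Both save paths derive the batch size from the data_source payload before comparing it to BATCH_UPLOAD_LIMIT: upload batches are counted by file id, Notion imports by the total number of selected pages. A hedged, stand-alone sketch of that counting step; the payload shape is taken from the diff, the helper name is illustrative and not part of the commit:

    def count_documents_in_batch(document_data: dict) -> int:
        # upload_file batches: one document per uploaded file id;
        # notion_import batches: sum of pages across all selected workspaces.
        source = document_data["data_source"]
        if source["type"] == "upload_file":
            return len(source["info_list"]["file_info_list"]["file_ids"])
        if source["type"] == "notion_import":
            return sum(len(info["pages"]) for info in source["info_list"]["notion_info_list"])
        return 0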

api/services/file_service.py  (+2 / -2)

 IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
 IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])

-ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv'] + IMAGE_EXTENSIONS
+ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
 UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
-                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml'] + IMAGE_EXTENSIONS
+                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
 PREVIEW_WORDS_LIMIT = 3000





api/tasks/document_indexing_task.py  (+32 / -1)

 import click
 from celery import shared_task
+from flask import current_app

 from core.indexing_runner import DocumentIsPausedException, IndexingRunner
 from extensions.ext_database import db
-from models.dataset import Document
+from models.dataset import Dataset, Document
+from services.feature_service import FeatureService


 @shared_task(queue='dataset')

     """
     documents = []
     start_at = time.perf_counter()

+    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+
+    # check document limit
+    features = FeatureService.get_features(dataset.tenant_id)
+    try:
+        if features.billing.enabled:
+            vector_space = features.vector_space
+            count = len(document_ids)
+            batch_upload_limit = int(current_app.config['BATCH_UPLOAD_LIMIT'])
+            if count > batch_upload_limit:
+                raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+            if 0 < vector_space.limit <= vector_space.size:
+                raise ValueError("Your total number of documents plus the number of uploads have over the limit of "
+                                 "your subscription.")
+    except Exception as e:
+        for document_id in document_ids:
+            document = db.session.query(Document).filter(
+                Document.id == document_id,
+                Document.dataset_id == dataset_id
+            ).first()
+            if document:
+                document.indexing_status = 'error'
+                document.error = str(e)
+                document.stopped_at = datetime.datetime.utcnow()
+                db.session.add(document)
+        db.session.commit()
+        return

     for document_id in document_ids:
         logging.info(click.style('Start process document: {}'.format(document_id), fg='green'))
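In the task, any pre-check failure marks every document in the batch as 'error' and returns before indexing starts. The vector-space guard treats a limit of 0 as unlimited and only blocks once a positive limit has been reached. A small example of how that condition evaluates; the numbers are invented for illustration:

    # blocked only when 0 < limit <= size
    cases = [
        (0, 5000),   # unlimited plan -> allowed
        (200, 150),  # below the cap  -> allowed
        (200, 200),  # cap reached    -> blocked
    ]
    for limit, size in cases:
        blocked = 0 < limit <= size
        print(f"limit={limit}, size={size}, blocked={blocked}")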



web/app/components/datasets/create/file-uploader/index.tsx  (+8 / -1)

 import I18n from '@/context/i18n'
 import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language'

+const FILES_NUMBER_LIMIT = 20
+
 type IFileUploaderProps = {
   fileList: FileItem[]
   titleClassName?: string

     if (!files.length)
       return false

+    if (files.length + fileList.length > FILES_NUMBER_LIMIT) {
+      notify({ type: 'error', message: t('datasetCreation.stepOne.uploader.validation.filesNumber', { filesNumber: FILES_NUMBER_LIMIT }) })
+      return false
+    }
+
     const preparedFiles = files.map((file, index) => ({
       fileID: `file${index}-${Date.now()}`,
       file,

     prepareFileList(newFiles)
     fileListRef.current = newFiles
     uploadMultipleFiles(preparedFiles)
-  }, [prepareFileList, uploadMultipleFiles])
+  }, [prepareFileList, uploadMultipleFiles, notify, t, fileList])

   const handleDragEnter = (e: DragEvent) => {
     e.preventDefault()

web/i18n/lang/dataset-creation.en.ts  (+1 / -0)

       typeError: 'File type not supported',
       size: 'File too large. Maximum is {{size}}MB',
       count: 'Multiple files not supported',
+      filesNumber: 'You have reached the batch upload limit of {{filesNumber}}.',
     },
     cancel: 'Cancel',
     change: 'Change',

web/i18n/lang/dataset-creation.pt.ts  (+1 / -0)

       typeError: 'Tipo de arquivo não suportado',
       size: 'Arquivo muito grande. Máximo é {{size}}MB',
       count: 'Vários arquivos não suportados',
+      filesNumber: 'Limite de upload em massa {{filesNumber}}.',
     },
     cancel: 'Cancelar',
     change: 'Alterar',

web/i18n/lang/dataset-creation.uk.ts  (+1 / -0)

       typeError: 'Тип файлу не підтримується',
       size: 'Файл занадто великий. Максимум – {{size}} МБ',
       count: 'Не підтримується завантаження кількох файлів',
+      filesNumber: 'Ліміт масового завантаження {{filesNumber}}.',
     },
     cancel: 'Скасувати',
     change: 'Змінити',

web/i18n/lang/dataset-creation.zh.ts  (+1 / -0)

       typeError: '文件类型不支持',
       size: '文件太大了,不能超过 {{size}}MB',
       count: '暂不支持多个文件',
+      filesNumber: '批量上传限制 {{filesNumber}}。',
     },
     cancel: '取消',
     change: '更改文件',
