Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>

tags/0.5.6
| @@ -1,7 +1,7 @@ | |||
| import logging | |||
| from flask import request | |||
| from flask_restful import Resource | |||
| from flask_restful import Resource, reqparse | |||
| from werkzeug.exceptions import InternalServerError | |||
| import services | |||
| @@ -23,6 +23,7 @@ from controllers.console.wraps import account_initialization_required | |||
| from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError | |||
| from core.model_runtime.errors.invoke import InvokeError | |||
| from libs.login import login_required | |||
| from models.model import AppModelConfig | |||
| from services.audio_service import AudioService | |||
| from services.errors.audio import ( | |||
| AudioTooLargeServiceError, | |||
| @@ -45,7 +46,9 @@ class ChatMessageAudioApi(Resource): | |||
| try: | |||
| response = AudioService.transcript_asr( | |||
| tenant_id=app_model.tenant_id, | |||
| file=file | |||
| file=file, | |||
| end_user=None, | |||
| prompt=app_model.app_model_config.pre_prompt | |||
| ) | |||
| return response | |||
| @@ -71,7 +74,7 @@ class ChatMessageAudioApi(Resource): | |||
| except ValueError as e: | |||
| raise e | |||
| except Exception as e: | |||
| logging.exception("internal server error.") | |||
| logging.exception(f"internal server error, {str(e)}.") | |||
| raise InternalServerError() | |||
| @@ -82,10 +85,17 @@ class ChatMessageTextApi(Resource): | |||
| def post(self, app_id): | |||
| app_id = str(app_id) | |||
| app_model = _get_app(app_id, None) | |||
| app_model_config: AppModelConfig = app_model.app_model_config | |||
| if not app_model_config.text_to_speech_dict['enabled']: | |||
| raise AppUnavailableError() | |||
| try: | |||
| response = AudioService.transcript_tts( | |||
| tenant_id=app_model.tenant_id, | |||
| text=request.form['text'], | |||
| voice=app_model.app_model_config.text_to_speech_dict.get('voice'), | |||
| streaming=False | |||
| ) | |||
| @@ -112,9 +122,54 @@ class ChatMessageTextApi(Resource): | |||
| except ValueError as e: | |||
| raise e | |||
| except Exception as e: | |||
| logging.exception("internal server error.") | |||
| logging.exception(f"internal server error, {str(e)}.") | |||
| raise InternalServerError() | |||
| class TextModesApi(Resource): | |||
| def get(self, app_id: str): | |||
| app_model = _get_app(str(app_id)) | |||
| app_model_config: AppModelConfig = app_model.app_model_config | |||
| if not app_model_config.text_to_speech_dict['enabled']: | |||
| raise AppUnavailableError() | |||
| try: | |||
| parser = reqparse.RequestParser() | |||
| parser.add_argument('language', type=str, required=True, location='args') | |||
| args = parser.parse_args() | |||
| response = AudioService.transcript_tts_voices( | |||
| tenant_id=app_model.tenant_id, | |||
| language=args['language'], | |||
| ) | |||
| return response | |||
| except services.errors.audio.ProviderNotSupportTextToSpeechLanguageServiceError: | |||
| raise AppUnavailableError("Text-to-audio voices language parameter is missing.") | |||
| except NoAudioUploadedServiceError: | |||
| raise NoAudioUploadedError() | |||
| except AudioTooLargeServiceError as e: | |||
| raise AudioTooLargeError(str(e)) | |||
| except UnsupportedAudioTypeServiceError: | |||
| raise UnsupportedAudioTypeError() | |||
| except ProviderNotSupportSpeechToTextServiceError: | |||
| raise ProviderNotSupportSpeechToTextError() | |||
| except ProviderTokenNotInitError as ex: | |||
| raise ProviderNotInitializeError(ex.description) | |||
| except QuotaExceededError: | |||
| raise ProviderQuotaExceededError() | |||
| except ModelCurrentlyNotSupportError: | |||
| raise ProviderModelCurrentlyNotSupportError() | |||
| except InvokeError as e: | |||
| raise CompletionRequestError(e.description) | |||
| except ValueError as e: | |||
| raise e | |||
| except Exception as e: | |||
| logging.exception(f"internal server error, {str(e)}.") | |||
| raise InternalServerError() | |||
| api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text') | |||
| api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio') | |||
| api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices') | |||
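The three routes above expose speech-to-text, text-to-speech, and the new voice listing on the console API. Below is a minimal sketch of exercising the first two from a script; the base URL and bearer-token auth are assumptions, and only the route paths come from the registrations above.

import requests

BASE = "http://localhost:5001/console/api"     # assumption: local console API root
HEADERS = {"Authorization": "Bearer <token>"}  # assumption: console auth scheme
app_id = "00000000-0000-0000-0000-000000000000"

# Speech-to-text: ChatMessageAudioApi expects a multipart file upload.
with open("sample.mp3", "rb") as f:
    text = requests.post(f"{BASE}/apps/{app_id}/audio-to-text",
                         files={"file": f}, headers=HEADERS).json()

# Text-to-speech: ChatMessageTextApi reads request.form['text'].
audio = requests.post(f"{BASE}/apps/{app_id}/text-to-audio",
                      data={"text": "Hello Dify!"}, headers=HEADERS)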
| @@ -85,6 +85,7 @@ class ChatTextApi(InstalledAppResource): | |||
| response = AudioService.transcript_tts( | |||
| tenant_id=app_model.tenant_id, | |||
| text=request.form['text'], | |||
| voice=app_model.app_model_config.text_to_speech_dict.get('voice'), | |||
| streaming=False | |||
| ) | |||
| return {'data': response.data.decode('latin1')} | |||
| @@ -86,6 +86,7 @@ class TextApi(AppApiResource): | |||
| tenant_id=app_model.tenant_id, | |||
| text=args['text'], | |||
| end_user=args['user'], | |||
| voice=app_model.app_model_config.text_to_speech_dict.get('voice'), | |||
| streaming=args['streaming'] | |||
| ) | |||
| @@ -68,17 +68,23 @@ class AudioApi(WebApiResource): | |||
| except ValueError as e: | |||
| raise e | |||
| except Exception as e: | |||
| logging.exception("internal server error.") | |||
| logging.exception(f"internal server error: {str(e)}") | |||
| raise InternalServerError() | |||
| class TextApi(WebApiResource): | |||
| def post(self, app_model: App, end_user): | |||
| app_model_config: AppModelConfig = app_model.app_model_config | |||
| if not app_model_config.text_to_speech_dict['enabled']: | |||
| raise AppUnavailableError() | |||
| try: | |||
| response = AudioService.transcript_tts( | |||
| tenant_id=app_model.tenant_id, | |||
| text=request.form['text'], | |||
| end_user=end_user.external_user_id, | |||
| voice=app_model.app_model_config.text_to_speech_dict.get('voice'), | |||
| streaming=False | |||
| ) | |||
| @@ -105,7 +111,7 @@ class TextApi(WebApiResource): | |||
| except ValueError as e: | |||
| raise e | |||
| except Exception as e: | |||
| logging.exception("internal server error.") | |||
| logging.exception(f"internal server error: {str(e)}") | |||
| raise InternalServerError() | |||
| @@ -28,6 +28,7 @@ from core.entities.application_entities import ( | |||
| ModelConfigEntity, | |||
| PromptTemplateEntity, | |||
| SensitiveWordAvoidanceEntity, | |||
| TextToSpeechEntity, | |||
| ) | |||
| from core.entities.model_entities import ModelStatus | |||
| from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError | |||
| @@ -572,7 +573,11 @@ class ApplicationManager: | |||
| text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech') | |||
| if text_to_speech_dict: | |||
| if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']: | |||
| properties['text_to_speech'] = True | |||
| properties['text_to_speech'] = TextToSpeechEntity( | |||
| enabled=text_to_speech_dict.get('enabled'), | |||
| voice=text_to_speech_dict.get('voice'), | |||
| language=text_to_speech_dict.get('language'), | |||
| ) | |||
| # sensitive word avoidance | |||
| sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance') | |||
| @@ -42,6 +42,7 @@ class AdvancedCompletionPromptTemplateEntity(BaseModel): | |||
| """ | |||
| Advanced Completion Prompt Template Entity. | |||
| """ | |||
| class RolePrefixEntity(BaseModel): | |||
| """ | |||
| Role Prefix Entity. | |||
| @@ -57,6 +58,7 @@ class PromptTemplateEntity(BaseModel): | |||
| """ | |||
| Prompt Template Entity. | |||
| """ | |||
| class PromptType(Enum): | |||
| """ | |||
| Prompt Type. | |||
| @@ -97,6 +99,7 @@ class DatasetRetrieveConfigEntity(BaseModel): | |||
| """ | |||
| Dataset Retrieve Config Entity. | |||
| """ | |||
| class RetrieveStrategy(Enum): | |||
| """ | |||
| Dataset Retrieve Strategy. | |||
| @@ -143,6 +146,15 @@ class SensitiveWordAvoidanceEntity(BaseModel): | |||
| config: dict[str, Any] = {} | |||
| class TextToSpeechEntity(BaseModel): | |||
| """ | |||
| Text to Speech Entity. | |||
| """ | |||
| enabled: bool | |||
| voice: Optional[str] = None | |||
| language: Optional[str] = None | |||
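TextToSpeechEntity is a plain pydantic model, so the text_to_speech dict from an app's model config can be validated into it directly. A minimal sketch, with the class redeclared locally for illustration:

from typing import Optional
from pydantic import BaseModel

class TextToSpeechEntity(BaseModel):  # redeclared locally for illustration
    enabled: bool
    voice: Optional[str] = None
    language: Optional[str] = None

tts = TextToSpeechEntity(**{'enabled': True, 'voice': 'alloy', 'language': 'en-US'})
assert tts.voice == 'alloy'
disabled = TextToSpeechEntity(enabled=False)  # voice/language default to None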
| class FileUploadEntity(BaseModel): | |||
| """ | |||
| File Upload Entity. | |||
| @@ -159,6 +171,7 @@ class AgentToolEntity(BaseModel): | |||
| tool_name: str | |||
| tool_parameters: dict[str, Any] = {} | |||
| class AgentPromptEntity(BaseModel): | |||
| """ | |||
| Agent Prompt Entity. | |||
| @@ -166,6 +179,7 @@ class AgentPromptEntity(BaseModel): | |||
| first_prompt: str | |||
| next_iteration: str | |||
| class AgentScratchpadUnit(BaseModel): | |||
| """ | |||
| Agent First Prompt Entity. | |||
| @@ -182,12 +196,14 @@ class AgentScratchpadUnit(BaseModel): | |||
| thought: Optional[str] = None | |||
| action_str: Optional[str] = None | |||
| observation: Optional[str] = None | |||
| action: Optional[Action] = None | |||
| action: Optional[Action] = None | |||
| class AgentEntity(BaseModel): | |||
| """ | |||
| Agent Entity. | |||
| """ | |||
| class Strategy(Enum): | |||
| """ | |||
| Agent Strategy. | |||
| @@ -202,6 +218,7 @@ class AgentEntity(BaseModel): | |||
| tools: list[AgentToolEntity] = None | |||
| max_iteration: int = 5 | |||
| class AppOrchestrationConfigEntity(BaseModel): | |||
| """ | |||
| App Orchestration Config Entity. | |||
| @@ -219,7 +236,7 @@ class AppOrchestrationConfigEntity(BaseModel): | |||
| show_retrieve_source: bool = False | |||
| more_like_this: bool = False | |||
| speech_to_text: bool = False | |||
| text_to_speech: bool = False | |||
| text_to_speech: dict = {} | |||
| sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None | |||
| @@ -99,7 +99,8 @@ class ModelInstance: | |||
| user=user | |||
| ) | |||
| def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None, | |||
| def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, | |||
| top_n: Optional[int] = None, | |||
| user: Optional[str] = None) \ | |||
| -> RerankResult: | |||
| """ | |||
| @@ -166,13 +167,15 @@ class ModelInstance: | |||
| user=user | |||
| ) | |||
| def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \ | |||
| def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \ | |||
| -> str: | |||
| """ | |||
| Invoke large language model | |||
| Invoke large language tts model | |||
| :param content_text: text content to be translated | |||
| :param tenant_id: user tenant id | |||
| :param user: unique user id | |||
| :param voice: model timbre | |||
| :param streaming: output is streaming | |||
| :return: text for given audio file | |||
| """ | |||
| @@ -185,9 +188,28 @@ class ModelInstance: | |||
| credentials=self.credentials, | |||
| content_text=content_text, | |||
| user=user, | |||
| tenant_id=tenant_id, | |||
| voice=voice, | |||
| streaming=streaming | |||
| ) | |||
| def get_tts_voices(self, language: str) -> list: | |||
| """ | |||
| Invoke large language tts model voices | |||
| :param language: tts language | |||
| :return: tts model voices | |||
| """ | |||
| if not isinstance(self.model_type_instance, TTSModel): | |||
| raise Exception("Model type instance is not TTSModel") | |||
| self.model_type_instance = cast(TTSModel, self.model_type_instance) | |||
| return self.model_type_instance.get_tts_model_voices( | |||
| model=self.model, | |||
| credentials=self.credentials, | |||
| language=language | |||
| ) | |||
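With the widened signatures, callers now pass tenant_id and voice through ModelInstance and can list voices before invoking. A hedged usage sketch mirroring how AudioService calls these methods further below; the import paths and the in-scope tenant_id are assumptions:

from core.model_manager import ModelManager                     # assumed path
from core.model_runtime.entities.model_entities import ModelType

model_manager = ModelManager()
model_instance = model_manager.get_default_model_instance(
    tenant_id=tenant_id,  # assumed to be in scope
    model_type=ModelType.TTS,
)

# Voices the default TTS model exposes for a language.
voices = model_instance.get_tts_voices(language='en-US')

# Non-streaming synthesis with an explicit voice.
audio = model_instance.invoke_tts(
    content_text='Hello Dify!',
    tenant_id=tenant_id,
    voice=voices[0]['value'] if voices else None,
    streaming=False,
)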
| class ModelManager: | |||
| def __init__(self) -> None: | |||
| @@ -48,6 +48,10 @@ | |||
| - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`) | |||
| - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`) | |||
| - `default_voice` (string) Default voice, e.g.: alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`) | |||
| - `voices` (list) List of available voices (available for model type `tts`) | |||
| - `mode` (string) Voice identifier (available for model type `tts`) | |||
| - `name` (string) Voice display name (available for model type `tts`) | |||
| - `language` (string) Languages supported by the voice (available for model type `tts`) | |||
| - `word_limit` (int) Single-conversion word limit, paragraph-wise by default (available for model type `tts`) | |||
| - `audio_type` (string) Supported audio file extension formats, e.g.: mp3, wav (available for model type `tts`) | |||
| - `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`) | |||
| @@ -48,7 +48,11 @@ | |||
| - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用) | |||
| - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用) | |||
| - `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用) | |||
| - `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用) | |||
| - `default_voice` (string) 缺省音色,必选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用) | |||
| - `voices` (list) 可选音色列表。 | |||
| - `mode` (string) 音色模型。(模型类型 `tts` 可用) | |||
| - `name` (string) 音色模型显示名称。(模型类型 `tts` 可用) | |||
| - `language` (string) 音色模型支持语言。(模型类型 `tts` 可用) | |||
| - `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用) | |||
| - `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用) | |||
| - `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用) | |||
| @@ -127,6 +127,7 @@ class ModelPropertyKey(Enum): | |||
| SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions" | |||
| MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk" | |||
| DEFAULT_VOICE = "default_voice" | |||
| VOICES = "voices" | |||
| WORD_LIMIT = "word_limit" | |||
| AUDIO_TYPE = "audio_type" | |||
| MAX_WORKERS = "max_workers" | |||
| @@ -15,29 +15,37 @@ class TTSModel(AIModel): | |||
| """ | |||
| model_type: ModelType = ModelType.TTS | |||
| def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): | |||
| def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, | |||
| user: Optional[str] = None): | |||
| """ | |||
| Invoke large language model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param content_text: text content to be translated | |||
| :param streaming: output is streaming | |||
| :param user: unique user id | |||
| :return: translated audio file | |||
| """ | |||
| try: | |||
| return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text) | |||
| self._is_ffmpeg_installed() | |||
| return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, | |||
| content_text=content_text, voice=voice, tenant_id=tenant_id) | |||
| except Exception as e: | |||
| raise self._transform_invoke_error(e) | |||
| @abstractmethod | |||
| def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): | |||
| def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, | |||
| user: Optional[str] = None): | |||
| """ | |||
| Invoke large language model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param content_text: text content to be translated | |||
| :param streaming: output is streaming | |||
| :param user: unique user id | |||
| @@ -45,7 +53,22 @@ class TTSModel(AIModel): | |||
| """ | |||
| raise NotImplementedError | |||
| def _get_model_voice(self, model: str, credentials: dict) -> any: | |||
| def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list: | |||
| """ | |||
| Get voice for given tts model voices | |||
| :param language: tts language | |||
| :param model: model name | |||
| :param credentials: model credentials | |||
| :return: voices lists | |||
| """ | |||
| model_schema = self.get_model_schema(model, credentials) | |||
| if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties: | |||
| voices = model_schema.model_properties[ModelPropertyKey.VOICES] | |||
| return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language', [])] | |||
| return [] | |||
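A worked example of the comprehension above, using toy entries shaped like the YAML voices lists that follow (values illustrative):

voices = [
    {'mode': 'alloy', 'name': 'Alloy', 'language': ['zh-CN', 'en-US']},
    {'mode': 'sambert-clara-v1', 'name': 'Clara', 'language': ['fr-FR']},
]
language = 'fr-FR'
# Keep voices whose language list contains the requested language,
# exposing 'mode' as the selectable value.
result = [{'name': d['name'], 'value': d['mode']}
          for d in voices if language and language in d.get('language', [])]
assert result == [{'name': 'Clara', 'value': 'sambert-clara-v1'}]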
| def _get_model_default_voice(self, model: str, credentials: dict) -> any: | |||
| """ | |||
| Get voice for given tts model | |||
| @@ -1,7 +1,31 @@ | |||
| model: tts-1-hd | |||
| model: tts-1 | |||
| model_type: tts | |||
| model_properties: | |||
| default_voice: 'alloy' | |||
| voices: | |||
| - mode: 'alloy' | |||
| name: 'Alloy' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'echo' | |||
| name: 'Echo' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'fable' | |||
| name: 'Fable' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'onyx' | |||
| name: 'Onyx' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'nova' | |||
| name: 'Nova' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'shimmer' | |||
| name: 'Shimmer' | |||
| language: ['zh-CN', 'en-US'] | |||
| word_limit: 120 | |||
| audio_type: 'mp3' | |||
| max_workers: 5 | |||
| pricing: | |||
| input: '0.03' | |||
| output: '0' | |||
| unit: '0.001' | |||
| currency: USD | |||
| @@ -2,6 +2,30 @@ model: tts-1 | |||
| model_type: tts | |||
| model_properties: | |||
| default_voice: 'alloy' | |||
| voices: | |||
| - mode: 'alloy' | |||
| name: 'Alloy' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'echo' | |||
| name: 'Echo' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'fable' | |||
| name: 'Fable' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'onyx' | |||
| name: 'Onyx' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'nova' | |||
| name: 'Nova' | |||
| language: ['zh-CN', 'en-US'] | |||
| - mode: 'shimmer' | |||
| name: 'Shimmer' | |||
| language: ['zh-CN', 'en-US'] | |||
| word_limit: 120 | |||
| audio_type: 'mp3' | |||
| max_workers: 5 | |||
| pricing: | |||
| input: '0.015' | |||
| output: '0' | |||
| unit: '0.001' | |||
| currency: USD | |||
| @@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError | |||
| from core.model_runtime.errors.validate import CredentialsValidateFailedError | |||
| from core.model_runtime.model_providers.__base.tts_model import TTSModel | |||
| from core.model_runtime.model_providers.openai._common import _CommonOpenAI | |||
| from extensions.ext_storage import storage | |||
| class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): | |||
| """ | |||
| Model class for OpenAI text-to-speech model. | |||
| """ | |||
| def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any: | |||
| def _invoke(self, model: str, tenant_id: str, credentials: dict, | |||
| content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any: | |||
| """ | |||
| _invoke text2speech model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param content_text: text content to be translated | |||
| :param voice: model timbre | |||
| :param streaming: output is streaming | |||
| :param user: unique user id | |||
| :return: text translated to audio file | |||
| """ | |||
| self._is_ffmpeg_installed() | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| if not voice: | |||
| voice = self._get_model_default_voice(model, credentials) | |||
| if streaming: | |||
| return Response(stream_with_context(self._tts_invoke_streaming(model=model, | |||
| credentials=credentials, | |||
| content_text=content_text, | |||
| user=user)), | |||
| tenant_id=tenant_id, | |||
| voice=voice)), | |||
| status=200, mimetype=f'audio/{audio_type}') | |||
| else: | |||
| return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user) | |||
| return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice) | |||
| def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: | |||
| """ | |||
| @@ -52,91 +59,96 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): | |||
| self._tts_invoke( | |||
| model=model, | |||
| credentials=credentials, | |||
| content_text='Hello world!', | |||
| user=user | |||
| content_text='Hello Dify!', | |||
| voice=self._get_model_default_voice(model, credentials), | |||
| ) | |||
| except Exception as ex: | |||
| raise CredentialsValidateFailedError(str(ex)) | |||
| def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response: | |||
| def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response: | |||
| """ | |||
| _tts_invoke text2speech model | |||
| :param model: model name | |||
| :param credentials: model credentials | |||
| :param content_text: text content to be translated | |||
| :param user: unique user id | |||
| :param voice: model timbre | |||
| :return: text translated to audio file | |||
| """ | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| word_limit = self._get_model_word_limit(model, credentials) | |||
| max_workers = self._get_model_workers_limit(model, credentials) | |||
| try: | |||
| sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) | |||
| audio_bytes_list = list() | |||
| # Create a thread pool and map the function to the list of sentences | |||
| with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: | |||
| futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence | |||
| in sentences] | |||
| futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice, | |||
| credentials=credentials) for sentence in sentences] | |||
| for future in futures: | |||
| try: | |||
| audio_bytes_list.append(future.result()) | |||
| if future.result(): | |||
| audio_bytes_list.append(future.result()) | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
| audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in | |||
| audio_bytes_list if audio_bytes] | |||
| combined_segment = reduce(lambda x, y: x + y, audio_segments) | |||
| buffer: BytesIO = BytesIO() | |||
| combined_segment.export(buffer, format=audio_type) | |||
| buffer.seek(0) | |||
| return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") | |||
| if len(audio_bytes_list) > 0: | |||
| audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in | |||
| audio_bytes_list if audio_bytes] | |||
| combined_segment = reduce(lambda x, y: x + y, audio_segments) | |||
| buffer: BytesIO = BytesIO() | |||
| combined_segment.export(buffer, format=audio_type) | |||
| buffer.seek(0) | |||
| return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
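_tts_invoke fans the sentences out to a thread pool and merges the per-sentence audio with pydub. A condensed sketch of the same pattern; synthesize is a hypothetical stand-in for the per-sentence client.audio.speech.create(...).read() call:

import concurrent.futures
from functools import reduce
from io import BytesIO

from pydub import AudioSegment

def synthesize(sentence: str) -> bytes:
    raise NotImplementedError  # stand-in for the per-sentence TTS API call

def merge_tts(sentences: list[str], audio_type: str = 'mp3', max_workers: int = 5) -> bytes:
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(synthesize, s) for s in sentences]
        chunks = [f.result() for f in futures if f.result()]
    if not chunks:
        return b''
    # Decode each per-sentence chunk and append them in order;
    # pydub overloads '+' to concatenate AudioSegments.
    segments = [AudioSegment.from_file(BytesIO(b), format=audio_type) for b in chunks]
    combined = reduce(lambda x, y: x + y, segments)
    buffer = BytesIO()
    combined.export(buffer, format=audio_type)
    return buffer.getvalue()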
| # TODO: Improve the streaming function | |||
| def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any: | |||
| def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str, | |||
| voice: str) -> any: | |||
| """ | |||
| _tts_invoke_streaming text2speech model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param content_text: text content to be translated | |||
| :param user: unique user id | |||
| :param voice: model timbre | |||
| :return: text translated to audio file | |||
| """ | |||
| # transform credentials to kwargs for model instance | |||
| credentials_kwargs = self._to_credential_kwargs(credentials) | |||
| voice_name = self._get_model_voice(model, credentials) | |||
| if not voice: | |||
| voice = self._get_model_default_voice(model, credentials) | |||
| word_limit = self._get_model_word_limit(model, credentials) | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| tts_file_id = self._get_file_name(content_text) | |||
| file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}' | |||
| file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}' | |||
| try: | |||
| client = OpenAI(**credentials_kwargs) | |||
| sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) | |||
| for sentence in sentences: | |||
| response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip()) | |||
| response.stream_to_file(file_path) | |||
| response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) | |||
| # response.stream_to_file(file_path) | |||
| storage.save(file_path, response.read()) | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
| def _process_sentence(self, sentence: str, model: str, credentials: dict): | |||
| def _process_sentence(self, sentence: str, model: str, | |||
| voice, credentials: dict): | |||
| """ | |||
| _tts_invoke openai text2speech model api | |||
| :param model: model name | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param sentence: text content to be translated | |||
| :return: text translated to audio file | |||
| """ | |||
| # transform credentials to kwargs for model instance | |||
| credentials_kwargs = self._to_credential_kwargs(credentials) | |||
| voice_name = self._get_model_voice(model, credentials) | |||
| client = OpenAI(**credentials_kwargs) | |||
| response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip()) | |||
| response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) | |||
| if isinstance(response.read(), bytes): | |||
| return response.read() | |||
| @@ -1,7 +1,134 @@ | |||
| model: tts-1 | |||
| model_type: tts | |||
| model_properties: | |||
| default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置 | |||
| default_voice: 'sambert-zhiru-v1' | |||
| voices: | |||
| - mode: "sambert-zhinan-v1" | |||
| name: "知楠(广告男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiqi-v1" | |||
| name: "知琪(温柔女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhichu-v1" | |||
| name: "知厨(新闻播报)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhide-v1" | |||
| name: "知德(新闻男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhijia-v1" | |||
| name: "知佳(标准女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiru-v1" | |||
| name: "知茹(新闻女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiqian-v1" | |||
| name: "知倩(配音解说、新闻播报)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhixiang-v1" | |||
| name: "知祥(配音解说)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiwei-v1" | |||
| name: "知薇(萝莉女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhihao-v1" | |||
| name: "知浩(咨询男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhijing-v1" | |||
| name: "知婧(严厉女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiming-v1" | |||
| name: "知茗(诙谐男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhimo-v1" | |||
| name: "知墨(情感男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhina-v1" | |||
| name: "知娜(浙普女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhishu-v1" | |||
| name: "知树(资讯男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhistella-v1" | |||
| name: "知莎(知性女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiting-v1" | |||
| name: "知婷(电台女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhixiao-v1" | |||
| name: "知笑(资讯女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiya-v1" | |||
| name: "知雅(严厉女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiye-v1" | |||
| name: "知晔(青年男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiying-v1" | |||
| name: "知颖(软萌童声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhiyuan-v1" | |||
| name: "知媛(知心姐姐)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhigui-v1" | |||
| name: "知柜(直播女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhishuo-v1" | |||
| name: "知硕(自然男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhimiao-emo-v1" | |||
| name: "知妙(多种情感女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhimao-v1" | |||
| name: "知猫(直播女声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhilun-v1" | |||
| name: "知伦(悬疑解说)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhifei-v1" | |||
| name: "知飞(激昂解说)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-zhida-v1" | |||
| name: "知达(标准男声)" | |||
| language: [ "zh-CN", "en-US" ] | |||
| - mode: "sambert-camila-v1" | |||
| name: "Camila(西班牙语女声)" | |||
| language: [ "es-ES" ] | |||
| - mode: "sambert-perla-v1" | |||
| name: "Perla(意大利语女声)" | |||
| language: [ "it-IT" ] | |||
| - mode: "sambert-indah-v1" | |||
| name: "Indah(印尼语女声)" | |||
| language: [ "id-ID" ] | |||
| - mode: "sambert-clara-v1" | |||
| name: "Clara(法语女声)" | |||
| language: [ "fr-FR" ] | |||
| - mode: "sambert-hanna-v1" | |||
| name: "Hanna(德语女声)" | |||
| language: [ "de-DE" ] | |||
| - mode: "sambert-beth-v1" | |||
| name: "Beth(咨询女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-betty-v1" | |||
| name: "Betty(客服女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-cally-v1" | |||
| name: "Cally(自然女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-cindy-v1" | |||
| name: "Cindy(对话女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-eva-v1" | |||
| name: "Eva(陪伴女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-donna-v1" | |||
| name: "Donna(教育女声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-brian-v1" | |||
| name: "Brian(客服男声)" | |||
| language: [ "en-US" ] | |||
| - mode: "sambert-waan-v1" | |||
| name: "Waan(泰语女声)" | |||
| language: [ "th-TH" ] | |||
| word_limit: 120 | |||
| audio_type: 'mp3' | |||
| max_workers: 5 | |||
| @@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError | |||
| from core.model_runtime.errors.validate import CredentialsValidateFailedError | |||
| from core.model_runtime.model_providers.__base.tts_model import TTSModel | |||
| from core.model_runtime.model_providers.tongyi._common import _CommonTongyi | |||
| from extensions.ext_storage import storage | |||
| class TongyiText2SpeechModel(_CommonTongyi, TTSModel): | |||
| """ | |||
| Model class for Tongyi text-to-speech model. | |||
| """ | |||
| def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any: | |||
| def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, | |||
| user: Optional[str] = None) -> any: | |||
| """ | |||
| _invoke text2speech model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param content_text: text content to be translated | |||
| :param streaming: output is streaming | |||
| :param user: unique user id | |||
| :return: text translated to audio file | |||
| """ | |||
| self._is_ffmpeg_installed() | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| if not voice: | |||
| voice = self._get_model_default_voice(model, credentials) | |||
| if streaming: | |||
| return Response(stream_with_context(self._tts_invoke_streaming(model=model, | |||
| credentials=credentials, | |||
| content_text=content_text, | |||
| user=user)), | |||
| voice=voice, | |||
| tenant_id=tenant_id)), | |||
| status=200, mimetype=f'audio/{audio_type}') | |||
| else: | |||
| return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user) | |||
| return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice) | |||
| def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: | |||
| """ | |||
| @@ -52,91 +59,96 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel): | |||
| self._tts_invoke( | |||
| model=model, | |||
| credentials=credentials, | |||
| content_text='Hello world!', | |||
| user=user | |||
| content_text='Hello Dify!', | |||
| voice=self._get_model_default_voice(model, credentials), | |||
| ) | |||
| except Exception as ex: | |||
| raise CredentialsValidateFailedError(str(ex)) | |||
| def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response: | |||
| def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response: | |||
| """ | |||
| _tts_invoke text2speech model | |||
| :param model: model name | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param content_text: text content to be translated | |||
| :param user: unique user id | |||
| :return: text translated to audio file | |||
| """ | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| word_limit = self._get_model_word_limit(model, credentials) | |||
| max_workers = self._get_model_workers_limit(model, credentials) | |||
| try: | |||
| sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) | |||
| audio_bytes_list = list() | |||
| # Create a thread pool and map the function to the list of sentences | |||
| with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: | |||
| futures = [executor.submit(self._process_sentence, model=model, sentence=sentence, | |||
| credentials=credentials, audio_type=audio_type) for sentence in sentences] | |||
| futures = [executor.submit(self._process_sentence, sentence=sentence, | |||
| credentials=credentials, voice=voice, audio_type=audio_type) for sentence in | |||
| sentences] | |||
| for future in futures: | |||
| try: | |||
| audio_bytes_list.append(future.result()) | |||
| if future.result(): | |||
| audio_bytes_list.append(future.result()) | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
| audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in | |||
| audio_bytes_list if audio_bytes] | |||
| combined_segment = reduce(lambda x, y: x + y, audio_segments) | |||
| buffer: BytesIO = BytesIO() | |||
| combined_segment.export(buffer, format=audio_type) | |||
| buffer.seek(0) | |||
| return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") | |||
| if len(audio_bytes_list) > 0: | |||
| audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in | |||
| audio_bytes_list if audio_bytes] | |||
| combined_segment = reduce(lambda x, y: x + y, audio_segments) | |||
| buffer: BytesIO = BytesIO() | |||
| combined_segment.export(buffer, format=audio_type) | |||
| buffer.seek(0) | |||
| return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
| # TODO: Improve the streaming function | |||
| def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any: | |||
| def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str, | |||
| voice: str) -> any: | |||
| """ | |||
| _tts_invoke_streaming text2speech model | |||
| :param model: model name | |||
| :param tenant_id: user tenant id | |||
| :param credentials: model credentials | |||
| :param voice: model timbre | |||
| :param content_text: text content to be translated | |||
| :param user: unique user id | |||
| :return: text translated to audio file | |||
| """ | |||
| # transform credentials to kwargs for model instance | |||
| dashscope.api_key = credentials.get('dashscope_api_key') | |||
| voice_name = self._get_model_voice(model, credentials) | |||
| word_limit = self._get_model_word_limit(model, credentials) | |||
| audio_type = self._get_model_audio_type(model, credentials) | |||
| tts_file_id = self._get_file_name(content_text) | |||
| file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}' | |||
| try: | |||
| sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) | |||
| for sentence in sentences: | |||
| response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), | |||
| response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000, | |||
| text=sentence.strip(), | |||
| format=audio_type, word_timestamp_enabled=True, | |||
| phoneme_timestamp_enabled=True) | |||
| if isinstance(response.get_audio_data(), bytes): | |||
| return response.get_audio_data() | |||
| storage.save(file_path, response.get_audio_data()) | |||
| except Exception as ex: | |||
| raise InvokeBadRequestError(str(ex)) | |||
| def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str): | |||
| @staticmethod | |||
| def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str): | |||
| """ | |||
| _tts_invoke Tongyi text2speech model api | |||
| :param model: model name | |||
| :param credentials: model credentials | |||
| :param sentence: text content to be translated | |||
| :param voice: model timbre | |||
| :param audio_type: audio file type | |||
| :return: text translated to audio file | |||
| """ | |||
| # transform credentials to kwargs for model instance | |||
| dashscope.api_key = credentials.get('dashscope_api_key') | |||
| voice_name = self._get_model_voice(model, credentials) | |||
| response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type) | |||
| response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000, | |||
| text=sentence.strip(), | |||
| format=audio_type) | |||
| if isinstance(response.get_audio_data(), bytes): | |||
| return response.get_audio_data() | |||
| @@ -98,7 +98,9 @@ class AppModelConfigService: | |||
| # text_to_speech | |||
| if 'text_to_speech' not in config or not config["text_to_speech"]: | |||
| config["text_to_speech"] = { | |||
| "enabled": False | |||
| "enabled": False, | |||
| "voice": "", | |||
| "language": "" | |||
| } | |||
| if not isinstance(config["text_to_speech"], dict): | |||
| @@ -106,6 +108,8 @@ class AppModelConfigService: | |||
| if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]: | |||
| config["text_to_speech"]["enabled"] = False | |||
| config["text_to_speech"]["voice"] = "" | |||
| config["text_to_speech"]["language"] = "" | |||
| if not isinstance(config["text_to_speech"]["enabled"], bool): | |||
| raise ValueError("enabled in text_to_speech must be of boolean type") | |||
| @@ -13,14 +13,14 @@ from services.errors.audio import ( | |||
| UnsupportedAudioTypeServiceError, | |||
| ) | |||
| FILE_SIZE = 15 | |||
| FILE_SIZE = 30 | |||
| FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024 | |||
| ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr'] | |||
| class AudioService: | |||
| @classmethod | |||
| def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None): | |||
| def transcript_asr(cls, tenant_id: str, file: FileStorage, prompt: str, end_user: Optional[str] = None): | |||
| if file is None: | |||
| raise NoAudioUploadedServiceError() | |||
| @@ -49,7 +49,7 @@ class AudioService: | |||
| return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)} | |||
| @classmethod | |||
| def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None): | |||
| def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None): | |||
| model_manager = ModelManager() | |||
| model_instance = model_manager.get_default_model_instance( | |||
| tenant_id=tenant_id, | |||
| @@ -59,6 +59,21 @@ class AudioService: | |||
| raise ProviderNotSupportTextToSpeechServiceError() | |||
| try: | |||
| return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming) | |||
| return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming, tenant_id=tenant_id, voice=voice) | |||
| except Exception as e: | |||
| raise e | |||
| @classmethod | |||
| def transcript_tts_voices(cls, tenant_id: str, language: str): | |||
| model_manager = ModelManager() | |||
| model_instance = model_manager.get_default_model_instance( | |||
| tenant_id=tenant_id, | |||
| model_type=ModelType.TTS | |||
| ) | |||
| if model_instance is None: | |||
| raise ProviderNotSupportTextToSpeechServiceError() | |||
| try: | |||
| return model_instance.get_tts_voices(language) | |||
| except Exception as e: | |||
| raise e | |||
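transcript_tts_voices resolves the tenant's default TTS model and delegates to get_tts_voices, yielding name/value pairs for the frontend selector. A short usage sketch, assuming AudioService is imported as in the controllers above and tenant_id is in scope:

voices = AudioService.transcript_tts_voices(tenant_id=tenant_id, language='zh-CN')
# Shape consumed by the frontend, e.g. [{'name': 'Alloy', 'value': 'alloy'}, ...];
# entries come from the provider YAML 'voices' lists shown earlier.
for v in voices:
    print(v['name'], v['value'])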
| @@ -16,3 +16,7 @@ class ProviderNotSupportSpeechToTextServiceError(Exception): | |||
| class ProviderNotSupportTextToSpeechServiceError(Exception): | |||
| pass | |||
| class ProviderNotSupportTextToSpeechLanguageServiceError(Exception): | |||
| pass | |||
| @@ -2,6 +2,7 @@ | |||
| import type { FC, ReactNode } from 'react' | |||
| import React from 'react' | |||
| import cn from 'classnames' | |||
| import ParamsConfig from '@/app/components/app/configuration/config-voice/param-config' | |||
| export type IFeaturePanelProps = { | |||
| className?: string | |||
| @@ -12,6 +13,7 @@ export type IFeaturePanelProps = { | |||
| isFocus?: boolean | |||
| noBodySpacing?: boolean | |||
| children?: ReactNode | |||
| isShowTextToSpeech?: boolean | |||
| } | |||
| const FeaturePanel: FC<IFeaturePanelProps> = ({ | |||
| @@ -23,6 +25,7 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({ | |||
| isFocus, | |||
| noBodySpacing, | |||
| children, | |||
| isShowTextToSpeech, | |||
| }) => { | |||
| return ( | |||
| <div | |||
| @@ -41,7 +44,13 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({ | |||
| <div className='text-sm font-semibold text-gray-800'>{title}</div> | |||
| </div> | |||
| <div> | |||
| {headerRight} | |||
| {isShowTextToSpeech | |||
| ? ( | |||
| <div className='flex items-center'> | |||
| <ParamsConfig/> | |||
| </div> | |||
| ) | |||
| : headerRight} | |||
| </div> | |||
| </div> | |||
| </div> | |||
| @@ -0,0 +1,187 @@ | |||
| 'use client' | |||
| import useSWR from 'swr' | |||
| import type { FC } from 'react' | |||
| import { useContext } from 'use-context-selector' | |||
| import React, { Fragment } from 'react' | |||
| import classNames from 'classnames' | |||
| import { usePathname } from 'next/navigation' | |||
| import { useTranslation } from 'react-i18next' | |||
| import { Listbox, Transition } from '@headlessui/react' | |||
| import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid' | |||
| import type { Item } from '@/app/components/base/select' | |||
| import ConfigContext from '@/context/debug-configuration' | |||
| import { fetchAppVoices } from '@/service/apps' | |||
| import Tooltip from '@/app/components/base/tooltip' | |||
| import { HelpCircle } from '@/app/components/base/icons/src/vender/line/general' | |||
| const VoiceParamConfig: FC = () => { | |||
| const { t } = useTranslation() | |||
| const pathname = usePathname() | |||
| const matched = pathname.match(/\/app\/([^/]+)/) | |||
| const appId = (matched?.length && matched[1]) ? matched[1] : '' | |||
| const LanguageItems = [ | |||
| { value: 'zh-CN', name: '中文' }, | |||
| { value: 'en-US', name: '英语' }, | |||
| { value: 'de-DE', name: '德语' }, | |||
| { value: 'fr-FR', name: '法语' }, | |||
| { value: 'es-ES', name: '西班牙语' }, | |||
| { value: 'it-IT', name: '意大利语' }, | |||
| { value: 'th-TH', name: '泰语' }, | |||
| { value: 'id-ID', name: '印尼语' }, | |||
| ] | |||
| const { | |||
| textToSpeechConfig, | |||
| setTextToSpeechConfig, | |||
| } = useContext(ConfigContext) | |||
| const languageItem = LanguageItems.find(item => item.value === textToSpeechConfig.language) | |||
| const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select') | |||
| const voiceItems = useSWR({ url: `/apps/${appId}/text-to-audio/voices?language=${languageItem ? languageItem.value : 'zh-CN'}` }, fetchAppVoices).data | |||
| const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice) | |||
| const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select') | |||
| return ( | |||
| <div> | |||
| <div> | |||
| <div className='leading-6 text-base font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.title')}</div> | |||
| <div className='pt-3 space-y-6'> | |||
| <div> | |||
| <div className='mb-2 flex items-center space-x-1'> | |||
| <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div> | |||
| <Tooltip htmlContent={<div className='w-[180px]' > | |||
| {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => ( | |||
| <div key={item}>{item}</div> | |||
| ))} | |||
| </div>} selector='config-resolution-tooltip'> | |||
| <HelpCircle className='w-[14px] h-[14px] text-gray-400' /> | |||
| </Tooltip> | |||
| </div> | |||
| <Listbox | |||
| value={languageItem} | |||
| onChange={(value: Item) => { | |||
| setTextToSpeechConfig({ | |||
| ...textToSpeechConfig, | |||
| language: String(value.value), | |||
| }) | |||
| }} | |||
| > | |||
| <div className={'relative h-9'}> | |||
| <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> | |||
| <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>{languageItem?.name ?? localLanguagePlaceholder}</span> | |||
| <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> | |||
| <ChevronDownIcon | |||
| className="h-5 w-5 text-gray-400" | |||
| aria-hidden="true" | |||
| /> | |||
| </span> | |||
| </Listbox.Button> | |||
| <Transition | |||
| as={Fragment} | |||
| leave="transition ease-in duration-100" | |||
| leaveFrom="opacity-100" | |||
| leaveTo="opacity-0" | |||
| > | |||
| <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> | |||
| {LanguageItems.map((item: Item) => ( | |||
| <Listbox.Option | |||
| key={item.value} | |||
| className={({ active }) => | |||
| `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : '' | |||
| }` | |||
| } | |||
| value={item} | |||
| disabled={false} | |||
| > | |||
| {({ /* active, */ selected }) => ( | |||
| <> | |||
| <span className={classNames('block', selected && 'font-normal')}>{item.name}</span> | |||
| {(selected || item.value === textToSpeechConfig.language) && ( | |||
| <span | |||
| className={classNames( | |||
| 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', | |||
| )} | |||
| > | |||
| <CheckIcon className="h-5 w-5" aria-hidden="true" /> | |||
| </span> | |||
| )} | |||
| </> | |||
| )} | |||
| </Listbox.Option> | |||
| ))} | |||
| </Listbox.Options> | |||
| </Transition> | |||
| </div> | |||
| </Listbox> | |||
| </div> | |||
| <div> | |||
| <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div> | |||
| <Listbox | |||
| value={voiceItem} | |||
| disabled={!languageItem} | |||
| onChange={(value: Item) => { | |||
| setTextToSpeechConfig({ | |||
| ...textToSpeechConfig, | |||
| voice: String(value.value), | |||
| }) | |||
| }} | |||
| > | |||
| <div className={'relative h-9'}> | |||
| <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> | |||
| <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span> | |||
| <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> | |||
| <ChevronDownIcon | |||
| className="h-5 w-5 text-gray-400" | |||
| aria-hidden="true" | |||
| /> | |||
| </span> | |||
| </Listbox.Button> | |||
| <Transition | |||
| as={Fragment} | |||
| leave="transition ease-in duration-100" | |||
| leaveFrom="opacity-100" | |||
| leaveTo="opacity-0" | |||
| > | |||
| <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> | |||
| {voiceItems?.map((item: Item) => ( | |||
| <Listbox.Option | |||
| key={item.value} | |||
| className={({ active }) => | |||
| `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : '' | |||
| }` | |||
| } | |||
| value={item} | |||
| disabled={false} | |||
| > | |||
| {({ /* active, */ selected }) => ( | |||
| <> | |||
| <span className={classNames('block', selected && 'font-normal')}>{item.name}</span> | |||
| {(selected || item.value === textToSpeechConfig.voice) && ( | |||
| <span | |||
| className={classNames( | |||
| 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', | |||
| )} | |||
| > | |||
| <CheckIcon className="h-5 w-5" aria-hidden="true" /> | |||
| </span> | |||
| )} | |||
| </> | |||
| )} | |||
| </Listbox.Option> | |||
| ))} | |||
| </Listbox.Options> | |||
| </Transition> | |||
| </div> | |||
| </Listbox> | |||
| </div> | |||
| </div> | |||
| </div> | |||
| </div> | |||
| ) | |||
| } | |||
| export default React.memo(VoiceParamConfig) | |||
| @@ -0,0 +1,41 @@ | |||
| 'use client' | |||
| import type { FC } from 'react' | |||
| import { memo, useState } from 'react' | |||
| import { useTranslation } from 'react-i18next' | |||
| import cn from 'classnames' | |||
| import VoiceParamConfig from './param-config-content' | |||
| import { Settings01 } from '@/app/components/base/icons/src/vender/line/general' | |||
| import { | |||
| PortalToFollowElem, | |||
| PortalToFollowElemContent, | |||
| PortalToFollowElemTrigger, | |||
| } from '@/app/components/base/portal-to-follow-elem' | |||
| const ParamsConfig: FC = () => { | |||
| const { t } = useTranslation() | |||
| const [open, setOpen] = useState(false) | |||
| return ( | |||
| <PortalToFollowElem | |||
| open={open} | |||
| onOpenChange={setOpen} | |||
| placement='bottom-end' | |||
| offset={{ | |||
| mainAxis: 4, | |||
| }} | |||
| > | |||
| <PortalToFollowElemTrigger onClick={() => setOpen(v => !v)}> | |||
| <div className={cn('flex items-center rounded-md h-7 px-3 space-x-1 text-gray-700 cursor-pointer hover:bg-gray-200', open && 'bg-gray-200')}> | |||
| <Settings01 className='w-3.5 h-3.5 ' /> | |||
| <div className='ml-1 leading-[18px] text-xs font-medium '>{t('appDebug.voice.settings')}</div> | |||
| </div> | |||
| </PortalToFollowElemTrigger> | |||
| <PortalToFollowElemContent style={{ zIndex: 50 }}> | |||
| <div className='w-80 sm:w-[412px] p-4 bg-white rounded-lg border-[0.5px] border-gray-200 shadow-lg space-y-3'> | |||
| <VoiceParamConfig /> | |||
| </div> | |||
| </PortalToFollowElemContent> | |||
| </PortalToFollowElem> | |||
| ) | |||
| } | |||
| export default memo(ParamsConfig) | |||
| @@ -119,6 +119,8 @@ const Config: FC = () => { | |||
| setTextToSpeech: (value) => { | |||
| setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => { | |||
| draft.enabled = value | |||
| draft.voice = textToSpeechConfig?.voice | |||
| draft.language = textToSpeechConfig?.language | |||
| })) | |||
| }, | |||
| citation: citationConfig.enabled, | |||
| @@ -245,6 +247,7 @@ const Config: FC = () => { | |||
| {(isAgent && isChatApp) && ( | |||
| <AgentTools /> | |||
| )} | |||
| <ConfigVision /> | |||
| {/* Chat History */} | |||
| @@ -61,6 +61,11 @@ const TextGenerationItem: FC<TextGenerationItemProps> = ({ | |||
| sensitive_word_avoidance: moderationConfig, | |||
| external_data_tools: externalDataToolsConfig, | |||
| more_like_this: moreLikeThisConfig, | |||
| text_to_speech: { | |||
| enabled: false, | |||
| voice: '', | |||
| language: '', | |||
| }, | |||
| agent_mode: { | |||
| enabled: false, | |||
| tools: [], | |||
| @@ -213,9 +213,6 @@ const Debug: FC<IDebug> = ({ | |||
| const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key | |||
| const postModelConfig: BackendModelConfig = { | |||
| text_to_speech: { | |||
| enabled: false, | |||
| }, | |||
| pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '', | |||
| prompt_type: promptMode, | |||
| chat_prompt_config: {}, | |||
| @@ -234,6 +231,11 @@ const Debug: FC<IDebug> = ({ | |||
| mode: modelConfig.mode, | |||
| completion_params: completionParams as any, | |||
| }, | |||
| text_to_speech: { | |||
| enabled: false, | |||
| voice: '', | |||
| language: '', | |||
| }, | |||
| agent_mode: { | |||
| enabled: false, | |||
| tools: [], | |||
| @@ -19,6 +19,7 @@ const TextToSpeech: FC = () => { | |||
| <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div> | |||
| } | |||
| noBodySpacing | |||
| isShowTextToSpeech={true} | |||
| /> | |||
| ) | |||
| } | |||
| @@ -30,6 +30,7 @@ import type { | |||
| MoreLikeThisConfig, | |||
| PromptConfig, | |||
| PromptVariable, | |||
| TextToSpeechConfig, | |||
| } from '@/models/debug' | |||
| import type { ExternalDataTool } from '@/models/common' | |||
| import type { DataSet } from '@/models/datasets' | |||
| @@ -98,8 +99,10 @@ const Configuration: FC = () => { | |||
| const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({ | |||
| enabled: false, | |||
| }) | |||
| const [textToSpeechConfig, setTextToSpeechConfig] = useState<MoreLikeThisConfig>({ | |||
| const [textToSpeechConfig, setTextToSpeechConfig] = useState<TextToSpeechConfig>({ | |||
| enabled: false, | |||
| voice: '', | |||
| language: '', | |||
| }) | |||
| const [citationConfig, setCitationConfig] = useState<MoreLikeThisConfig>({ | |||
| enabled: false, | |||
| @@ -246,6 +249,8 @@ const Configuration: FC = () => { | |||
| }) | |||
| setTextToSpeechConfig(modelConfig.text_to_speech || { | |||
| enabled: false, | |||
| voice: '', | |||
| language: '', | |||
| }) | |||
| setCitationConfig(modelConfig.retriever_resource || { | |||
| enabled: false, | |||
| @@ -73,7 +73,8 @@ const Operation: FC<OperationProps> = ({ | |||
| /> | |||
| ) | |||
| } | |||
| {!isOpeningStatement && config?.text_to_speech && ( | |||
| {(!isOpeningStatement && config?.text_to_speech?.enabled) && ( | |||
| <AudioBtn | |||
| value={content} | |||
| className='hidden group-hover:block' | |||
| @@ -156,6 +156,8 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({ | |||
| setSpeechToTextConfig: () => { }, | |||
| textToSpeechConfig: { | |||
| enabled: false, | |||
| voice: '', | |||
| language: '', | |||
| }, | |||
| setTextToSpeechConfig: () => { }, | |||
| citationConfig: { | |||
| @@ -298,6 +298,17 @@ const translation = { | |||
| uploadLimit: 'Upload Limit', | |||
| }, | |||
| }, | |||
| voice: { | |||
| name: 'Voice', | |||
| description: 'Text-to-speech voice settings', | |||
| settings: 'Settings', | |||
| voiceSettings: { | |||
| title: 'Voice Settings', | |||
| language: 'Language', | |||
| resolutionTooltip: 'Languages supported by the text-to-speech voice.', | |||
| voice: 'Voice', | |||
| }, | |||
| }, | |||
| openingStatement: { | |||
| title: 'Conversation Opener', | |||
| add: 'Add', | |||
| @@ -294,6 +294,17 @@ const translation = { | |||
| uploadLimit: '上传数量限制', | |||
| }, | |||
| }, | |||
| voice: { | |||
| name: '音色', | |||
| description: '文本转语音音色设置', | |||
| settings: '设置', | |||
| voiceSettings: { | |||
| title: '音色设置', | |||
| language: '语言', | |||
| resolutionTooltip: '文本转语音音色支持语言。', | |||
| voice: '音色', | |||
| }, | |||
| }, | |||
| openingStatement: { | |||
| title: '对话开场白', | |||
| add: '添加开场白', | |||
| @@ -122,3 +122,8 @@ export type UpdateOpenAIKeyResponse = ValidateOpenAIKeyResponse | |||
| export type GenerationIntroductionResponse = { | |||
| introduction: string | |||
| } | |||
| export type AppVoicesListResponse = Array<{ | |||
| name: string | |||
| value: string | |||
| }> | |||
| @@ -75,7 +75,11 @@ export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig | |||
| export type SpeechToTextConfig = MoreLikeThisConfig | |||
| export type TextToSpeechConfig = MoreLikeThisConfig | |||
| export type TextToSpeechConfig = { | |||
| enabled: boolean | |||
| voice?: string | |||
| language?: string | |||
| } | |||
| export type CitationConfig = MoreLikeThisConfig | |||
| @@ -1,6 +1,6 @@ | |||
| import type { Fetcher } from 'swr' | |||
| import { del, get, post } from './base' | |||
| import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' | |||
| import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, AppVoicesListResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' | |||
| import type { CommonResponse } from '@/models/common' | |||
| import type { AppMode, ModelConfig } from '@/types/app' | |||
| @@ -93,3 +93,7 @@ export const updateOpenAIKey: Fetcher<UpdateOpenAIKeyResponse, { url: string; bo | |||
| export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { url: string; body: { prompt_template: string } }> = ({ url, body }) => { | |||
| return post<GenerationIntroductionResponse>(url, { body }) | |||
| } | |||
| export const fetchAppVoices: Fetcher<AppVoicesListResponse, { url: string }> = ({ url }) => { | |||
| return get<AppVoicesListResponse>(url) | |||
| } | |||
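fetchAppVoices issues a GET against the voices route registered on the backend. The script-level equivalent, with the base URL and auth again assumed:

import requests

BASE = "http://localhost:5001/console/api"     # assumption, as in the earlier sketch
HEADERS = {"Authorization": "Bearer <token>"}  # assumption
app_id = "00000000-0000-0000-0000-000000000000"

resp = requests.get(f"{BASE}/apps/{app_id}/text-to-audio/voices",
                    params={"language": "en-US"}, headers=HEADERS)
voices = resp.json()  # matches AppVoicesListResponse: [{'name': ..., 'value': ...}]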
| @@ -155,6 +155,8 @@ export type ModelConfig = { | |||
| } | |||
| text_to_speech: { | |||
| enabled: boolean | |||
| voice?: string | |||
| language?: string | |||
| } | |||
| retriever_resource: { | |||
| enabled: boolean | |||