Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>

tags/0.5.6
 import logging

 from flask import request
-from flask_restful import Resource
+from flask_restful import Resource, reqparse
 from werkzeug.exceptions import InternalServerError

 import services
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
 from libs.login import login_required
+from models.model import AppModelConfig
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,

         try:
             response = AudioService.transcript_asr(
                 tenant_id=app_model.tenant_id,
-                file=file
+                file=file,
+                end_user=None,
+                prompt=app_model.app_model_config.pre_prompt
             )

             return response
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
     def post(self, app_id):
         app_id = str(app_id)
         app_model = _get_app(app_id, None)
+        app_model_config: AppModelConfig = app_model.app_model_config
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()

         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
+class TextModesApi(Resource):
+    def get(self, app_id: str):
+        app_model = _get_app(str(app_id))
+        app_model_config: AppModelConfig = app_model.app_model_config
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
+        try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('language', type=str, required=True, location='args')
+            args = parser.parse_args()
+
+            response = AudioService.transcript_tts_voices(
+                tenant_id=app_model.tenant_id,
+                language=args['language'],
+            )
+            return response
+        except services.errors.audio.ProviderNotSupportTextToSpeechLanguageServiceError:
+            raise AppUnavailableError("Text-to-audio voices: the language parameter is missing.")
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except InvokeError as e:
+            raise CompletionRequestError(e.description)
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception(f"internal server error, {str(e)}.")
+            raise InternalServerError()
 api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
 api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio')
+api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices')
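For reference, the new voices route takes a required `language` query argument and returns the voice list of the workspace's default TTS model. A minimal sketch of calling it; the host, app UUID, and auth header below are placeholders, not part of this diff:

import requests

BASE = 'https://dify.example.com/console/api'          # placeholder host
APP_ID = '00000000-0000-0000-0000-000000000000'        # placeholder app UUID
headers = {'Authorization': 'Bearer <console-token>'}  # placeholder auth

# List the voices the default TTS model offers for a language.
resp = requests.get(f'{BASE}/apps/{APP_ID}/text-to-audio/voices',
                    params={'language': 'en-US'}, headers=headers)
print(resp.json())  # e.g. [{'name': 'Alloy', 'value': 'alloy'}, ...]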
         response = AudioService.transcript_tts(
             tenant_id=app_model.tenant_id,
             text=request.form['text'],
+            voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
             streaming=False
         )

         return {'data': response.data.decode('latin1')}
                 tenant_id=app_model.tenant_id,
                 text=args['text'],
                 end_user=args['user'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=args['streaming']
             )
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()


 class TextApi(WebApiResource):
     def post(self, app_model: App, end_user):
+        app_model_config: AppModelConfig = app_model.app_model_config
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()

         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
                 end_user=end_user.external_user_id,
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()
     ModelConfigEntity,
     PromptTemplateEntity,
     SensitiveWordAvoidanceEntity,
+    TextToSpeechEntity,
 )
 from core.entities.model_entities import ModelStatus
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError

         text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech')
         if text_to_speech_dict:
             if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']:
-                properties['text_to_speech'] = True
+                properties['text_to_speech'] = TextToSpeechEntity(
+                    enabled=text_to_speech_dict.get('enabled'),
+                    voice=text_to_speech_dict.get('voice'),
+                    language=text_to_speech_dict.get('language'),
+                )

         # sensitive word avoidance
         sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance')
| """ | """ | ||||
| Advanced Completion Prompt Template Entity. | Advanced Completion Prompt Template Entity. | ||||
| """ | """ | ||||
| class RolePrefixEntity(BaseModel): | class RolePrefixEntity(BaseModel): | ||||
| """ | """ | ||||
| Role Prefix Entity. | Role Prefix Entity. | ||||
| """ | """ | ||||
| Prompt Template Entity. | Prompt Template Entity. | ||||
| """ | """ | ||||
| class PromptType(Enum): | class PromptType(Enum): | ||||
| """ | """ | ||||
| Prompt Type. | Prompt Type. | ||||
| """ | """ | ||||
| Dataset Retrieve Config Entity. | Dataset Retrieve Config Entity. | ||||
| """ | """ | ||||
| class RetrieveStrategy(Enum): | class RetrieveStrategy(Enum): | ||||
| """ | """ | ||||
| Dataset Retrieve Strategy. | Dataset Retrieve Strategy. | ||||
| config: dict[str, Any] = {} | config: dict[str, Any] = {} | ||||
| class TextToSpeechEntity(BaseModel): | |||||
| """ | |||||
| Sensitive Word Avoidance Entity. | |||||
| """ | |||||
| enabled: bool | |||||
| voice: Optional[str] = None | |||||
| language: Optional[str] = None | |||||
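As an aside, the new entity is a plain pydantic model, so config parsing gets validation and optional defaults for free. A self-contained sketch; the class is re-declared locally so the snippet runs stand-alone, and it assumes the pydantic v1 semantics Dify 0.5.x uses:

from typing import Optional
from pydantic import BaseModel

class TextToSpeechEntity(BaseModel):
    enabled: bool
    voice: Optional[str] = None      # falls back to the model's default_voice downstream
    language: Optional[str] = None

# Mirrors the converter above: build the entity from the raw config dict.
tts = TextToSpeechEntity(enabled=True, voice='alloy', language='en-US')
print(tts.dict())  # {'enabled': True, 'voice': 'alloy', 'language': 'en-US'}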
 class FileUploadEntity(BaseModel):
     """
     File Upload Entity.

     tool_name: str
     tool_parameters: dict[str, Any] = {}


 class AgentPromptEntity(BaseModel):
     """
     Agent Prompt Entity.

     first_prompt: str
     next_iteration: str


 class AgentScratchpadUnit(BaseModel):
     """
     Agent First Prompt Entity.

     thought: Optional[str] = None
     action_str: Optional[str] = None
     observation: Optional[str] = None
     action: Optional[Action] = None


 class AgentEntity(BaseModel):
     """
     Agent Entity.
     """

     class Strategy(Enum):
         """
         Agent Strategy.

     tools: list[AgentToolEntity] = None
     max_iteration: int = 5


 class AppOrchestrationConfigEntity(BaseModel):
     """
     App Orchestration Config Entity.

     show_retrieve_source: bool = False
     more_like_this: bool = False
     speech_to_text: bool = False
-    text_to_speech: bool = False
+    text_to_speech: dict = {}
     sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None
             user=user
         )

-    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None,
+    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None,
+                      top_n: Optional[int] = None,
                       user: Optional[str] = None) \
             -> RerankResult:
         """

             user=user
         )

-    def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \
+    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
             -> str:
         """
-        Invoke large language model
+        Invoke tts model

         :param content_text: text content to be translated
+        :param tenant_id: user tenant id
         :param user: unique user id
+        :param voice: model timbre
         :param streaming: output is streaming
         :return: audio file for the given text
         """
             credentials=self.credentials,
             content_text=content_text,
             user=user,
+            tenant_id=tenant_id,
+            voice=voice,
             streaming=streaming
         )
+    def get_tts_voices(self, language: str) -> list:
+        """
+        Get the voices the TTS model supports for a given language
+        :param language: tts language
+        :return: tts model voices
+        """
+        if not isinstance(self.model_type_instance, TTSModel):
+            raise Exception("Model type instance is not TTSModel")
+
+        self.model_type_instance = cast(TTSModel, self.model_type_instance)
+        return self.model_type_instance.get_tts_model_voices(
+            model=self.model,
+            credentials=self.credentials,
+            language=language
+        )


 class ModelManager:
     def __init__(self) -> None:
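Taken together, the extended `ModelInstance` surface is driven like this. A minimal sketch under assumptions: the tenant id is a placeholder, and the `ModelType` import path is inferred from the rest of the codebase rather than shown in this diff:

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType  # assumed path

model_manager = ModelManager()
model_instance = model_manager.get_default_model_instance(
    tenant_id='<tenant-id>',  # placeholder
    model_type=ModelType.TTS
)

# Voices available for a language, e.g. [{'name': 'Alloy', 'value': 'alloy'}]
voices = model_instance.get_tts_voices(language='en-US')

# Synthesize audio with an explicit voice; providers fall back to the
# model's default_voice when voice is empty.
audio = model_instance.invoke_tts(
    content_text='Hello Dify!',
    tenant_id='<tenant-id>',
    voice=voices[0]['value'] if voices else None,
    streaming=False
)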
 - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
 - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
 - `default_voice` (string) Default voice, e.g.: alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
+- `voices` (list) List of available voices (available for model type `tts`)
+  - `mode` (string) Voice identifier (available for model type `tts`)
+  - `name` (string) Voice display name (available for model type `tts`)
+  - `language` (string) Languages the voice supports (available for model type `tts`)
 - `word_limit` (int) Single conversion word limit, paragraphwise by default (available for model type `tts`)
 - `audio_type` (string) Supported audio file extension formats, e.g.: mp3, wav (available for model type `tts`)
 - `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`)
 - `max_chunks` (int) 最大分块数量(模型类型 `text-embedding` `moderation` 可用)
 - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用)
 - `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用)
-- `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+- `default_voice` (string) 缺省音色,必选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+- `voices` (list) 可选音色列表。(模型类型 `tts` 可用)
+  - `mode` (string) 音色标识。(模型类型 `tts` 可用)
+  - `name` (string) 音色显示名称。(模型类型 `tts` 可用)
+  - `language` (string) 音色支持语言。(模型类型 `tts` 可用)
 - `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用)
 - `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用)
 - `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用)
     SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions"
     MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk"
     DEFAULT_VOICE = "default_voice"
+    VOICES = "voices"
     WORD_LIMIT = "word_limit"
     AUDOI_TYPE = "audio_type"
     MAX_WORKERS = "max_workers"
| """ | """ | ||||
| model_type: ModelType = ModelType.TTS | model_type: ModelType = ModelType.TTS | ||||
| def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): | |||||
| def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, | |||||
| user: Optional[str] = None): | |||||
| """ | """ | ||||
| Invoke large language model | Invoke large language model | ||||
| :param model: model name | :param model: model name | ||||
| :param tenant_id: user tenant id | |||||
| :param credentials: model credentials | :param credentials: model credentials | ||||
| :param voice: model timbre | |||||
| :param content_text: text content to be translated | :param content_text: text content to be translated | ||||
| :param streaming: output is streaming | :param streaming: output is streaming | ||||
| :param user: unique user id | :param user: unique user id | ||||
| :return: translated audio file | :return: translated audio file | ||||
| """ | """ | ||||
| try: | try: | ||||
| return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text) | |||||
| self._is_ffmpeg_installed() | |||||
| return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, | |||||
| content_text=content_text, voice=voice, tenant_id=tenant_id) | |||||
| except Exception as e: | except Exception as e: | ||||
| raise self._transform_invoke_error(e) | raise self._transform_invoke_error(e) | ||||
| @abstractmethod | @abstractmethod | ||||
| def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): | |||||
| def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, | |||||
| user: Optional[str] = None): | |||||
| """ | """ | ||||
| Invoke large language model | Invoke large language model | ||||
| :param model: model name | :param model: model name | ||||
| :param tenant_id: user tenant id | |||||
| :param credentials: model credentials | :param credentials: model credentials | ||||
| :param voice: model timbre | |||||
| :param content_text: text content to be translated | :param content_text: text content to be translated | ||||
| :param streaming: output is streaming | :param streaming: output is streaming | ||||
| :param user: unique user id | :param user: unique user id | ||||
| """ | """ | ||||
| raise NotImplementedError | raise NotImplementedError | ||||
-    def _get_model_voice(self, model: str, credentials: dict) -> any:
+    def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
+        """
+        Get the voices a given TTS model supports for a language
+        :param language: tts language
+        :param model: model name
+        :param credentials: model credentials
+        :return: voices list
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
+            voices = model_schema.model_properties[ModelPropertyKey.VOICES]
+            return [{'name': d['name'], 'value': d['mode']}
+                    for d in voices if language and language in d.get('language')]

+    def _get_model_default_voice(self, model: str, credentials: dict) -> any:
         """
         Get voice for given tts model
-model: tts-1-hd
+model: tts-1
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.03'
+  output: '0'
+  unit: '0.001'
+  currency: USD
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.015'
+  output: '0'
+  unit: '0.001'
+  currency: USD
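The `voices` entries above are exactly what `get_tts_model_voices` in the TTSModel base class filters. A minimal sketch of that filter with the YAML data inlined as Python:

# Inline stand-in for the `voices` model property defined in the YAML above.
voices = [
    {'mode': 'alloy', 'name': 'Alloy', 'language': ['zh-CN', 'en-US']},
    {'mode': 'echo', 'name': 'Echo', 'language': ['zh-CN', 'en-US']},
]
language = 'en-US'

# Same comprehension as get_tts_model_voices in the base class.
print([{'name': d['name'], 'value': d['mode']}
       for d in voices if language and language in d.get('language')])
# [{'name': 'Alloy', 'value': 'alloy'}, {'name': 'Echo', 'value': 'echo'}]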
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
+from extensions.ext_storage import storage


 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     Model class for OpenAI text to speech model.
     """

-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+    def _invoke(self, model: str, tenant_id: str, credentials: dict,
+                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
         """
         _invoke text2speech model

         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
+        :param voice: model timbre
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
+        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           tenant_id=tenant_id,
+                                                                           voice=voice)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """

             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model

         :param model: model name
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()

             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence
-                           in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
+                                           credentials=credentials) for sentence in sentences]
                 for future in futures:
                     try:
-                        audio_bytes_list.append(future.result())
+                        if future.result():
+                            audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))

-            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                              audio_bytes_list if audio_bytes]
-            combined_segment = reduce(lambda x, y: x + y, audio_segments)
-            buffer: BytesIO = BytesIO()
-            combined_segment.export(buffer, format=audio_type)
-            buffer.seek(0)
-            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+            if len(audio_bytes_list) > 0:
+                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
+                                  for audio_bytes in audio_bytes_list if audio_bytes]
+                combined_segment = reduce(lambda x, y: x + y, audio_segments)
+                buffer: BytesIO = BytesIO()
+                combined_segment.export(buffer, format=audio_type)
+                buffer.seek(0)
+                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model

         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         tts_file_id = self._get_file_name(content_text)
-        file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
             client = OpenAI(**credentials_kwargs)
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
-                response.stream_to_file(file_path)
+                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
+                # Persist through the shared storage extension instead of streaming to a local file.
+                storage.save(file_path, response.read())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
-    def _process_sentence(self, sentence: str, model: str, credentials: dict):
+    def _process_sentence(self, sentence: str, model: str, voice: str, credentials: dict):
         """
         _tts_invoke openai text2speech model api

         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
         client = OpenAI(**credentials_kwargs)
-        response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
+        response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
         # Read the body once and reuse it, rather than consuming the response twice.
         data = response.read()
         if isinstance(data, bytes):
             return data
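As an aside, `_tts_invoke` relies on iterating `futures` in submission order, so the combined audio keeps the original sentence order. A minimal, self-contained sketch of that fan-out/fan-in shape, with a stand-in for the per-sentence API call:

import concurrent.futures

def synthesize(sentence: str) -> bytes:
    # Stand-in for one client.audio.speech.create(...) call.
    return sentence.encode('utf-8')

sentences = ['First sentence.', 'Second sentence.', 'Third sentence.']
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(synthesize, s) for s in sentences]
    # Iterating the list (not as_completed) preserves submission order.
    chunks = [f.result() for f in futures if f.result()]

print(b' '.join(chunks))  # audio chunks concatenated in sentence order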
 model: tts-1
 model_type: tts
 model_properties:
-  default_voice: 'sambert-zhiru-v1'  # for available voices, see https://help.aliyun.com/zh/dashscope/model-list
+  default_voice: 'sambert-zhiru-v1'
+  voices:
+    - mode: "sambert-zhinan-v1"
+      name: "知楠(广告男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiqi-v1"
+      name: "知琪(温柔女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhichu-v1"
+      name: "知厨(新闻播报)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhide-v1"
+      name: "知德(新闻男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhijia-v1"
+      name: "知佳(标准女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiru-v1"
+      name: "知茹(新闻女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiqian-v1"
+      name: "知倩(配音解说、新闻播报)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhixiang-v1"
+      name: "知祥(配音解说)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiwei-v1"
+      name: "知薇(萝莉女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhihao-v1"
+      name: "知浩(咨询男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhijing-v1"
+      name: "知婧(严厉女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiming-v1"
+      name: "知茗(诙谐男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhimo-v1"
+      name: "知墨(情感男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhina-v1"
+      name: "知娜(浙普女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhishu-v1"
+      name: "知树(资讯男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhistella-v1"
+      name: "知莎(知性女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiting-v1"
+      name: "知婷(电台女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhixiao-v1"
+      name: "知笑(资讯女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiya-v1"
+      name: "知雅(严厉女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiye-v1"
+      name: "知晔(青年男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiying-v1"
+      name: "知颖(软萌童声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhiyuan-v1"
+      name: "知媛(知心姐姐)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhigui-v1"
+      name: "知柜(直播女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhishuo-v1"
+      name: "知硕(自然男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhimiao-emo-v1"
+      name: "知妙(多种情感女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhimao-v1"
+      name: "知猫(直播女声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhilun-v1"
+      name: "知伦(悬疑解说)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhifei-v1"
+      name: "知飞(激昂解说)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-zhida-v1"
+      name: "知达(标准男声)"
+      language: ["zh-CN", "en-US"]
+    - mode: "sambert-camila-v1"
+      name: "Camila(西班牙语女声)"
+      language: ["es-ES"]
+    - mode: "sambert-perla-v1"
+      name: "Perla(意大利语女声)"
+      language: ["it-IT"]
+    - mode: "sambert-indah-v1"
+      name: "Indah(印尼语女声)"
+      language: ["id-ID"]
+    - mode: "sambert-clara-v1"
+      name: "Clara(法语女声)"
+      language: ["fr-FR"]
+    - mode: "sambert-hanna-v1"
+      name: "Hanna(德语女声)"
+      language: ["de-DE"]
+    - mode: "sambert-beth-v1"
+      name: "Beth(咨询女声)"
+      language: ["en-US"]
+    - mode: "sambert-betty-v1"
+      name: "Betty(客服女声)"
+      language: ["en-US"]
+    - mode: "sambert-cally-v1"
+      name: "Cally(自然女声)"
+      language: ["en-US"]
+    - mode: "sambert-cindy-v1"
+      name: "Cindy(对话女声)"
+      language: ["en-US"]
+    - mode: "sambert-eva-v1"
+      name: "Eva(陪伴女声)"
+      language: ["en-US"]
+    - mode: "sambert-donna-v1"
+      name: "Donna(教育女声)"
+      language: ["en-US"]
+    - mode: "sambert-brian-v1"
+      name: "Brian(客服男声)"
+      language: ["en-US"]
+    - mode: "sambert-waan-v1"
+      name: "Waan(泰语女声)"
+      language: ["th-TH"]
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
+from extensions.ext_storage import storage


 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
     """
     Model class for Tongyi text to speech model.
     """

-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+                user: Optional[str] = None) -> any:
         """
         _invoke text2speech model

         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
+        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           voice=voice,
+                                                                           tenant_id=tenant_id)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """

             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model

         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()

             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
-                                           credentials=credentials, audio_type=audio_type) for sentence in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence,
+                                           credentials=credentials, voice=voice, audio_type=audio_type)
+                           for sentence in sentences]
                 for future in futures:
                     try:
-                        audio_bytes_list.append(future.result())
+                        if future.result():
+                            audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))

-            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                              audio_bytes_list if audio_bytes]
-            combined_segment = reduce(lambda x, y: x + y, audio_segments)
-            buffer: BytesIO = BytesIO()
-            combined_segment.export(buffer, format=audio_type)
-            buffer.seek(0)
-            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+            if len(audio_bytes_list) > 0:
+                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
+                                  for audio_bytes in audio_bytes_list if audio_bytes]
+                combined_segment = reduce(lambda x, y: x + y, audio_segments)
+                buffer: BytesIO = BytesIO()
+                combined_segment.export(buffer, format=audio_type)
+                buffer.seek(0)
+                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model

         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
+        tts_file_id = self._get_file_name(content_text)
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
+                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                                      text=sentence.strip(),
                                                                       format=audio_type, word_timestamp_enabled=True,
                                                                       phoneme_timestamp_enabled=True)
                 if isinstance(response.get_audio_data(), bytes):
-                    return response.get_audio_data()
+                    storage.save(file_path, response.get_audio_data())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))

-    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
+    @staticmethod
+    def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str):
         """
         _tts_invoke Tongyi text2speech model api

-        :param model: model name
         :param credentials: model credentials
         :param sentence: text content to be translated
+        :param voice: model timbre
         :param audio_type: audio file type
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
-        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
+        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                              text=sentence.strip(),
+                                                              format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
             return response.get_audio_data()
         # text_to_speech
         if 'text_to_speech' not in config or not config["text_to_speech"]:
             config["text_to_speech"] = {
-                "enabled": False
+                "enabled": False,
+                "voice": "",
+                "language": ""
             }

         if not isinstance(config["text_to_speech"], dict):
             raise ValueError("text_to_speech must be of dict type")

         if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
             config["text_to_speech"]["enabled"] = False
+            config["text_to_speech"]["voice"] = ""
+            config["text_to_speech"]["language"] = ""

         if not isinstance(config["text_to_speech"]["enabled"], bool):
             raise ValueError("enabled in text_to_speech must be of boolean type")
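A hedged sketch of the app config shape this validator accepts after the change; values are illustrative, and the empty-string defaults mirror the normalization above:

# Illustrative app_model_config fragment; keys match the validator above.
config = {
    "text_to_speech": {
        "enabled": True,
        "voice": "alloy",     # a `mode` value from the provider's voices list
        "language": "en-US",  # should appear in that voice's `language` list
    }
}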
     UnsupportedAudioTypeServiceError,
 )

-FILE_SIZE = 15
+FILE_SIZE = 30
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']


 class AudioService:
     @classmethod
-    def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None):
+    def transcript_asr(cls, tenant_id: str, file: FileStorage, prompt: str, end_user: Optional[str] = None):
         if file is None:
             raise NoAudioUploadedServiceError()

         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}

     @classmethod
-    def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None):
+    def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None):
         model_manager = ModelManager()
         model_instance = model_manager.get_default_model_instance(
             tenant_id=tenant_id,

             raise ProviderNotSupportTextToSpeechServiceError()

         try:
-            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
+            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming,
+                                             tenant_id=tenant_id, voice=voice)
         except Exception as e:
             raise e

+    @classmethod
+    def transcript_tts_voices(cls, tenant_id: str, language: str):
+        model_manager = ModelManager()
+        model_instance = model_manager.get_default_model_instance(
+            tenant_id=tenant_id,
+            model_type=ModelType.TTS
+        )
+        if model_instance is None:
+            raise ProviderNotSupportTextToSpeechServiceError()
+
+        try:
+            return model_instance.get_tts_voices(language)
+        except Exception as e:
+            raise e
 class ProviderNotSupportTextToSpeechServiceError(Exception):
     pass

+class ProviderNotSupportTextToSpeechLanguageServiceError(Exception):
+    pass
 import type { FC, ReactNode } from 'react'
 import React from 'react'
 import cn from 'classnames'
+import ParamsConfig from '@/app/components/app/configuration/config-voice/param-config'

 export type IFeaturePanelProps = {
   className?: string
   isFocus?: boolean
   noBodySpacing?: boolean
   children?: ReactNode
+  isShowTextToSpeech?: boolean
 }

 const FeaturePanel: FC<IFeaturePanelProps> = ({
   isFocus,
   noBodySpacing,
   children,
+  isShowTextToSpeech,
 }) => {
   return (
     <div

           <div className='text-sm font-semibold text-gray-800'>{title}</div>
         </div>
         <div>
-          {headerRight}
+          {isShowTextToSpeech
+            ? (
+              <div className='flex items-center'>
+                <ParamsConfig />
+              </div>
+            )
+            : headerRight}
         </div>
       </div>
     </div>
| 'use client' | |||||
| import useSWR from 'swr' | |||||
| import type { FC } from 'react' | |||||
| import { useContext } from 'use-context-selector' | |||||
| import React, { Fragment } from 'react' | |||||
| import classNames from 'classnames' | |||||
| import { usePathname } from 'next/navigation' | |||||
| import { useTranslation } from 'react-i18next' | |||||
| import { Listbox, Transition } from '@headlessui/react' | |||||
| import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid' | |||||
| import type { Item } from '@/app/components/base/select' | |||||
| import ConfigContext from '@/context/debug-configuration' | |||||
| import { fetchAppVoices } from '@/service/apps' | |||||
| import Tooltip from '@/app/components/base/tooltip' | |||||
| import { HelpCircle } from '@/app/components/base/icons/src/vender/line/general' | |||||
| const VoiceParamConfig: FC = () => { | |||||
| const { t } = useTranslation() | |||||
| const pathname = usePathname() | |||||
| const matched = pathname.match(/\/app\/([^/]+)/) | |||||
| const appId = (matched?.length && matched[1]) ? matched[1] : '' | |||||
| const LanguageItems = [ | |||||
| { value: 'zh-CN', name: 'Chinese' }, | |||||
| { value: 'en-US', name: 'English' }, | |||||
| { value: 'de-DE', name: 'German' }, | |||||
| { value: 'fr-FR', name: 'French' }, | |||||
| { value: 'es-ES', name: 'Spanish' }, | |||||
| { value: 'it-IT', name: 'Italian' }, | |||||
| { value: 'th-TH', name: 'Thai' }, | |||||
| { value: 'id-ID', name: 'Indonesian' }, | |||||
| ] | |||||
| const { | |||||
| textToSpeechConfig, | |||||
| setTextToSpeechConfig, | |||||
| } = useContext(ConfigContext) | |||||
| const languageItem = LanguageItems.find(item => item.value === textToSpeechConfig.language) | |||||
| const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select') | |||||
| const voiceItems = useSWR({ url: `/apps/${appId}/text-to-audio/voices?language=${languageItem ? languageItem.value : 'zh-CN'}` }, fetchAppVoices).data | |||||
| const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice) | |||||
| const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select') | |||||
| return ( | |||||
| <div> | |||||
| <div> | |||||
| <div className='leading-6 text-base font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.title')}</div> | |||||
| <div className='pt-3 space-y-6'> | |||||
| <div> | |||||
| <div className='mb-2 flex items-center space-x-1'> | |||||
| <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div> | |||||
| <Tooltip htmlContent={<div className='w-[180px]' > | |||||
| {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => ( | |||||
| <div key={item}>{item}</div> | |||||
| ))} | |||||
| </div>} selector='config-resolution-tooltip'> | |||||
| <HelpCircle className='w-[14px] h-[14px] text-gray-400' /> | |||||
| </Tooltip> | |||||
| </div> | |||||
| <Listbox | |||||
| value={languageItem} | |||||
| onChange={(value: Item) => { | |||||
| setTextToSpeechConfig({ | |||||
| ...textToSpeechConfig, | |||||
| language: String(value.value), | |||||
| }) | |||||
| }} | |||||
| > | |||||
| <div className={'relative h-9'}> | |||||
| <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> | |||||
| <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>{languageItem?.name ?? localLanguagePlaceholder}</span> | |||||
| <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> | |||||
| <ChevronDownIcon | |||||
| className="h-5 w-5 text-gray-400" | |||||
| aria-hidden="true" | |||||
| /> | |||||
| </span> | |||||
| </Listbox.Button> | |||||
| <Transition | |||||
| as={Fragment} | |||||
| leave="transition ease-in duration-100" | |||||
| leaveFrom="opacity-100" | |||||
| leaveTo="opacity-0" | |||||
| > | |||||
| <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> | |||||
| {LanguageItems.map((item: Item) => ( | |||||
| <Listbox.Option | |||||
| key={item.value} | |||||
| className={({ active }) => | |||||
| `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : '' | |||||
| }` | |||||
| } | |||||
| value={item} | |||||
| disabled={false} | |||||
| > | |||||
| {({ /* active, */ selected }) => ( | |||||
| <> | |||||
| <span className={classNames('block', selected && 'font-normal')}>{item.name}</span> | |||||
| {(selected || item.value === textToSpeechConfig.language) && ( | |||||
| <span | |||||
| className={classNames( | |||||
| 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', | |||||
| )} | |||||
| > | |||||
| <CheckIcon className="h-5 w-5" aria-hidden="true" /> | |||||
| </span> | |||||
| )} | |||||
| </> | |||||
| )} | |||||
| </Listbox.Option> | |||||
| ))} | |||||
| </Listbox.Options> | |||||
| </Transition> | |||||
| </div> | |||||
| </Listbox> | |||||
| </div> | |||||
| <div> | |||||
| <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div> | |||||
| <Listbox | |||||
| value={voiceItem} | |||||
| disabled={!languageItem} | |||||
| onChange={(value: Item) => { | |||||
| setTextToSpeechConfig({ | |||||
| ...textToSpeechConfig, | |||||
| voice: String(value.value), | |||||
| }) | |||||
| }} | |||||
| > | |||||
| <div className={'relative h-9'}> | |||||
| <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> | |||||
| <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span> | |||||
| <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> | |||||
| <ChevronDownIcon | |||||
| className="h-5 w-5 text-gray-400" | |||||
| aria-hidden="true" | |||||
| /> | |||||
| </span> | |||||
| </Listbox.Button> | |||||
| <Transition | |||||
| as={Fragment} | |||||
| leave="transition ease-in duration-100" | |||||
| leaveFrom="opacity-100" | |||||
| leaveTo="opacity-0" | |||||
| > | |||||
| <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> | |||||
| {voiceItems?.map((item: Item) => ( | |||||
| <Listbox.Option | |||||
| key={item.value} | |||||
| className={({ active }) => | |||||
| `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : '' | |||||
| }` | |||||
| } | |||||
| value={item} | |||||
| disabled={false} | |||||
| > | |||||
| {({ /* active, */ selected }) => ( | |||||
| <> | |||||
| <span className={classNames('block', selected && 'font-normal')}>{item.name}</span> | |||||
| {(selected || item.value === textToSpeechConfig.voice) && ( | |||||
| <span | |||||
| className={classNames( | |||||
| 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', | |||||
| )} | |||||
| > | |||||
| <CheckIcon className="h-5 w-5" aria-hidden="true" /> | |||||
| </span> | |||||
| )} | |||||
| </> | |||||
| )} | |||||
| </Listbox.Option> | |||||
| ))} | |||||
| </Listbox.Options> | |||||
| </Transition> | |||||
| </div> | |||||
| </Listbox> | |||||
| </div> | |||||
| </div> | |||||
| </div> | |||||
| </div> | |||||
| ) | |||||
| } | |||||
| export default React.memo(VoiceParamConfig) |
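One thing this component leaves open: when the language changes, `voice` keeps its previous value even though the refetched voice list is language-scoped, so a stale selection can linger until the user re-picks. A hedged refinement (not part of this diff) would clear the voice in the same state update:

```ts
// Sketch only, not in the original component: reset the voice whenever the
// language changes, since the voice list is fetched per language.
type TextToSpeechConfig = { enabled: boolean; voice?: string; language?: string }

function withLanguage(config: TextToSpeechConfig, language: string): TextToSpeechConfig {
  // Dropping `voice` forces a fresh pick that is valid for the new language.
  return { ...config, language, voice: '' }
}
```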
| 'use client' | |||||
| import type { FC } from 'react' | |||||
| import { memo, useState } from 'react' | |||||
| import { useTranslation } from 'react-i18next' | |||||
| import cn from 'classnames' | |||||
| import VoiceParamConfig from './param-config-content' | |||||
| import { Settings01 } from '@/app/components/base/icons/src/vender/line/general' | |||||
| import { | |||||
| PortalToFollowElem, | |||||
| PortalToFollowElemContent, | |||||
| PortalToFollowElemTrigger, | |||||
| } from '@/app/components/base/portal-to-follow-elem' | |||||
| const ParamsConfig: FC = () => { | |||||
| const { t } = useTranslation() | |||||
| const [open, setOpen] = useState(false) | |||||
| return ( | |||||
| <PortalToFollowElem | |||||
| open={open} | |||||
| onOpenChange={setOpen} | |||||
| placement='bottom-end' | |||||
| offset={{ | |||||
| mainAxis: 4, | |||||
| }} | |||||
| > | |||||
| <PortalToFollowElemTrigger onClick={() => setOpen(v => !v)}> | |||||
| <div className={cn('flex items-center rounded-md h-7 px-3 space-x-1 text-gray-700 cursor-pointer hover:bg-gray-200', open && 'bg-gray-200')}> | |||||
| <Settings01 className='w-3.5 h-3.5 ' /> | |||||
| <div className='ml-1 leading-[18px] text-xs font-medium '>{t('appDebug.voice.settings')}</div> | |||||
| </div> | |||||
| </PortalToFollowElemTrigger> | |||||
| <PortalToFollowElemContent style={{ zIndex: 50 }}> | |||||
| <div className='w-80 sm:w-[412px] p-4 bg-white rounded-lg border-[0.5px] border-gray-200 shadow-lg space-y-3'> | |||||
| <VoiceParamConfig /> | |||||
| </div> | |||||
| </PortalToFollowElemContent> | |||||
| </PortalToFollowElem> | |||||
| ) | |||||
| } | |||||
| export default memo(ParamsConfig) |
| setTextToSpeech: (value) => { | setTextToSpeech: (value) => { | ||||
| setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => { | setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => { | ||||
| draft.enabled = value | draft.enabled = value | ||||
| draft.voice = textToSpeechConfig?.voice | |||||
| draft.language = textToSpeechConfig?.language | |||||
| })) | })) | ||||
| }, | }, | ||||
| citation: citationConfig.enabled, | citation: citationConfig.enabled, | ||||
| {(isAgent && isChatApp) && ( | {(isAgent && isChatApp) && ( | ||||
| <AgentTools /> | <AgentTools /> | ||||
| )} | )} | ||||
| <ConfigVision /> | <ConfigVision /> | ||||
| {/* Chat History */} | {/* Chat History */} |
| sensitive_word_avoidance: moderationConfig, | sensitive_word_avoidance: moderationConfig, | ||||
| external_data_tools: externalDataToolsConfig, | external_data_tools: externalDataToolsConfig, | ||||
| more_like_this: moreLikeThisConfig, | more_like_this: moreLikeThisConfig, | ||||
| text_to_speech: { | |||||
| enabled: false, | |||||
| voice: '', | |||||
| language: '', | |||||
| }, | |||||
| agent_mode: { | agent_mode: { | ||||
| enabled: false, | enabled: false, | ||||
| tools: [], | tools: [], |
| const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key | const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key | ||||
| const postModelConfig: BackendModelConfig = { | const postModelConfig: BackendModelConfig = { | ||||
| text_to_speech: { | |||||
| enabled: false, | |||||
| }, | |||||
| pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '', | pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '', | ||||
| prompt_type: promptMode, | prompt_type: promptMode, | ||||
| chat_prompt_config: {}, | chat_prompt_config: {}, | ||||
| mode: modelConfig.mode, | mode: modelConfig.mode, | ||||
| completion_params: completionParams as any, | completion_params: completionParams as any, | ||||
| }, | }, | ||||
| text_to_speech: { | |||||
| enabled: false, | |||||
| voice: '', | |||||
| language: '', | |||||
| }, | |||||
| agent_mode: { | agent_mode: { | ||||
| enabled: false, | enabled: false, | ||||
| tools: [], | tools: [], |
| <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div> | <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div> | ||||
| } | } | ||||
| noBodySpacing | noBodySpacing | ||||
| isShowTextToSpeech={true} | |||||
| /> | /> | ||||
| ) | ) | ||||
| } | } |
| MoreLikeThisConfig, | MoreLikeThisConfig, | ||||
| PromptConfig, | PromptConfig, | ||||
| PromptVariable, | PromptVariable, | ||||
| TextToSpeechConfig, | |||||
| } from '@/models/debug' | } from '@/models/debug' | ||||
| import type { ExternalDataTool } from '@/models/common' | import type { ExternalDataTool } from '@/models/common' | ||||
| import type { DataSet } from '@/models/datasets' | import type { DataSet } from '@/models/datasets' | ||||
| const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({ | const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({ | ||||
| enabled: false, | enabled: false, | ||||
| }) | }) | ||||
| const [textToSpeechConfig, setTextToSpeechConfig] = useState<MoreLikeThisConfig>({ | |||||
| const [textToSpeechConfig, setTextToSpeechConfig] = useState<TextToSpeechConfig>({ | |||||
| enabled: false, | enabled: false, | ||||
| voice: '', | |||||
| language: '', | |||||
| }) | }) | ||||
| const [citationConfig, setCitationConfig] = useState<MoreLikeThisConfig>({ | const [citationConfig, setCitationConfig] = useState<MoreLikeThisConfig>({ | ||||
| enabled: false, | enabled: false, | ||||
| }) | }) | ||||
| setTextToSpeechConfig(modelConfig.text_to_speech || { | setTextToSpeechConfig(modelConfig.text_to_speech || { | ||||
| enabled: false, | enabled: false, | ||||
| voice: '', | |||||
| language: '', | |||||
| }) | }) | ||||
| setCitationConfig(modelConfig.retriever_resource || { | setCitationConfig(modelConfig.retriever_resource || { | ||||
| enabled: false, | enabled: false, |
| /> | /> | ||||
| ) | ) | ||||
| } | } | ||||
| {!isOpeningStatement && config?.text_to_speech && ( | |||||
| {(!isOpeningStatement && config?.text_to_speech?.enabled) && ( | |||||
| <AudioBtn | <AudioBtn | ||||
| value={content} | value={content} | ||||
| className='hidden group-hover:block' | className='hidden group-hover:block' |
| setSpeechToTextConfig: () => { }, | setSpeechToTextConfig: () => { }, | ||||
| textToSpeechConfig: { | textToSpeechConfig: { | ||||
| enabled: false, | enabled: false, | ||||
| voice: '', | |||||
| language: '', | |||||
| }, | }, | ||||
| setTextToSpeechConfig: () => { }, | setTextToSpeechConfig: () => { }, | ||||
| citationConfig: { | citationConfig: { |
| uploadLimit: 'Upload Limit', | uploadLimit: 'Upload Limit', | ||||
| }, | }, | ||||
| }, | }, | ||||
| voice: { | |||||
| name: 'Voice', | |||||
| description: 'Text-to-speech voice settings', | |||||
| settings: 'Settings', | |||||
| voiceSettings: { | |||||
| title: 'Voice Settings', | |||||
| language: 'Language', | |||||
| resolutionTooltip: 'Languages supported by the text-to-speech voice.', | |||||
| voice: 'Voice', | |||||
| }, | |||||
| }, | |||||
| openingStatement: { | openingStatement: { | ||||
| title: 'Conversation Opener', | title: 'Conversation Opener', | ||||
| add: 'Add', | add: 'Add', |
| uploadLimit: '上传数量限制', | uploadLimit: '上传数量限制', | ||||
| }, | }, | ||||
| }, | }, | ||||
| voice: { | |||||
| name: '音色', | |||||
| description: '文本转语音音色设置', | |||||
| settings: '设置', | |||||
| voiceSettings: { | |||||
| title: '音色设置', | |||||
| language: '语言', | |||||
| resolutionTooltip: '文本转语音音色支持语言。', | |||||
| voice: '音色', | |||||
| }, | |||||
| }, | |||||
| openingStatement: { | openingStatement: { | ||||
| title: '对话开场白', | title: '对话开场白', | ||||
| add: '添加开场白', | add: '添加开场白', |
| export type GenerationIntroductionResponse = { | export type GenerationIntroductionResponse = { | ||||
| introduction: string | introduction: string | ||||
| } | } | ||||
| export type AppVoicesListResponse = Array<{ | |||||
| name: string | |||||
| value: string | |||||
| }> |||||
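For reference, a value of this shape as the voices endpoint returns it; the voice names and ids here are invented for illustration, not real provider voices:

```ts
// Illustrative only: voice names/ids are made up. Assumes the
// AppVoicesListResponse type above (an array of name/value pairs).
import type { AppVoicesListResponse } from '@/models/app'

const exampleVoices: AppVoicesListResponse = [
  { name: 'Aria', value: 'aria' },
  { name: 'Ember', value: 'ember' },
]
```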
| export type SpeechToTextConfig = MoreLikeThisConfig | export type SpeechToTextConfig = MoreLikeThisConfig | ||||
| export type TextToSpeechConfig = MoreLikeThisConfig | |||||
| export type TextToSpeechConfig = { | |||||
| enabled: boolean | |||||
| voice?: string | |||||
| language?: string | |||||
| } | |||||
| export type CitationConfig = MoreLikeThisConfig | export type CitationConfig = MoreLikeThisConfig | ||||
| import type { Fetcher } from 'swr' | import type { Fetcher } from 'swr' | ||||
| import { del, get, post } from './base' | import { del, get, post } from './base' | ||||
| import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' | |||||
| import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, AppVoicesListResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' | |||||
| import type { CommonResponse } from '@/models/common' | import type { CommonResponse } from '@/models/common' | ||||
| import type { AppMode, ModelConfig } from '@/types/app' | import type { AppMode, ModelConfig } from '@/types/app' | ||||
| export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { url: string; body: { prompt_template: string } }> = ({ url, body }) => { | export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { url: string; body: { prompt_template: string } }> = ({ url, body }) => { | ||||
| return post<GenerationIntroductionResponse>(url, { body }) | return post<GenerationIntroductionResponse>(url, { body }) | ||||
| } | } | ||||
| export const fetchAppVoices: Fetcher<AppVoicesListResponse, { url: string }> = ({ url }) => { | |||||
| return get<AppVoicesListResponse>(url) | |||||
| } |
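This fetcher pairs with useSWR exactly as VoiceParamConfig does above; a condensed usage sketch, where `appId` and the `'zh-CN'` default are placeholders:

```ts
// Condensed usage mirroring VoiceParamConfig: the object key carries the URL,
// and fetchAppVoices receives it as its argument.
import useSWR from 'swr'
import { fetchAppVoices } from '@/service/apps'

function useAppVoices(appId: string, language = 'zh-CN') {
  const { data: voices } = useSWR(
    { url: `/apps/${appId}/text-to-audio/voices?language=${language}` },
    fetchAppVoices,
  )
  return voices // AppVoicesListResponse | undefined while loading
}
```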
| } | } | ||||
| text_to_speech: { | text_to_speech: { | ||||
| enabled: boolean | enabled: boolean | ||||
| voice?: string | |||||
| language?: string | |||||
| } | } | ||||
| retriever_resource: { | retriever_resource: { | ||||
| enabled: boolean | enabled: boolean |
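Concretely, the `text_to_speech` section of a serialized model config would then carry the two new optional fields; the values below are illustrative, and in practice the voice id should come from the voices endpoint:

```ts
// Illustrative fragment mirroring the text_to_speech field added above;
// 'aria' is a made-up voice id, not a real provider value.
const textToSpeech: { enabled: boolean; voice?: string; language?: string } = {
  enabled: true,
  language: 'en-US',
  voice: 'aria',
}
```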