瀏覽代碼

Add tongyi tts&tts function optimization (#2177)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
tags/0.5.1
Charlie.Wei 1 年之前
父節點
當前提交
ac4bb5c35f
沒有連結到貢獻者的電子郵件帳戶。

+ 98
- 0
api/core/model_runtime/model_providers/__base/tts_model.py 查看文件

@@ -1,8 +1,13 @@
import uuid
import hashlib
import subprocess
from abc import abstractmethod
from typing import Optional

from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.model_providers.__base.ai_model import AIModel
from core.model_runtime.entities.model_entities import ModelPropertyKey


class TTSModel(AIModel):
@@ -40,3 +45,96 @@ class TTSModel(AIModel):
:return: translated audio file
"""
raise NotImplementedError

def _get_model_voice(self, model: str, credentials: dict) -> any:
    """
    Look up the default voice configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the DEFAULT_VOICE model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.DEFAULT_VOICE not in properties:
        return None
    return properties[ModelPropertyKey.DEFAULT_VOICE]

def _get_model_audio_type(self, model: str, credentials: dict) -> str:
    """
    Look up the audio type configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the audio type model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    # AUDOI_TYPE is the member name as referenced by this code; it has to
    # stay in sync with the ModelPropertyKey definition.
    if ModelPropertyKey.AUDOI_TYPE not in properties:
        return None
    return properties[ModelPropertyKey.AUDOI_TYPE]

def _get_model_word_limit(self, model: str, credentials: dict) -> int:
    """
    Look up the word limit configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the WORD_LIMIT model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.WORD_LIMIT not in properties:
        return None
    return properties[ModelPropertyKey.WORD_LIMIT]

def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
    """
    Look up the maximum number of workers configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the MAX_WORKERS model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.MAX_WORKERS not in properties:
        return None
    return properties[ModelPropertyKey.MAX_WORKERS]

@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
    """
    Lazily cut ``text`` into chunks that end on a delimiter character.

    A chunk is emitted only when a delimiter is reached after at least
    ``limit`` characters have been counted since the previous cut. Delimiters
    met earlier stay inside the chunk and are counted like any other
    character, so a chunk may be longer than ``limit``. Text left over after
    the last cut is emitted as a final chunk.

    :param text: text to split
    :param limit: number of counted characters required before a delimiter
        may close a chunk
    :param delimiters: characters allowed to close a chunk; a built-in set of
        sentence-ending punctuation plus newline is used when None
    :return: generator of text chunks
    """
    boundary_chars = set('。!?;\n') if delimiters is None else delimiters

    pending = []
    counted = 0
    for ch in text:
        pending.append(ch)
        if ch in boundary_chars and counted >= limit:
            yield ''.join(pending)
            pending = []
            counted = 0
        else:
            counted += 1

    if pending:
        yield ''.join(pending)

@staticmethod
def _is_ffmpeg_installed():
    """
    Verify that the ``ffmpeg`` executable can be run.

    :return: True when ``ffmpeg -version`` runs and prints a version banner
    :raises InvokeBadRequestError: when ffmpeg cannot be executed, exits with
        an error, or its output does not contain the expected banner
    """
    not_installed_message = ("ffmpeg is not installed, "
                             "details: https://docs.dify.ai/getting-started/install-self-hosted"
                             "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
    try:
        # An argument list avoids spawning a shell just to probe for a binary.
        output = subprocess.check_output(["ffmpeg", "-version"])
        banner_found = "ffmpeg version" in output.decode("utf-8")
    except Exception as ex:
        # Any failure to run the probe is reported to the caller as "not installed";
        # the original exception is kept as the cause for debugging.
        raise InvokeBadRequestError(not_installed_message) from ex

    if not banner_found:
        raise InvokeBadRequestError(not_installed_message)
    return True

# Todo: To improve the streaming function
@staticmethod
def _get_file_name(file_content: str) -> str:
    """
    Derive a deterministic file name from text content.

    The content is hashed with SHA-256 and the hex digest is turned into a
    version-5 UUID under a fixed namespace, so equal content always produces
    the same name.

    :param file_content: text the name is derived from
    :return: the UUID as a string
    """
    content_digest = hashlib.sha256(file_content.encode()).hexdigest()
    return str(uuid.uuid5(uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31'), content_digest))

+ 3
- 104
api/core/model_runtime/model_providers/openai/tts/tts.py 查看文件

@@ -1,18 +1,13 @@
import uuid
import hashlib
import subprocess
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment

from core.model_runtime.entities.model_entities import ModelPropertyKey
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI

from typing_extensions import Literal
from flask import Response, stream_with_context
from openai import OpenAI
import concurrent.futures
@@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
"""
Model class for OpenAI Speech to text model.
"""

def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
user: Optional[str] = None) -> any:
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
"""
_invoke text2speech model

@@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
"""
_tts_invoke text2speech model

@@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
raise InvokeBadRequestError(str(ex))

# Todo: To improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
user: Optional[str] = None) -> any:
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
"""
_tts_invoke_streaming text2speech model

@@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
raise InvokeBadRequestError(str(ex))

def _get_model_voice(self, model: str, credentials: dict) -> Literal[
        "alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
    """
    Look up the default voice configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the DEFAULT_VOICE model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.DEFAULT_VOICE not in properties:
        return None
    return properties[ModelPropertyKey.DEFAULT_VOICE]

def _get_model_audio_type(self, model: str, credentials: dict) -> str:
    """
    Look up the audio type configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the audio type model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    # AUDOI_TYPE is the member name as referenced by this code; it has to
    # stay in sync with the ModelPropertyKey definition.
    if ModelPropertyKey.AUDOI_TYPE not in properties:
        return None
    return properties[ModelPropertyKey.AUDOI_TYPE]

def _get_model_word_limit(self, model: str, credentials: dict) -> int:
    """
    Look up the word limit configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the WORD_LIMIT model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.WORD_LIMIT not in properties:
        return None
    return properties[ModelPropertyKey.WORD_LIMIT]

def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
    """
    Look up the maximum number of workers configured for a TTS model.

    :param model: model name
    :param credentials: model credentials
    :return: the MAX_WORKERS model property, or None when the schema is
        missing or does not declare one
    """
    schema = self.get_model_schema(model, credentials)
    if not schema:
        return None

    properties = schema.model_properties
    if ModelPropertyKey.MAX_WORKERS not in properties:
        return None
    return properties[ModelPropertyKey.MAX_WORKERS]

@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
    """
    Lazily cut ``text`` into chunks that end on a delimiter character.

    A chunk is emitted only when a delimiter is reached after at least
    ``limit`` characters have been counted since the previous cut. Delimiters
    met earlier stay inside the chunk and are counted like any other
    character, so a chunk may be longer than ``limit``. Text left over after
    the last cut is emitted as a final chunk.

    :param text: text to split
    :param limit: number of counted characters required before a delimiter
        may close a chunk
    :param delimiters: characters allowed to close a chunk; a built-in set of
        sentence-ending punctuation plus newline is used when None
    :return: generator of text chunks
    """
    boundary_chars = set('。!?;\n') if delimiters is None else delimiters

    pending = []
    counted = 0
    for ch in text:
        pending.append(ch)
        if ch in boundary_chars and counted >= limit:
            yield ''.join(pending)
            pending = []
            counted = 0
        else:
            counted += 1

    if pending:
        yield ''.join(pending)

@staticmethod
def _get_file_name(file_content: str) -> str:
    """
    Derive a deterministic file name from text content.

    The content is hashed with SHA-256 and the hex digest is turned into a
    version-5 UUID under a fixed namespace, so equal content always produces
    the same name.

    :param file_content: text the name is derived from
    :return: the UUID as a string
    """
    content_digest = hashlib.sha256(file_content.encode()).hexdigest()
    return str(uuid.uuid5(uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31'), content_digest))

def _process_sentence(self, sentence: str, model: str, credentials: dict):
"""
_tts_invoke openai text2speech model api
@@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
if isinstance(response.read(), bytes):
return response.read()

@staticmethod
def _is_ffmpeg_installed():
    """
    Verify that the ``ffmpeg`` executable can be run.

    :return: True when ``ffmpeg -version`` runs and prints a version banner
    :raises InvokeBadRequestError: when ffmpeg cannot be executed, exits with
        an error, or its output does not contain the expected banner
    """
    not_installed_message = ("ffmpeg is not installed, "
                             "details: https://docs.dify.ai/getting-started/install-self-hosted"
                             "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
    try:
        # An argument list avoids spawning a shell just to probe for a binary.
        output = subprocess.check_output(["ffmpeg", "-version"])
        banner_found = "ffmpeg version" in output.decode("utf-8")
    except Exception as ex:
        # Any failure to run the probe is reported to the caller as "not installed";
        # the original exception is kept as the cause for debugging.
        raise InvokeBadRequestError(not_installed_message) from ex

    if not banner_found:
        raise InvokeBadRequestError(not_installed_message)
    return True

+ 23
- 0
api/core/model_runtime/model_providers/tongyi/_common.py 查看文件

@@ -0,0 +1,23 @@
from core.model_runtime.errors.invoke import InvokeError


class _CommonTongyi:
    """
    Helpers shared by the Tongyi model classes.
    """

    @staticmethod
    def _to_credential_kwargs(credentials: dict) -> dict:
        """
        Extract the credential entries used to call the provider.

        :param credentials: provider credentials; must contain 'dashscope_api_key'
        :return: dict holding only the 'dashscope_api_key' entry
        :raises KeyError: when 'dashscope_api_key' is absent from credentials
        """
        return {
            "dashscope_api_key": credentials['dashscope_api_key'],
        }

    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        """
        Map model invoke error to unified error
        The key is the error type thrown to the caller
        The value is the error type thrown by the model,
        which needs to be converted into a unified error type for the caller.

        :return: Invoke error mapping; empty because no provider-specific
            exception types are mapped here
        """
        # Return an empty mapping instead of None so the value matches the
        # annotated dict type and can be iterated by whoever consumes it.
        return {}

+ 1
- 0
api/core/model_runtime/model_providers/tongyi/tongyi.yaml 查看文件

@@ -16,6 +16,7 @@ help:
en_US: https://dashscope.console.aliyun.com/api-key_management
supported_model_types:
- llm
- tts
configurate_methods:
- predefined-model
provider_credential_schema:

+ 0
- 0
api/core/model_runtime/model_providers/tongyi/tts/__init__.py 查看文件


+ 7
- 0
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml 查看文件

@@ -0,0 +1,7 @@
model: tts-1
model_type: tts
model_properties:
default_voice: 'sambert-zhiru-v1' # Voice reference: configure according to https://help.aliyun.com/zh/dashscope/model-list
word_limit: 120
audio_type: 'mp3'
max_workers: 5

+ 142
- 0
api/core/model_runtime/model_providers/tongyi/tts/tts.py 查看文件

@@ -0,0 +1,142 @@
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment

from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi

import dashscope
from flask import Response, stream_with_context
import concurrent.futures


class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
    """
    Model class for Tongyi text to speech model.
    """

    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
                user: Optional[str] = None) -> any:
        """
        _invoke text2speech model

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        :raises InvokeBadRequestError: when ffmpeg is unavailable or synthesis fails
        """
        self._is_ffmpeg_installed()
        audio_type = self._get_model_audio_type(model, credentials)
        if not streaming:
            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)

        audio_chunks = self._tts_invoke_streaming(model=model,
                                                  credentials=credentials,
                                                  content_text=content_text,
                                                  user=user)
        return Response(stream_with_context(audio_chunks), status=200, mimetype=f'audio/{audio_type}')

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
        validate credentials text2speech model

        Runs a short non-streaming synthesis; any failure is reported as a
        credentials validation failure.

        :param model: model name
        :param credentials: model credentials
        :param user: unique user id
        :raises CredentialsValidateFailedError: when the trial synthesis fails
        """
        try:
            self._tts_invoke(
                model=model,
                credentials=credentials,
                content_text='Hello world!',
                user=user
            )
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
        """
        _tts_invoke text2speech model

        Splits the text into sentences, synthesizes them concurrently and
        joins the resulting audio into a single response.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: text translated to audio file
        :raises InvokeBadRequestError: when synthesis or audio assembly fails,
            or when no audio was produced at all
        """
        audio_type = self._get_model_audio_type(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)

        try:
            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))

            # Create a thread pool and map the function to the list of sentences
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
                                           credentials=credentials, audio_type=audio_type)
                           for sentence in sentences]
                # result() re-raises worker exceptions; the handler below wraps them.
                # Iterating the futures in submission order keeps the sentence order.
                audio_bytes_list = [future.result() for future in futures]

            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            if not audio_segments:
                # reduce() without an initial value fails on an empty list with an
                # opaque message; report the real condition instead.
                raise InvokeBadRequestError("No audio data was generated for the given text")

            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
        except InvokeBadRequestError:
            raise
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    # TODO: improve the streaming function
    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              user: Optional[str] = None) -> any:
        """
        _tts_invoke_streaming text2speech model

        Generator: synthesizes the text sentence by sentence and yields the
        audio bytes of each sentence in order, so it can be handed to
        ``stream_with_context``.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: generator of audio byte chunks, one per synthesized sentence
        :raises InvokeBadRequestError: when synthesis fails (raised while iterating)
        """
        # NOTE(review): dashscope.api_key is module-level state, so concurrent requests
        # using different keys may interfere - confirm whether a per-call key is supported.
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        audio_type = self._get_model_audio_type(model, credentials)
        try:
            for sentence in self._split_text_into_sentences(text=content_text, limit=word_limit):
                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                                      text=sentence.strip(),
                                                                      format=audio_type, word_timestamp_enabled=True,
                                                                      phoneme_timestamp_enabled=True)
                audio_data = response.get_audio_data()
                if isinstance(audio_data, bytes):
                    # Yield rather than return: returning here would stop after the
                    # first sentence and hand raw bytes to the streaming wrapper.
                    yield audio_data
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
        """
        _tts_invoke Tongyi text2speech model api

        :param model: model name
        :param credentials: model credentials
        :param sentence: text content to be translated
        :param audio_type: audio file type
        :return: audio bytes for the sentence, or None when the response carries no audio bytes
        """
        # NOTE(review): this runs in worker threads and writes module-level state;
        # confirm whether the SDK accepts a per-call key instead.
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)

        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                              text=sentence.strip(), format=audio_type)
        audio_data = response.get_audio_data()
        if isinstance(audio_data, bytes):
            return audio_data
        return None

+ 1
- 1
web/app/components/develop/template/template.en.mdx 查看文件

@@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
/>
<Row>
<Col>
Text to speech, only supports openai model.
Text to speech.

### Request Body


+ 1
- 1
web/app/components/develop/template/template.zh.mdx 查看文件

@@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
文字转语音,仅支持 openai 模型
文字转语音。

### Request Body


+ 1
- 1
web/app/components/develop/template/template_chat.en.mdx 查看文件

@@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
/>
<Row>
<Col>
Text to speech, only supports openai model.
Text to speech.

### Request Body


+ 1
- 1
web/app/components/develop/template/template_chat.zh.mdx 查看文件

@@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
文字转语音,仅支持 openai 模型
文字转语音。

### Request Body


Loading…
取消
儲存