| from core.llm.whisper import Whisper | from core.llm.whisper import Whisper | ||||
| from models.provider import ProviderName | from models.provider import ProviderName | ||||
# Maximum accepted audio upload size, expressed in megabytes and derived
# into bytes so AudioService can compare directly against len(file_content).
FILE_SIZE = 15
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024

# Audio file extensions accepted for speech-to-text transcription
# (the set supported by OpenAI Whisper uploads).
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
| class AudioService: | class AudioService: | ||||
| file_size = len(file_content) | file_size = len(file_content) | ||||
| if file_size > FILE_SIZE_LIMIT: | if file_size > FILE_SIZE_LIMIT: | ||||
| message = f"({file_size} > {FILE_SIZE_LIMIT})" | |||||
| message = f"Audio size larger than {FILE_SIZE} mb" | |||||
| raise AudioTooLargeServiceError(message) | raise AudioTooLargeServiceError(message) | ||||
| provider_name = LLMBuilder.get_default_provider(tenant_id) | provider_name = LLMBuilder.get_default_provider(tenant_id) | ||||
| if provider_name != ProviderName.OPENAI.value: | if provider_name != ProviderName.OPENAI.value: | ||||
| raise ProviderNotSupportSpeechToTextServiceError('haha') | |||||
| raise ProviderNotSupportSpeechToTextServiceError() | |||||
| provider_service = LLMProviderService(tenant_id, provider_name) | provider_service = LLMProviderService(tenant_id, provider_name) | ||||
| buffer = io.BytesIO(file_content) | buffer = io.BytesIO(file_content) | ||||
| buffer.name = 'temp.wav' | |||||
| buffer.name = 'temp.mp3' | |||||
| return Whisper(provider_service.provider).transcribe(buffer) | return Whisper(provider_service.provider).transcribe(buffer) | ||||
from services.errors.base import BaseServiceError


class NoAudioUploadedServiceError(BaseServiceError):
    """Raised when a transcription request arrives without an audio file."""
    error_code = 'no_audio_uploaded'
    description = "Please upload your audio."
    code = 400


class AudioTooLargeServiceError(BaseServiceError):
    """Raised when the uploaded audio exceeds FILE_SIZE_LIMIT.

    The caller supplies a human-readable size message that is interpolated
    into ``description`` (note the ``{message}`` placeholder).
    """
    error_code = 'audio_too_large'
    description = "Audio size exceeded. {message}"
    code = 413


class UnsupportedAudioTypeServiceError(BaseServiceError):
    """Raised when the file extension is not in ALLOWED_EXTENSIONS."""
    error_code = 'unsupported_audio_type'
    description = "Audio type not allowed."
    code = 415


class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
    """Raised when the tenant's default model provider has no speech-to-text
    capability (only the OpenAI provider is supported)."""
    error_code = 'provider_not_support_speech_to_text'
    description = "Provider not support speech to text. {message}"
    code = 400
| import cn from 'classnames' | import cn from 'classnames' | ||||
| import Recorder from 'js-audio-recorder' | import Recorder from 'js-audio-recorder' | ||||
| import { useRafInterval } from 'ahooks' | import { useRafInterval } from 'ahooks' | ||||
| import { convertToMp3 } from './utils' | |||||
| import s from './index.module.css' | import s from './index.module.css' | ||||
| import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' | import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices' | ||||
| import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general' | import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general' | ||||
| onConverted, | onConverted, | ||||
| }: VoiceInputTypes) => { | }: VoiceInputTypes) => { | ||||
| const { t } = useTranslation() | const { t } = useTranslation() | ||||
| const recorder = useRef(new Recorder()) | |||||
| const recorder = useRef(new Recorder({ | |||||
| sampleBits: 16, | |||||
| sampleRate: 16000, | |||||
| numChannels: 1, | |||||
| compiling: false, | |||||
| })) | |||||
| const canvasRef = useRef<HTMLCanvasElement | null>(null) | const canvasRef = useRef<HTMLCanvasElement | null>(null) | ||||
| const ctxRef = useRef<CanvasRenderingContext2D | null>(null) | const ctxRef = useRef<CanvasRenderingContext2D | null>(null) | ||||
| const drawRecordId = useRef<number | null>(null) | const drawRecordId = useRef<number | null>(null) | ||||
| const canvas = canvasRef.current! | const canvas = canvasRef.current! | ||||
| const ctx = ctxRef.current! | const ctx = ctxRef.current! | ||||
| ctx.clearRect(0, 0, canvas.width, canvas.height) | ctx.clearRect(0, 0, canvas.width, canvas.height) | ||||
| const wavBlob = recorder.current.getWAVBlob() | |||||
| const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' }) | |||||
| const mp3Blob = convertToMp3(recorder.current) | |||||
| const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' }) | |||||
| const formData = new FormData() | const formData = new FormData() | ||||
| formData.append('file', wavFile) | |||||
| formData.append('file', mp3File) | |||||
| let url = '' | let url = '' | ||||
| let isPublic = false | let isPublic = false |
| import lamejs from 'lamejs' | |||||
| export const convertToMp3 = (recorder: any) => { | |||||
| const wav = lamejs.WavHeader.readHeader(recorder.getWAV()) | |||||
| const { channels, sampleRate } = wav | |||||
| const mp3enc = new lamejs.Mp3Encoder(channels, sampleRate, 128) | |||||
| const result = recorder.getChannelData() | |||||
| const buffer = [] | |||||
| const leftData = result.left && new Int16Array(result.left.buffer, 0, result.left.byteLength / 2) | |||||
| const rightData = result.right && new Int16Array(result.right.buffer, 0, result.right.byteLength / 2) | |||||
| const remaining = leftData.length + (rightData ? rightData.length : 0) | |||||
| const maxSamples = 1152 | |||||
| for (let i = 0; i < remaining; i += maxSamples) { | |||||
| const left = leftData.subarray(i, i + maxSamples) | |||||
| let right = null | |||||
| let mp3buf = null | |||||
| if (channels === 2) { | |||||
| right = rightData.subarray(i, i + maxSamples) | |||||
| mp3buf = mp3enc.encodeBuffer(left, right) | |||||
| } | |||||
| else { | |||||
| mp3buf = mp3enc.encodeBuffer(left) | |||||
| } | |||||
| if (mp3buf.length > 0) | |||||
| buffer.push(mp3buf) | |||||
| } | |||||
| const enc = mp3enc.flush() | |||||
| if (enc.length > 0) | |||||
| buffer.push(enc) | |||||
| return new Blob(buffer, { type: 'audio/mp3' }) | |||||
| } |
// lamejs ships without TypeScript type definitions; this ambient declaration
// registers it as an untyped module so `import lamejs from 'lamejs'` compiles.
declare module 'lamejs';
| "swr": "^2.1.0", | "swr": "^2.1.0", | ||||
| "tailwindcss": "^3.2.7", | "tailwindcss": "^3.2.7", | ||||
| "typescript": "4.9.5", | "typescript": "4.9.5", | ||||
    "use-context-selector": "^1.4.1",
    "lamejs": "1.2.0"
| }, | }, | ||||
| "devDependencies": { | "devDependencies": { | ||||
| "@antfu/eslint-config": "^0.36.0", | "@antfu/eslint-config": "^0.36.0", |