@@ -6,7 +6,8 @@ from services.errors.audio import NoAudioUploadedServiceError, AudioTooLargeServ
 from core.llm.whisper import Whisper
 from models.provider import ProviderName
 
-FILE_SIZE_LIMIT = 1 * 1024 * 1024
+FILE_SIZE = 15
+FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
 
 class AudioService:
@@ -23,17 +24,17 @@ class AudioService:
         file_size = len(file_content)
 
         if file_size > FILE_SIZE_LIMIT:
-            message = f"({file_size} > {FILE_SIZE_LIMIT})"
+            message = f"Audio size larger than {FILE_SIZE} mb"
             raise AudioTooLargeServiceError(message)
 
         provider_name = LLMBuilder.get_default_provider(tenant_id)
 
         if provider_name != ProviderName.OPENAI.value:
-            raise ProviderNotSupportSpeechToTextServiceError('haha')
+            raise ProviderNotSupportSpeechToTextServiceError()
 
         provider_service = LLMProviderService(tenant_id, provider_name)
 
         buffer = io.BytesIO(file_content)
-        buffer.name = 'temp.wav'
+        buffer.name = 'temp.mp3'
 
         return Whisper(provider_service.provider).transcribe(buffer)
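For orientation, here is a minimal sketch of how a client might exercise this service from the browser. The endpoint path, route placeholder, and `{ text }` response shape are assumptions for illustration, not taken from this diff:

```ts
// Hypothetical client call for the transcribe flow above.
// Route and response field are assumed, not confirmed by this PR.
async function transcribeAudio(file: File): Promise<string> {
  const formData = new FormData()
  // File must use one of ALLOWED_EXTENSIONS and stay under FILE_SIZE (15 MB)
  formData.append('file', file)

  const res = await fetch('/apps/some-app-id/audio-to-text', { // hypothetical route
    method: 'POST',
    body: formData,
  })
  if (!res.ok)
    throw new Error(`Transcription failed with status ${res.status}`) // e.g. 413 for oversized audio
  const data = await res.json()
  return data.text // assumed response field
}
```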
@@ -1,23 +1,13 @@
-from services.errors.base import BaseServiceError
-
-
-class NoAudioUploadedServiceError(BaseServiceError):
-    error_code = 'no_audio_uploaded'
-    description = "Please upload your audio."
-    code = 400
+class NoAudioUploadedServiceError(Exception):
+    pass
 
 
-class AudioTooLargeServiceError(BaseServiceError):
-    error_code = 'audio_too_large'
-    description = "Audio size exceeded. {message}"
-    code = 413
+class AudioTooLargeServiceError(Exception):
+    pass
 
 
-class UnsupportedAudioTypeServiceError(BaseServiceError):
-    error_code = 'unsupported_audio_type'
-    description = "Audio type not allowed."
-    code = 415
+class UnsupportedAudioTypeServiceError(Exception):
+    pass
 
 
-class ProviderNotSupportSpeechToTextServiceError(BaseServiceError):
-    error_code = 'provider_not_support_speech_to_text'
-    description = "Provider not support speech to text. {message}"
-    code = 400
+class ProviderNotSupportSpeechToTextServiceError(Exception):
+    pass
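The HTTP metadata (`error_code`, `description`, `code`) is stripped from the service-layer exceptions here, which implies it now lives at the API boundary. Assuming the API still surfaces the same `error_code` strings shown in the removed lines, a frontend might map them to messages like this — a sketch, not code from this PR:

```ts
// Sketch of client-side handling for the audio error codes above.
// Assumes the API returns { code: string } for these failures — not confirmed by this diff.
const audioErrorMessages: Record<string, string> = {
  no_audio_uploaded: 'Please upload your audio.',
  audio_too_large: 'Audio size exceeded (max 15 MB).',
  unsupported_audio_type: 'Audio type not allowed.',
  provider_not_support_speech_to_text: 'Provider does not support speech to text.',
}

function describeAudioError(code: string): string {
  return audioErrorMessages[code] ?? 'Unknown audio error.'
}
```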
@@ -4,6 +4,7 @@ import { useParams, usePathname } from 'next/navigation'
 import cn from 'classnames'
 import Recorder from 'js-audio-recorder'
 import { useRafInterval } from 'ahooks'
+import { convertToMp3 } from './utils'
 import s from './index.module.css'
 import { StopCircle } from '@/app/components/base/icons/src/vender/solid/mediaAndDevices'
 import { Loading02, XClose } from '@/app/components/base/icons/src/vender/line/general'
@@ -19,7 +20,12 @@ const VoiceInput = ({
   onConverted,
 }: VoiceInputTypes) => {
   const { t } = useTranslation()
-  const recorder = useRef(new Recorder())
+  const recorder = useRef(new Recorder({
+    sampleBits: 16,
+    sampleRate: 16000,
+    numChannels: 1,
+    compiling: false,
+  }))
   const canvasRef = useRef<HTMLCanvasElement | null>(null)
   const ctxRef = useRef<CanvasRenderingContext2D | null>(null)
   const drawRecordId = useRef<number | null>(null)
@@ -75,10 +81,10 @@ const VoiceInput = ({
     const canvas = canvasRef.current!
     const ctx = ctxRef.current!
     ctx.clearRect(0, 0, canvas.width, canvas.height)
-    const wavBlob = recorder.current.getWAVBlob()
-    const wavFile = new File([wavBlob], 'a.wav', { type: 'audio/wav' })
+    const mp3Blob = convertToMp3(recorder.current)
+    const mp3File = new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' })
     const formData = new FormData()
-    formData.append('file', wavFile)
+    formData.append('file', mp3File)
 
     let url = ''
     let isPublic = false
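Pinning the recorder to 16-bit / 16 kHz mono and then encoding to MP3 roughly halves the upload size versus raw WAV at the same settings. A back-of-envelope check (my arithmetic, not from the PR):

```ts
// Rough per-second payload sizes for the recorder settings above.
const sampleRate = 16000                                       // Hz, as configured
const bytesPerSample = 2                                       // sampleBits: 16
const channels = 1                                             // numChannels: 1
const wavBytesPerSec = sampleRate * bytesPerSample * channels  // 32,000 B/s raw PCM/WAV
const mp3BytesPerSec = 128_000 / 8                             // 16,000 B/s at the 128 kbps used in utils.ts
console.log(`WAV ~${wavBytesPerSec} B/s vs MP3 ~${mp3BytesPerSec} B/s`)
```

Against the backend's 15 MB cap, that is roughly 16 minutes of MP3 audio versus about 8 minutes of WAV.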
@@ -0,0 +1,38 @@
+import lamejs from 'lamejs'
+
+export const convertToMp3 = (recorder: any) => {
+  const wav = lamejs.WavHeader.readHeader(recorder.getWAV())
+  const { channels, sampleRate } = wav
+  const mp3enc = new lamejs.Mp3Encoder(channels, sampleRate, 128)
+  const result = recorder.getChannelData()
+  const buffer = []
+
+  const leftData = result.left && new Int16Array(result.left.buffer, 0, result.left.byteLength / 2)
+  const rightData = result.right && new Int16Array(result.right.buffer, 0, result.right.byteLength / 2)
+  const remaining = leftData.length + (rightData ? rightData.length : 0)
+
+  const maxSamples = 1152
+  for (let i = 0; i < remaining; i += maxSamples) {
+    const left = leftData.subarray(i, i + maxSamples)
+    let right = null
+    let mp3buf = null
+
+    if (channels === 2) {
+      right = rightData.subarray(i, i + maxSamples)
+      mp3buf = mp3enc.encodeBuffer(left, right)
+    }
+    else {
+      mp3buf = mp3enc.encodeBuffer(left)
+    }
+
+    if (mp3buf.length > 0)
+      buffer.push(mp3buf)
+  }
+
+  const enc = mp3enc.flush()
+
+  if (enc.length > 0)
+    buffer.push(enc)
+
+  return new Blob(buffer, { type: 'audio/mp3' })
+}
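Two details worth noting: `maxSamples = 1152` matches one MPEG-1 Layer III frame, so the loop feeds the encoder whole frames at a time, and because the recorder is configured mono (`numChannels: 1`), the left-channel-only branch is the one actually exercised. A brief usage sketch mirroring the component change above — the filename is arbitrary:

```ts
// Minimal sketch of the record-then-convert flow, assuming the same recorder settings.
import Recorder from 'js-audio-recorder'
import { convertToMp3 } from './utils'

const recorder = new Recorder({ sampleBits: 16, sampleRate: 16000, numChannels: 1, compiling: false })

async function recordToMp3(): Promise<File> {
  await recorder.start()
  // ... user speaks, then:
  recorder.stop()
  const mp3Blob = convertToMp3(recorder) // encode the captured PCM to MP3
  return new File([mp3Blob], 'temp.mp3', { type: 'audio/mp3' })
}
```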
@@ -0,0 +1 @@
+declare module 'lamejs';
@@ -81,7 +81,8 @@
     "swr": "^2.1.0",
     "tailwindcss": "^3.2.7",
     "typescript": "4.9.5",
-    "use-context-selector": "^1.4.1"
+    "use-context-selector": "^1.4.1",
+    "lamejs": "1.2.0"
   },
   "devDependencies": {
     "@antfu/eslint-config": "^0.36.0",