Add a per-request `model` form field to the /transcriptions endpoint, add a
`transcribe` client helper that POSTs a file and model name to
`${OPENAI_API_BASE_URL}/transcriptions`, and add an STT model setting to the
audio settings panel.

The `model` form field defaults to WHISPER_MODEL so that clients which do not
send it keep getting the previously hard-coded model instead of a validation
error.

NOTE(review): `model` is client-supplied and is passed straight through as
`model_size_or_path`; consider validating it against an allow-list before use.

diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py
index 0f65a551e2..c81b10cf23 100644
--- a/backend/apps/audio/main.py
+++ b/backend/apps/audio/main.py
@@ -170,6 +170,7 @@ async def speech(request: Request, user=Depends(get_verified_user)):
 @app.post("/transcriptions")
 def transcribe(
     file: UploadFile = File(...),
+    model: str = Form(WHISPER_MODEL),
     user=Depends(get_current_user),
 ):
     log.info(f"file.content_type: {file.content_type}")
@@ -189,7 +190,7 @@ def transcribe(
             f.close()
 
         whisper_kwargs = {
-            "model_size_or_path": WHISPER_MODEL,
+            "model_size_or_path": model,
             "device": whisper_device_type,
             "compute_type": "int8",
             "download_root": WHISPER_MODEL_DIR,
diff --git a/src/lib/apis/openai/index.ts b/src/lib/apis/openai/index.ts
index 2a52ebb320..26c089622f 100644
--- a/src/lib/apis/openai/index.ts
+++ b/src/lib/apis/openai/index.ts
@@ -386,6 +386,41 @@ export const generateTitle = async (
 	return res?.choices[0]?.message?.content.replace(/["']/g, '') ?? 'New Chat';
 };
 
+export const transcribe = async (
+	token: string = '',
+	file: File,
+	model: string = 'whisper-1'
+) => {
+	let error = null;
+
+	const formData = new FormData();
+	formData.append('file', file);
+	formData.append('model', model);
+
+	const res = await fetch(`${OPENAI_API_BASE_URL}/transcriptions`, {
+		method: 'POST',
+		headers: {
+			Authorization: `Bearer ${token}`
+		},
+		body: formData
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			console.log(err);
+			error = err;
+			return null;
+		});
+
+	if (error) {
+		throw error;
+	}
+
+	return res;
+};
+
 export const generateSearchQuery = async (
 	token: string = '',
 	model: string,
diff --git a/src/lib/components/chat/Settings/Audio.svelte b/src/lib/components/chat/Settings/Audio.svelte
index 039b710afe..d04f959559 100644
--- a/src/lib/components/chat/Settings/Audio.svelte
+++ b/src/lib/components/chat/Settings/Audio.svelte
@@ -16,8 +16,9 @@
 	let OpenAIKey = '';
 	let OpenAISpeaker = '';
 
- let STTEngines = ['', 'openai']; + let STTEngines = ['', 'openai', 'whisper-openai']; let STTEngine = ''; + let STTModel = ''; let conversationMode = false; let speechAutoSend = false; @@ -144,6 +145,7 @@ saveSettings({ audio: { STTEngine: STTEngine !== '' ? STTEngine : undefined, + STTModel: STTModel !== '' ? STTModel : undefined, TTSEngine: TTSEngine !== '' ? TTSEngine : undefined, speaker: (TTSEngine === 'openai' ? OpenAISpeaker : speaker) !== '' @@ -162,7 +164,7 @@
{$i18n.t('STT Settings')}
-
+
{$i18n.t('Speech-to-Text Engine')}
+ {#if STTEngine === 'whisper-openai'} +
+
{$i18n.t('STT Model Name')}
+
+ +
+
+ {/if} +
{$i18n.t('Conversation Mode')}