From 08d40eb24b82d8ab041874107fb0b58c41547913 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 8 Jun 2024 00:12:46 +0200 Subject: [PATCH 1/3] feat(ui): add page to talk with voice, transcription, and tts Signed-off-by: Ettore Di Giacinto --- core/http/routes/ui.go | 20 +++ core/http/static/talk.js | 174 +++++++++++++++++++++++++++ core/http/views/partials/navbar.html | 2 + core/http/views/talk.html | 104 ++++++++++++++++ 4 files changed, 300 insertions(+) create mode 100644 core/http/static/talk.js create mode 100644 core/http/views/talk.html diff --git a/core/http/routes/ui.go b/core/http/routes/ui.go index efd08315895..e0313abf17a 100644 --- a/core/http/routes/ui.go +++ b/core/http/routes/ui.go @@ -247,6 +247,26 @@ func RegisterUIRoutes(app *fiber.App, // Render index return c.Render("views/chat", summary) }) + + app.Get("/talk/", auth, func(c *fiber.Ctx) error { + backendConfigs := cl.GetAllBackendConfigs() + + if len(backendConfigs) == 0 { + // If no model is available redirect to the index which suggests how to install models + return c.Redirect("/") + } + + summary := fiber.Map{ + "Title": "LocalAI - Talk", + "ModelsConfig": backendConfigs, + "Model": backendConfigs[0].Name, + "Version": internal.PrintableVersion(), + } + + // Render index + return c.Render("views/talk", summary) + }) + app.Get("/chat/", auth, func(c *fiber.Ctx) error { backendConfigs := cl.GetAllBackendConfigs() diff --git a/core/http/static/talk.js b/core/http/static/talk.js new file mode 100644 index 00000000000..1ab98ca7657 --- /dev/null +++ b/core/http/static/talk.js @@ -0,0 +1,174 @@ + +const recordButton = document.getElementById('recordButton'); +const audioPlayback = document.getElementById('audioPlayback'); +const resetButton = document.getElementById('resetButton'); + +let mediaRecorder; +let audioChunks = []; +let isRecording = false; +let conversationHistory = []; +let resetTimer; + +function getApiKey() { + return document.getElementById('apiKey').value; +} + 
+function getModel() { + return document.getElementById('modelSelect').value; +} + +function getWhisperModel() { + return document.getElementById('whisperModelSelect').value; +} + +function getTTSModel() { + return document.getElementById('ttsModelSelect').value; +} + +function resetConversation() { + conversationHistory = []; + console.log("Conversation has been reset."); + clearTimeout(resetTimer); +} + +function setResetTimer() { + clearTimeout(resetTimer); + resetTimer = setTimeout(resetConversation, 300000); // Reset after 5 minutes +} + +recordButton.addEventListener('click', toggleRecording); +resetButton.addEventListener('click', resetConversation); + +function toggleRecording() { + if (!isRecording) { + startRecording(); + } else { + stopRecording(); + } +} + +async function startRecording() { + if (!navigator.mediaDevices) { + alert('MediaDevices API not supported!'); + return; + } + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream); + audioChunks = []; + mediaRecorder.ondataavailable = (event) => { + audioChunks.push(event.data); + }; + mediaRecorder.start(); + recordButton.textContent = 'Stop Recording'; + isRecording = true; +} + +function stopRecording() { + mediaRecorder.stop(); + mediaRecorder.onstop = async () => { + document.getElementById("loader").style.display = "block"; + const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); + const transcript = await sendAudioToWhisper(audioBlob); + console.log("Transcript:", transcript) + const responseText = await sendTextToChatGPT(transcript); + console.log("Response:", responseText) + + const ttsAudio = await getTextToSpeechAudio(responseText); + playAudioResponse(ttsAudio); + + recordButton.textContent = 'Record'; + isRecording = false; + document.getElementById("loader").style.display = "none"; + }; +} + +function submitKey(event) { + event.preventDefault(); + localStorage.setItem("key", 
document.getElementById("apiKey").value); + document.getElementById("apiKey").blur(); +} + +document.getElementById("key").addEventListener("submit", submitKey); + + +storeKey = localStorage.getItem("key"); +if (storeKey) { + document.getElementById("apiKey").value = storeKey; +} else { + document.getElementById("apiKey").value = null; +} + + +async function sendAudioToWhisper(audioBlob) { + const formData = new FormData(); + formData.append('file', audioBlob); + formData.append('model', getWhisperModel()); + API_KEY = localStorage.getItem("key"); + + const response = await fetch('/v1/audio/transcriptions', { + method: 'POST', + headers: { + 'Authorization': `Bearer ${API_KEY}` + }, + body: formData + }); + + const result = await response.json(); + console.log("Whisper result:", result) + return result.text; +} + +async function sendTextToChatGPT(text) { + conversationHistory.push({ role: "user", content: text }); + API_KEY = localStorage.getItem("key"); + + const response = await fetch('/v1/chat/completions', { + method: 'POST', + headers: { + 'Authorization': `Bearer ${API_KEY}`, + 'Content-Type': 'application/json' + }, + body: JSON.stringify({ + model: getModel(), + messages: conversationHistory + }) + }); + + const result = await response.json(); + const responseText = result.choices[0].message.content; + conversationHistory.push({ role: "assistant", content: responseText }); + + setResetTimer(); + + return responseText; +} + +async function getTextToSpeechAudio(text) { + API_KEY = localStorage.getItem("key"); + + const response = await fetch('/v1/audio/speech', { + + method: 'POST', + headers: { + 'Authorization': `Bearer ${API_KEY}`, + 'Content-Type': 'application/json' + }, + body: JSON.stringify({ + // "backend": "string", + input: text, + model: getTTSModel(), + // "voice": "string" + }) + }); + + const audioBlob = await response.blob(); + return audioBlob; // Return the blob directly +} + +function playAudioResponse(audioBlob) { + const audioUrl = 
URL.createObjectURL(audioBlob); + audioPlayback.src = audioUrl; + audioPlayback.hidden = false; + audioPlayback.play(); +} + diff --git a/core/http/views/partials/navbar.html b/core/http/views/partials/navbar.html index be238479f8b..caa1f3b77c9 100644 --- a/core/http/views/partials/navbar.html +++ b/core/http/views/partials/navbar.html @@ -20,6 +20,7 @@ Chat Generate images TTS + Talk API @@ -32,6 +33,7 @@ Chat Generate images TTS + Talk API diff --git a/core/http/views/talk.html b/core/http/views/talk.html new file mode 100644 index 00000000000..dbf9d5e4b4f --- /dev/null +++ b/core/http/views/talk.html @@ -0,0 +1,104 @@ + + + {{template "views/partials/head" .}} + + + +
+ + {{template "views/partials/navbar"}} +
+ +
+ +
+ + + +
+ + +
+
+
+ +
+
+
+ + +
+ + +
+ +
+ + +
+ + +
+ + +
+ + + + Reset conversation + + +
+
+
+
+ + From 3d6a7343d75fff360bb2e121950bec3668337bf1 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 8 Jun 2024 00:57:54 +0200 Subject: [PATCH 2/3] Enhance graphics and status reporting Signed-off-by: Ettore Di Giacinto --- core/http/static/talk.js | 16 ++++++++++++++-- core/http/views/talk.html | 10 +++++++--- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/core/http/static/talk.js b/core/http/static/talk.js index 1ab98ca7657..b11ca07a4a6 100644 --- a/core/http/static/talk.js +++ b/core/http/static/talk.js @@ -48,6 +48,7 @@ function toggleRecording() { } async function startRecording() { + document.getElementById("recording").style.display = "block"; if (!navigator.mediaDevices) { alert('MediaDevices API not supported!'); return; @@ -60,25 +61,36 @@ async function startRecording() { }; mediaRecorder.start(); recordButton.textContent = 'Stop Recording'; + // add class bg-red-500 to recordButton + recordButton.classList.add("bg-gray-500"); + isRecording = true; } function stopRecording() { mediaRecorder.stop(); mediaRecorder.onstop = async () => { + document.getElementById("recording").style.display = "none"; document.getElementById("loader").style.display = "block"; const audioBlob = new Blob(audioChunks, { type: 'audio/webm' }); + document.getElementById("statustext").textContent = "Processing audio..."; const transcript = await sendAudioToWhisper(audioBlob); - console.log("Transcript:", transcript) + console.log("Transcript:", transcript); + document.getElementById("statustext").textContent = "Seems you said: " + transcript+ ". Generating response..."; const responseText = await sendTextToChatGPT(transcript); - console.log("Response:", responseText) + + console.log("Response:", responseText); + document.getElementById("statustext").textContent = "Response generated: '" + responseText + "'. 
Generating audio response...";
 
         const ttsAudio = await getTextToSpeechAudio(responseText);
         playAudioResponse(ttsAudio);
 
         recordButton.textContent = 'Record';
+        // remove class bg-gray-500 from recordButton
+        recordButton.classList.remove("bg-gray-500");
         isRecording = false;
         document.getElementById("loader").style.display = "none";
+        document.getElementById("statustext").textContent = "Press the record button to start recording.";
     };
 }
 
diff --git a/core/http/views/talk.html b/core/http/views/talk.html
index dbf9d5e4b4f..05cc2b39ae3 100644
--- a/core/http/views/talk.html
+++ b/core/http/views/talk.html
@@ -44,8 +44,12 @@
+ - +