From f245c6269227a0febd72ebc9d2c067173183ed62 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Sun, 11 Feb 2024 02:12:49 -0800 Subject: [PATCH] feat: whisper voice input --- src/lib/components/chat/MessageInput.svelte | 217 +++++++++++++----- .../chat/Messages/ResponseMessage.svelte | 6 +- src/lib/components/chat/Settings/Voice.svelte | 72 ++++-- 3 files changed, 219 insertions(+), 76 deletions(-) diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index aae99992..29354610 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -35,7 +35,6 @@ export let fileUploadEnabled = true; export let speechRecognitionEnabled = true; - export let speechRecognitionListening = false; export let prompt = ''; export let messages = []; @@ -51,62 +50,170 @@ } } + let mediaRecorder; + let audioChunks = []; + let isRecording = false; + const MIN_DECIBELS = -45; + + const startRecording = async () => { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream); + mediaRecorder.onstart = () => { + isRecording = true; + console.log('Recording started'); + }; + mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data); + mediaRecorder.onstop = async () => { + isRecording = false; + console.log('Recording stopped'); + + // Create a blob from the audio chunks + const audioBlob = new Blob(audioChunks, { type: 'audio/wav' }); + + const file = blobToFile(audioBlob, 'recording.wav'); + + const res = await transcribeAudio(localStorage.token, file).catch((error) => { + toast.error(error); + return null; + }); + + if (res) { + prompt = res.text; + await tick(); + + const inputElement = document.getElementById('chat-textarea'); + inputElement?.focus(); + + if (prompt !== '' && $settings?.speechAutoSend === true) { + submitPrompt(prompt, user); + } + } + + // saveRecording(audioBlob); + audioChunks = []; + }; + + // Start recording + mediaRecorder.start(); + + // Monitor silence + monitorSilence(stream); + }; + + const monitorSilence = (stream) => { + const audioContext = new AudioContext(); + const audioStreamSource = audioContext.createMediaStreamSource(stream); + const analyser = audioContext.createAnalyser(); + analyser.minDecibels = MIN_DECIBELS; + audioStreamSource.connect(analyser); + + const bufferLength = analyser.frequencyBinCount; + const domainData = new Uint8Array(bufferLength); + + let lastSoundTime = Date.now(); + + const detectSound = () => { + analyser.getByteFrequencyData(domainData); + + if (domainData.some((value) => value > 0)) { + lastSoundTime = Date.now(); + } + + if (isRecording && Date.now() - lastSoundTime > 3000) { + mediaRecorder.stop(); + audioContext.close(); + return; + } + + window.requestAnimationFrame(detectSound); + }; + + window.requestAnimationFrame(detectSound); + }; + + const saveRecording = (blob) => { + const url = URL.createObjectURL(blob); + const a = document.createElement('a'); + document.body.appendChild(a); + a.style = 'display: none'; + a.href = url; + a.download = 'recording.wav'; + a.click(); + window.URL.revokeObjectURL(url); + }; + const speechRecognitionHandler = () => { // Check if SpeechRecognition is supported - if (speechRecognitionListening) { - speechRecognition.stop(); + if (isRecording) { + if (speechRecognition) { + speechRecognition.stop(); + } + + if (mediaRecorder) { + mediaRecorder.stop(); + } } else { - if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in 
window) {
-				// Create a SpeechRecognition object
-				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+			isRecording = true;

-				// Set continuous to true for continuous recognition
-				speechRecognition.continuous = true;
-
-				// Set the timeout for turning off the recognition after inactivity (in milliseconds)
-				const inactivityTimeout = 3000; // 3 seconds
-
-				let timeoutId;
-				// Start recognition
-				speechRecognition.start();
-				speechRecognitionListening = true;
-
-				// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
-					// Clear the inactivity timeout
-					clearTimeout(timeoutId);
-
-					// Handle recognized speech
-					console.log(event);
-					const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
-					prompt = `${prompt}${transcript}`;
-
-					// Restart the inactivity timeout
-					timeoutId = setTimeout(() => {
-						console.log('Speech recognition turned off due to inactivity.');
-						speechRecognition.stop();
-					}, inactivityTimeout);
-				};
-
-				// Event triggered when recognition is ended
-				speechRecognition.onend = function () {
-					// Restart recognition after it ends
-					console.log('recognition ended');
-					speechRecognitionListening = false;
-					if (prompt !== '' && $settings?.speechAutoSend === true) {
-						submitPrompt(prompt, user);
-					}
-				};
-
-				// Event triggered when an error occurs
-				speechRecognition.onerror = function (event) {
-					console.log(event);
-					toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
-				};
+			// Route to the configured STT engine, if any. The parentheses matter:
+			// without them, `a ?? '' !== ''` parses as `a ?? ('' !== '')`.
+			if (($settings?.voice?.STTEngine ?? '') !== '') {
+				startRecording();
 			} else {
-				toast.error('SpeechRecognition API is not supported in this browser.');
+				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
+					// Create a SpeechRecognition object
+					speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+
+					// Set continuous to true for continuous recognition
+					speechRecognition.continuous = true;
+
+					// Set the timeout for turning off the recognition after inactivity (in milliseconds)
+					const inactivityTimeout = 3000; // 3 seconds
+
+					let timeoutId;
+					// Start recognition
+					speechRecognition.start();
+
+					// Event triggered when speech is recognized
+					speechRecognition.onresult = async (event) => {
+						// Clear the inactivity timeout
+						clearTimeout(timeoutId);
+
+						// Handle recognized speech
+						console.log(event);
+						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
+
+						prompt = `${prompt}${transcript}`;
+
+						await tick();
+						const inputElement = document.getElementById('chat-textarea');
+						inputElement?.focus();
+
+						// Restart the inactivity timeout
+						timeoutId = setTimeout(() => {
+							console.log('Speech recognition turned off due to inactivity.');
+							speechRecognition.stop();
+						}, inactivityTimeout);
+					};
+
+					// Event triggered when recognition has ended
+					speechRecognition.onend = function () {
+						// Recognition stopped: reset the recording state and auto-send if enabled
+						console.log('recognition ended');
+						isRecording = false;
+						if (prompt !== '' && $settings?.speechAutoSend === true) {
+							submitPrompt(prompt, user);
+						}
+					};
+
+					// Event triggered when an error occurs
+					speechRecognition.onerror = function (event) {
+						console.log(event);
+						toast.error(`Speech recognition error: ${event.error}`);
+						isRecording = false;
+					};
+				} else {
+					isRecording = false;
+					toast.error('SpeechRecognition API is not supported in this browser.');
+				}
 			}
 		}
 	};
@@ -550,7 +657,7 @@
 					: ' pl-4'} rounded-xl resize-none h-[48px]"
 				placeholder={chatInputPlaceholder
!== ''
 						? chatInputPlaceholder
-						: speechRecognitionListening
+						: isRecording
 							? 'Listening...'
 							: 'Send a message'}
 				bind:value={prompt}
@@ -659,6 +766,10 @@
 					e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 					user = null;
 				}}
+				on:focus={(e) => {
+					e.target.style.height = '';
+					e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
+				}}
 				on:paste={(e) => {
 					const clipboardData = e.clipboardData || window.clipboardData;
@@ -696,7 +807,7 @@
 							speechRecognitionHandler();
 						}}
 					>
-						{#if speechRecognitionListening}
+						{#if isRecording}
diff --git a/src/lib/components/chat/Messages/ResponseMessage.svelte b/src/lib/components/chat/Messages/ResponseMessage.svelte
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@@ -179,7 +179,7 @@
 		for (const [idx, sentence] of sentences.entries()) {
 			const res = await synthesizeOpenAISpeech(
 				localStorage.token,
-				$settings?.speech?.speaker,
+				$settings?.voice?.speaker,
 				sentence
 			).catch((error) => {
 				toast.error(error);
@@ -204,7 +204,7 @@
 			clearInterval(getVoicesLoop);
 			const voice =
-				voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+				voices?.filter((v) => v.name === $settings?.voice?.speaker)?.at(0) ?? undefined;
 			const speak = new SpeechSynthesisUtterance(message.content);
diff --git a/src/lib/components/chat/Settings/Voice.svelte b/src/lib/components/chat/Settings/Voice.svelte
index 0dc7f0c5..5867b917 100644
--- a/src/lib/components/chat/Settings/Voice.svelte
+++ b/src/lib/components/chat/Settings/Voice.svelte
@@ -1,17 +1,21 @@
+
-			TTS Settings
+			STT Settings
-				Speech Engine
+				Speech-to-Text Engine
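
The silence auto-stop added to MessageInput.svelte above combines MediaRecorder
with a Web Audio AnalyserNode: with minDecibels raised to -45, frequency bins
quieter than that floor read as zero, and roughly 3 s without any non-zero bin
stops the recorder. A minimal standalone sketch of the same technique, using
only browser APIs (recordUntilSilent is an illustrative name, not project code):

	const recordUntilSilent = async () => {
		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
		const recorder = new MediaRecorder(stream);
		const chunks = [];
		recorder.ondataavailable = (e) => chunks.push(e.data);

		// Analyser whose byte frequency data reads 0 for anything below -45 dB
		const ctx = new AudioContext();
		const analyser = ctx.createAnalyser();
		analyser.minDecibels = -45;
		ctx.createMediaStreamSource(stream).connect(analyser);

		const bins = new Uint8Array(analyser.frequencyBinCount);
		let lastSound = Date.now();

		return new Promise((resolve) => {
			recorder.onstop = () => {
				ctx.close();
				resolve(new Blob(chunks, { type: recorder.mimeType }));
			};
			recorder.start();
			const poll = () => {
				analyser.getByteFrequencyData(bins);
				if (bins.some((v) => v > 0)) lastSound = Date.now();
				// ~3 s with no audible bin ends the recording
				if (Date.now() - lastSound > 3000) return recorder.stop();
				requestAnimationFrame(poll);
			};
			requestAnimationFrame(poll);
		});
	};
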
@@ -155,6 +160,33 @@ {/if}
+
+
+			TTS Settings
+
+				Text-to-Speech Engine
+
+
+			Auto-playback response
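
The TTS side, configured above and consumed in the ResponseMessage.svelte hunks,
stores only a speaker name and resolves it against the browser's voice list at
playback time. A minimal sketch of that resolution path (Web Speech synthesis
APIs only; speakText is an illustrative name):

	const speakText = (text, speakerName) => {
		const voice = speechSynthesis.getVoices().find((v) => v.name === speakerName);
		const utterance = new SpeechSynthesisUtterance(text);
		if (voice) utterance.voice = voice; // otherwise the default voice is used
		speechSynthesis.speak(utterance);
	};

Note that getVoices() can return an empty list until voices are loaded, which is
why the component polls it on an interval (the getVoicesLoop cleared above).
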
@@ -177,7 +209,7 @@
- {#if engine === ''} + {#if TTSEngine === ''}
Set Voice
@@ -196,7 +228,7 @@
- {:else if engine === 'openai'} + {:else if TTSEngine === 'openai'}
Set Voice
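
On the Whisper path, the recorded chunks are wrapped into a File and posted to
the transcription endpoint via the project's transcribeAudio() helper, and the
response's .text becomes the prompt. blobToFile() itself is not shown in this
patch; one plausible implementation, assuming it only needs to wrap the Blob:

	// Hypothetical helper: wrap a recorded Blob in a named File for upload
	const blobToFile = (blob, fileName) => new File([blob], fileName, { type: blob.type });
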