feat: whisper voice input

2024-02-11 02:12:49 -08:00 · 2024-02-11 02:12:49 -08:00 · f245c62692
commit f245c62692
parent c6c69924d1
3 changed files with 219 additions and 76 deletions
--- a/src/lib/components/chat/MessageInput.svelte
+++ b/src/lib/components/chat/MessageInput.svelte
@ -35,7 +35,6 @@
 	export let fileUploadEnabled = true;
 	export let speechRecognitionEnabled = true;
 	export let speechRecognitionListening = false;
 	export let prompt = '';
 	export let messages = [];
@ -51,11 +50,114 @@
 		}
 	}
 	// MediaRecorder for the current voice capture; assigned in startRecording.
 	let mediaRecorder;
 	// Audio data pieces pushed by the recorder's ondataavailable handler.
 	let audioChunks = [];
 	// True while a voice capture is in progress; toggled by the recorder's onstart/onstop handlers.
 	let isRecording = false;
 	// Assigned to the AnalyserNode's minDecibels in monitorSilence.
 	const MIN_DECIBELS = -45;
 	/**
 	 * Capture microphone audio with a MediaRecorder, then transcribe it.
 	 *
 	 * When the recorder stops, the collected chunks are wrapped in a Blob, sent to
 	 * `transcribeAudio`, and the returned text replaces `prompt`. If the
 	 * `speechAutoSend` setting is enabled and the prompt is non-empty, it is
 	 * submitted right away.
 	 *
 	 * If microphone access fails, `isRecording` is reset, an error toast is shown
 	 * and the function returns without creating a recorder.
 	 *
 	 * @returns {Promise<void>} Resolves once recording has started (or access failed).
 	 */
 	const startRecording = async () => {
 		let stream;
 		try {
 			stream = await navigator.mediaDevices.getUserMedia({ audio: true });
 		} catch (error) {
 			// The caller may already have flagged recording as active; undo that so
 			// the UI does not stay stuck on "Listening...".
 			isRecording = false;
 			toast.error(`Permission denied when accessing microphone: ${error}`);
 			return;
 		}
 		mediaRecorder = new MediaRecorder(stream);
 		mediaRecorder.onstart = () => {
 			isRecording = true;
 			console.log('Recording started');
 		};
 		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
 		mediaRecorder.onstop = async () => {
 			isRecording = false;
 			console.log('Recording stopped');
 			// Release the microphone; otherwise the capture device stays open
 			// after the recording has ended.
 			stream.getTracks().forEach((track) => track.stop());
 			// Take ownership of the chunks before awaiting, so a recording started
 			// while transcription is in flight does not lose its data.
 			const recordedChunks = audioChunks;
 			audioChunks = [];
 			// Create a blob from the audio chunks
 			const audioBlob = new Blob(recordedChunks, { type: 'audio/wav' });
 			const file = blobToFile(audioBlob, 'recording.wav');
 			const res = await transcribeAudio(localStorage.token, file).catch((error) => {
 				toast.error(error);
 				return null;
 			});
 			if (res) {
 				prompt = res.text;
 				await tick();
 				const inputElement = document.getElementById('chat-textarea');
 				inputElement?.focus();
 				if (prompt !== '' && $settings?.speechAutoSend === true) {
 					submitPrompt(prompt, user);
 				}
 			}
 			// Debug aid: saveRecording(audioBlob) downloads the captured audio.
 		};
 		// Start recording
 		mediaRecorder.start();
 		// Monitor silence
 		monitorSilence(stream);
 	};
 	/**
 	 * Watch the microphone stream and stop the recorder after 3 seconds without sound.
 	 *
 	 * An AnalyserNode (with `minDecibels` set to MIN_DECIBELS) is polled once per
 	 * animation frame; any non-zero frequency bin counts as sound. The polling
 	 * loop ends, and the AudioContext is closed, either when silence triggers the
 	 * stop or when the recorder has already been stopped by something else.
 	 *
 	 * @param {MediaStream} stream - The stream being recorded.
 	 */
 	const monitorSilence = (stream) => {
 		const audioContext = new AudioContext();
 		const audioStreamSource = audioContext.createMediaStreamSource(stream);
 		const analyser = audioContext.createAnalyser();
 		analyser.minDecibels = MIN_DECIBELS;
 		audioStreamSource.connect(analyser);
 		const bufferLength = analyser.frequencyBinCount;
 		const domainData = new Uint8Array(bufferLength);
 		// Pin the recorder this monitor belongs to, so a later recording that
 		// replaces `mediaRecorder` is never stopped by this loop.
 		const recorder = mediaRecorder;
 		let lastSoundTime = Date.now();
 		const detectSound = () => {
 			// The recorder was stopped elsewhere (e.g. the user pressed the button):
 			// end the loop and free the AudioContext instead of polling forever.
 			if (recorder?.state === 'inactive') {
 				audioContext.close();
 				return;
 			}
 			analyser.getByteFrequencyData(domainData);
 			if (domainData.some((value) => value > 0)) {
 				lastSoundTime = Date.now();
 			}
 			if (isRecording && Date.now() - lastSoundTime > 3000) {
 				recorder.stop();
 				audioContext.close();
 				return;
 			}
 			window.requestAnimationFrame(detectSound);
 		};
 		window.requestAnimationFrame(detectSound);
 	};
 	/**
 	 * Download the given audio blob as `recording.wav`.
 	 *
 	 * A hidden anchor pointing at a temporary object URL is clicked
 	 * programmatically; the anchor and the URL are both cleaned up afterwards so
 	 * repeated calls do not accumulate DOM nodes.
 	 *
 	 * @param {Blob} blob - The recorded audio to save.
 	 */
 	const saveRecording = (blob) => {
 		const url = URL.createObjectURL(blob);
 		const a = document.createElement('a');
 		a.style.display = 'none';
 		a.href = url;
 		a.download = 'recording.wav';
 		document.body.appendChild(a);
 		a.click();
 		// Remove the temporary anchor; it is only needed for the click.
 		a.remove();
 		URL.revokeObjectURL(url);
 	};
 	const speechRecognitionHandler = () => {
 		// Check if SpeechRecognition is supported
-		if (speechRecognitionListening) {
+		if (isRecording) {
 			if (speechRecognition) {
 				speechRecognition.stop();
 			}
 			if (mediaRecorder) {
 				mediaRecorder.stop();
 			}
 		} else {
 			isRecording = true;
 			if ($settings?.voice?.STTEngine ?? '' !== '') {
 				startRecording();
 			} else {
 				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
 					// Create a SpeechRecognition object
@ -70,18 +172,22 @@
 					let timeoutId;
 					// Start recognition
 					speechRecognition.start();
 				speechRecognitionListening = true;
 					// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
+					speechRecognition.onresult = async (event) => {
 						// Clear the inactivity timeout
 						clearTimeout(timeoutId);
 						// Handle recognized speech
 						console.log(event);
 						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
 						prompt = `${prompt}${transcript}`;
 						await tick();
 						const inputElement = document.getElementById('chat-textarea');
 						inputElement?.focus();
 						// Restart the inactivity timeout
 						timeoutId = setTimeout(() => {
 							console.log('Speech recognition turned off due to inactivity.');
@ -93,7 +199,7 @@
 					speechRecognition.onend = function () {
 						// Restart recognition after it ends
 						console.log('recognition ended');
-					speechRecognitionListening = false;
+						isRecording = false;
 						if (prompt !== '' && $settings?.speechAutoSend === true) {
 							submitPrompt(prompt, user);
 						}
@ -103,12 +209,13 @@
 					speechRecognition.onerror = function (event) {
 						console.log(event);
 						toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
+						isRecording = false;
 					};
 				} else {
 					toast.error('SpeechRecognition API is not supported in this browser.');
 				}
 			}
 		}
 	};
 	const uploadDoc = async (file) => {
@ -550,7 +657,7 @@
 								: ' pl-4'} rounded-xl resize-none h-[48px]"
 							placeholder={chatInputPlaceholder !== ''
 								? chatInputPlaceholder
-								: speechRecognitionListening
+								: isRecording
 								? 'Listening...'
 								: 'Send a message'}
 							bind:value={prompt}
@ -659,6 +766,10 @@
 								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 								user = null;
 							}}
 							on:focus={(e) => {
 								e.target.style.height = '';
 								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 							}}
 							on:paste={(e) => {
 								const clipboardData = e.clipboardData || window.clipboardData;
@ -696,7 +807,7 @@
 											speechRecognitionHandler();
 										}}
 									>
-										{#if speechRecognitionListening}
+										{#if isRecording}
 											<svg
 												class=" w-5 h-5 translate-y-[0.5px]"
 												fill="currentColor"
--- a/src/lib/components/chat/Messages/ResponseMessage.svelte
+++ b/src/lib/components/chat/Messages/ResponseMessage.svelte
@ -148,7 +148,7 @@
 		} else {
 			speaking = true;
-			if ($settings?.speech?.engine === 'openai') {
+			if ($settings?.voice?.TTSEngine === 'openai') {
 				loadingSpeech = true;
 				const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@ -179,7 +179,7 @@
 				for (const [idx, sentence] of sentences.entries()) {
 					const res = await synthesizeOpenAISpeech(
 						localStorage.token,
-						$settings?.speech?.speaker,
+						$settings?.voice?.speaker,
 						sentence
 					).catch((error) => {
 						toast.error(error);
@ -204,7 +204,7 @@
 						clearInterval(getVoicesLoop);
 						const voice =
-							voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+							voices?.filter((v) => v.name === $settings?.voice?.speaker)?.at(0) ?? undefined;
 						const speak = new SpeechSynthesisUtterance(message.content);
--- a/src/lib/components/chat/Settings/Voice.svelte
+++ b/src/lib/components/chat/Settings/Voice.svelte
@ -1,17 +1,21 @@
 <script lang="ts">
 	import { createEventDispatcher, onMount } from 'svelte';
 	import toast from 'svelte-french-toast';
 	const dispatch = createEventDispatcher();
 	export let saveSettings: Function;
 	// Voice
 	let STTEngines = ['', 'openai'];
 	let STTEngine = '';
 	let conversationMode = false;
 	let speechAutoSend = false;
 	let responseAutoPlayback = false;
-	let engines = ['', 'openai'];
+	let TTSEngines = ['', 'openai'];
-	let engine = '';
+	let TTSEngine = '';
 	let voices = [];
 	let speaker = '';
@ -70,10 +74,11 @@
 		speechAutoSend = settings.speechAutoSend ?? false;
 		responseAutoPlayback = settings.responseAutoPlayback ?? false;
-		engine = settings?.speech?.engine ?? '';
+		STTEngine = settings?.voice?.STTEngine ?? '';
-		speaker = settings?.speech?.speaker ?? '';
+		TTSEngine = settings?.voice?.TTSEngine ?? '';
 		speaker = settings?.voice?.speaker ?? '';
-		if (engine === 'openai') {
+		if (TTSEngine === 'openai') {
 			getOpenAIVoices();
 		} else {
 			getWebAPIVoices();
@ -85,37 +90,37 @@
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
 	on:submit|preventDefault={() => {
 		saveSettings({
-			speech: {
+			voice: {
-				engine: engine !== '' ? engine : undefined,
+				STTEngine: STTEngine !== '' ? STTEngine : undefined,
 				TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
 				speaker: speaker !== '' ? speaker : undefined
 			}
 		});
 		dispatch('save');
 	}}
 >
-	<div class=" space-y-3">
+	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
 		<div>
-			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+			<div class=" mb-1 text-sm font-medium">STT Settings</div>
 			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">Speech Engine</div>
+				<div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
 				<div class="flex items-center relative">
 					<select
 						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={engine}
+						bind:value={STTEngine}
 						placeholder="Select a mode"
 						on:change={(e) => {
-							if (e.target.value === 'openai') {
+							if (e.target.value !== '') {
-								getOpenAIVoices();
+								navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
-								speaker = 'alloy';
+									toast.error(`Permission denied when accessing microphone: ${err}`);
-							} else {
+									STTEngine = '';
-								getWebAPIVoices();
+								});
 								speaker = '';
 							}
 						}}
 					>
 						<option value="">Default (Web API)</option>
-						<option value="openai">Open AI</option>
+						<option value="whisper-local">Whisper (Local)</option>
 					</select>
 				</div>
 			</div>
@ -155,6 +160,33 @@
 					{/if}
 				</button>
 			</div>
 		</div>
 		<div>
 			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
 				<div class="flex items-center relative">
 					<select
 						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
 						bind:value={TTSEngine}
 						placeholder="Select a mode"
 						on:change={(e) => {
 							if (e.target.value === 'openai') {
 								getOpenAIVoices();
 								speaker = 'alloy';
 							} else {
 								getWebAPIVoices();
 								speaker = '';
 							}
 						}}
 					>
 						<option value="">Default (Web API)</option>
 						<option value="openai">Open AI</option>
 					</select>
 				</div>
 			</div>
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">Auto-playback response</div>
@ -177,7 +209,7 @@
 		<hr class=" dark:border-gray-700" />
-		{#if engine === ''}
+		{#if TTSEngine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
@ -196,7 +228,7 @@
 					</div>
 				</div>
 			</div>
-		{:else if engine === 'openai'}
+		{:else if TTSEngine === 'openai'}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">