forked from open-webui/open-webui

feat: whisper voice input

parent c6c69924d1
commit f245c62692

3 changed files with 219 additions and 76 deletions
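The commit adds a second voice-input path alongside the existing Web Speech API integration: when a speech-to-text engine is configured in settings.voice.STTEngine, the chat input records microphone audio with MediaRecorder, stops automatically after roughly three seconds of silence, and hands the result to a Whisper transcription helper. Silence is detected by polling a Web Audio AnalyserNode. A minimal sketch of that record-until-silence pattern (the function name is illustrative; the -45 dB floor and 3000 ms timeout mirror the diff, and MediaRecorder's actual output container is browser-chosen even though the code labels the blob audio/wav):

// Record microphone audio and resolve once ~3 s of silence has elapsed.
// recordUntilSilence is a hypothetical name; thresholds mirror the commit.
async function recordUntilSilence(): Promise<Blob> {
	const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
	const recorder = new MediaRecorder(stream);
	const chunks: Blob[] = [];
	recorder.ondataavailable = (e) => chunks.push(e.data);

	// Feed the stream into an analyser; bins quieter than minDecibels read as 0.
	const ctx = new AudioContext();
	const analyser = ctx.createAnalyser();
	analyser.minDecibels = -45;
	ctx.createMediaStreamSource(stream).connect(analyser);
	const bins = new Uint8Array(analyser.frequencyBinCount);

	return new Promise((resolve) => {
		recorder.onstop = () => {
			stream.getTracks().forEach((t) => t.stop()); // release the microphone
			ctx.close();
			resolve(new Blob(chunks, { type: 'audio/wav' })); // container is actually browser-chosen
		};
		recorder.start();

		let lastSoundTime = Date.now();
		const detectSound = () => {
			analyser.getByteFrequencyData(bins);
			if (bins.some((v) => v > 0)) lastSoundTime = Date.now();
			if (Date.now() - lastSoundTime > 3000) return recorder.stop();
			requestAnimationFrame(detectSound);
		};
		requestAnimationFrame(detectSound);
	});
}

Using getByteFrequencyData with a raised minDecibels floor means any nonzero bin counts as sound, which keeps the per-frame check cheap compared to computing RMS levels.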
[changed file 1 of 3]

@@ -35,7 +35,6 @@
 
 	export let fileUploadEnabled = true;
 	export let speechRecognitionEnabled = true;
-	export let speechRecognitionListening = false;
 
 	export let prompt = '';
 	export let messages = [];
@@ -51,62 +50,170 @@
 		}
 	}
 
+	let mediaRecorder;
+	let audioChunks = [];
+	let isRecording = false;
+	const MIN_DECIBELS = -45;
+
+	const startRecording = async () => {
+		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			isRecording = true;
+			console.log('Recording started');
+		};
+		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
+		mediaRecorder.onstop = async () => {
+			isRecording = false;
+			console.log('Recording stopped');
+
+			// Create a blob from the audio chunks
+			const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+
+			const file = blobToFile(audioBlob, 'recording.wav');
+
+			const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+				toast.error(error);
+				return null;
+			});
+
+			if (res) {
+				prompt = res.text;
+				await tick();
+
+				const inputElement = document.getElementById('chat-textarea');
+				inputElement?.focus();
+
+				if (prompt !== '' && $settings?.speechAutoSend === true) {
+					submitPrompt(prompt, user);
+				}
+			}
+
+			// saveRecording(audioBlob);
+			audioChunks = [];
+		};
+
+		// Start recording
+		mediaRecorder.start();
+
+		// Monitor silence
+		monitorSilence(stream);
+	};
+
+	const monitorSilence = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+		const domainData = new Uint8Array(bufferLength);
+
+		let lastSoundTime = Date.now();
+
+		const detectSound = () => {
+			analyser.getByteFrequencyData(domainData);
+
+			if (domainData.some((value) => value > 0)) {
+				lastSoundTime = Date.now();
+			}
+
+			if (isRecording && Date.now() - lastSoundTime > 3000) {
+				mediaRecorder.stop();
+				audioContext.close();
+				return;
+			}
+
+			window.requestAnimationFrame(detectSound);
+		};
+
+		window.requestAnimationFrame(detectSound);
+	};
+
+	const saveRecording = (blob) => {
+		const url = URL.createObjectURL(blob);
+		const a = document.createElement('a');
+		document.body.appendChild(a);
+		a.style = 'display: none';
+		a.href = url;
+		a.download = 'recording.wav';
+		a.click();
+		window.URL.revokeObjectURL(url);
+	};
+
 	const speechRecognitionHandler = () => {
 		// Check if SpeechRecognition is supported
 
-		if (speechRecognitionListening) {
-			speechRecognition.stop();
+		if (isRecording) {
+			if (speechRecognition) {
+				speechRecognition.stop();
+			}
+
+			if (mediaRecorder) {
+				mediaRecorder.stop();
+			}
 		} else {
-			if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
-				// Create a SpeechRecognition object
-				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+			isRecording = true;
 
-				// Set continuous to true for continuous recognition
-				speechRecognition.continuous = true;
-
-				// Set the timeout for turning off the recognition after inactivity (in milliseconds)
-				const inactivityTimeout = 3000; // 3 seconds
-
-				let timeoutId;
-				// Start recognition
-				speechRecognition.start();
-				speechRecognitionListening = true;
-
-				// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
-					// Clear the inactivity timeout
-					clearTimeout(timeoutId);
-
-					// Handle recognized speech
-					console.log(event);
-					const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
-					prompt = `${prompt}${transcript}`;
-
-					// Restart the inactivity timeout
-					timeoutId = setTimeout(() => {
-						console.log('Speech recognition turned off due to inactivity.');
-						speechRecognition.stop();
-					}, inactivityTimeout);
-				};
-
-				// Event triggered when recognition is ended
-				speechRecognition.onend = function () {
-					// Restart recognition after it ends
-					console.log('recognition ended');
-					speechRecognitionListening = false;
-					if (prompt !== '' && $settings?.speechAutoSend === true) {
-						submitPrompt(prompt, user);
-					}
-				};
-
-				// Event triggered when an error occurs
-				speechRecognition.onerror = function (event) {
-					console.log(event);
-					toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
-				};
+			if (($settings?.voice?.STTEngine ?? '') !== '') {
+				startRecording();
 			} else {
-				toast.error('SpeechRecognition API is not supported in this browser.');
+				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
+					// Create a SpeechRecognition object
+					speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+
+					// Set continuous to true for continuous recognition
+					speechRecognition.continuous = true;
+
+					// Set the timeout for turning off the recognition after inactivity (in milliseconds)
+					const inactivityTimeout = 3000; // 3 seconds
+
+					let timeoutId;
+					// Start recognition
+					speechRecognition.start();
+
+					// Event triggered when speech is recognized
+					speechRecognition.onresult = async (event) => {
+						// Clear the inactivity timeout
+						clearTimeout(timeoutId);
+
+						// Handle recognized speech
+						console.log(event);
+						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
+
+						prompt = `${prompt}${transcript}`;
+
+						await tick();
+						const inputElement = document.getElementById('chat-textarea');
+						inputElement?.focus();
+
+						// Restart the inactivity timeout
+						timeoutId = setTimeout(() => {
+							console.log('Speech recognition turned off due to inactivity.');
+							speechRecognition.stop();
+						}, inactivityTimeout);
+					};
+
+					// Event triggered when recognition is ended
+					speechRecognition.onend = function () {
+						console.log('recognition ended');
+						isRecording = false;
+						if (prompt !== '' && $settings?.speechAutoSend === true) {
+							submitPrompt(prompt, user);
+						}
+					};
+
+					// Event triggered when an error occurs
+					speechRecognition.onerror = function (event) {
+						console.log(event);
+						toast.error(`Speech recognition error: ${event.error}`);
+						isRecording = false;
+					};
+				} else {
+					toast.error('SpeechRecognition API is not supported in this browser.');
+				}
 			}
 		}
 	};
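blobToFile and transcribeAudio in the hunk above are helpers imported from elsewhere in the repo; their implementations are outside this diff. For context, a plausible sketch of both, assuming a multipart transcription endpoint that returns { text } (the path, error shape, and File wrapping are guesses, not the project's confirmed API):

// Hypothetical helper matching the call site above: wrap a Blob as a File.
const blobToFile = (blob: Blob, fileName: string): File =>
	new File([blob], fileName, { type: blob.type });

// Hypothetical sketch: POST the recorded file and return the transcript.
async function transcribeAudio(token: string, file: File): Promise<{ text: string }> {
	const body = new FormData();
	body.append('file', file);

	const res = await fetch('/audio/api/v1/transcriptions', {
		// assumed endpoint path
		method: 'POST',
		headers: { Authorization: `Bearer ${token}` },
		body
	});

	if (!res.ok) {
		throw await res.text(); // the caller toasts whatever it catches
	}
	return res.json(); // expected shape: { text: string }
}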
@@ -550,7 +657,7 @@
 								: ' pl-4'} rounded-xl resize-none h-[48px]"
 							placeholder={chatInputPlaceholder !== ''
 								? chatInputPlaceholder
-								: speechRecognitionListening
+								: isRecording
 								? 'Listening...'
 								: 'Send a message'}
 							bind:value={prompt}
@@ -659,6 +766,10 @@
 								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 								user = null;
 							}}
+							on:focus={(e) => {
+								e.target.style.height = '';
+								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
+							}}
 							on:paste={(e) => {
 								const clipboardData = e.clipboardData || window.clipboardData;
 
@@ -696,7 +807,7 @@
 											speechRecognitionHandler();
 										}}
 									>
-										{#if speechRecognitionListening}
+										{#if isRecording}
											<svg
 												class=" w-5 h-5 translate-y-[0.5px]"
 												fill="currentColor"
[changed file 2 of 3]
@@ -148,7 +148,7 @@
 		} else {
 			speaking = true;
 
-			if ($settings?.speech?.engine === 'openai') {
+			if ($settings?.voice?.TTSEngine === 'openai') {
 				loadingSpeech = true;
 
 				const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@@ -179,7 +179,7 @@
 				for (const [idx, sentence] of sentences.entries()) {
 					const res = await synthesizeOpenAISpeech(
 						localStorage.token,
-						$settings?.speech?.speaker,
+						$settings?.voice?.speaker,
 						sentence
 					).catch((error) => {
 						toast.error(error);
@@ -204,7 +204,7 @@
 						clearInterval(getVoicesLoop);
 
 						const voice =
-							voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+							voices?.filter((v) => v.name === $settings?.voice?.speaker)?.at(0) ?? undefined;
 
 						const speak = new SpeechSynthesisUtterance(message.content);
 
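As with transcribeAudio, synthesizeOpenAISpeech is defined outside this diff; the hunks above only rename the settings it reads (settings.speech becomes settings.voice). A sketch of the call under the same caveats, modeled on an OpenAI-style audio/speech API (the endpoint path and payload keys are assumptions):

// Hypothetical sketch: request TTS audio for one sentence.
async function synthesizeOpenAISpeech(
	token: string,
	speaker: string,
	text: string
): Promise<Response> {
	return fetch('/audio/api/v1/speech', {
		// assumed endpoint path
		method: 'POST',
		headers: {
			Authorization: `Bearer ${token}`,
			'Content-Type': 'application/json'
		},
		body: JSON.stringify({ input: text, voice: speaker }) // assumed payload keys
	});
}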
[changed file 3 of 3]
@@ -1,17 +1,21 @@
 <script lang="ts">
 	import { createEventDispatcher, onMount } from 'svelte';
+	import toast from 'svelte-french-toast';
 	const dispatch = createEventDispatcher();
 
 	export let saveSettings: Function;
 
+	// Voice
+
+	let STTEngines = ['', 'openai'];
+	let STTEngine = '';
+
 	let conversationMode = false;
 	let speechAutoSend = false;
 	let responseAutoPlayback = false;
 
-	let engines = ['', 'openai'];
-	let engine = '';
+	let TTSEngines = ['', 'openai'];
+	let TTSEngine = '';
 
 	let voices = [];
 	let speaker = '';
@@ -70,10 +74,11 @@
 		speechAutoSend = settings.speechAutoSend ?? false;
 		responseAutoPlayback = settings.responseAutoPlayback ?? false;
 
-		engine = settings?.speech?.engine ?? '';
-		speaker = settings?.speech?.speaker ?? '';
+		STTEngine = settings?.voice?.STTEngine ?? '';
+		TTSEngine = settings?.voice?.TTSEngine ?? '';
+		speaker = settings?.voice?.speaker ?? '';
 
-		if (engine === 'openai') {
+		if (TTSEngine === 'openai') {
 			getOpenAIVoices();
 		} else {
 			getWebAPIVoices();
@@ -85,37 +90,37 @@
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
 	on:submit|preventDefault={() => {
 		saveSettings({
-			speech: {
-				engine: engine !== '' ? engine : undefined,
+			voice: {
+				STTEngine: STTEngine !== '' ? STTEngine : undefined,
+				TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
 				speaker: speaker !== '' ? speaker : undefined
 			}
 		});
 		dispatch('save');
 	}}
 >
-	<div class=" space-y-3">
+	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
 		<div>
-			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+			<div class=" mb-1 text-sm font-medium">STT Settings</div>
 
 			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">Speech Engine</div>
+				<div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
 				<div class="flex items-center relative">
 					<select
 						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={engine}
+						bind:value={STTEngine}
 						placeholder="Select a mode"
 						on:change={(e) => {
-							if (e.target.value === 'openai') {
-								getOpenAIVoices();
-								speaker = 'alloy';
-							} else {
-								getWebAPIVoices();
-								speaker = '';
+							if (e.target.value !== '') {
+								navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
+									toast.error(`Permission denied when accessing microphone: ${err}`);
+									STTEngine = '';
+								});
 							}
 						}}
 					>
 						<option value="">Default (Web API)</option>
-						<option value="openai">Open AI</option>
+						<option value="whisper-local">Whisper (Local)</option>
 					</select>
 				</div>
 			</div>
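The on:change handler above calls getUserMedia purely as a permission probe: selecting a non-empty engine triggers the browser's microphone prompt, and a rejection resets the choice back to the default. One design note: the probe never stops the acquired tracks, so the microphone indicator may stay lit until the page releases the stream. A self-contained variant of the same check (illustrative, not part of the commit):

// Hypothetical helper: prompt for microphone access, then release it.
async function hasMicrophonePermission(): Promise<boolean> {
	try {
		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
		stream.getTracks().forEach((t) => t.stop()); // turn the mic indicator off again
		return true;
	} catch {
		return false;
	}
}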
@@ -155,6 +160,33 @@
 					{/if}
 				</button>
 			</div>
+		</div>
+
+		<div>
+			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+
+			<div class=" py-0.5 flex w-full justify-between">
+				<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
+				<div class="flex items-center relative">
+					<select
+						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+						bind:value={TTSEngine}
+						placeholder="Select a mode"
+						on:change={(e) => {
+							if (e.target.value === 'openai') {
+								getOpenAIVoices();
+								speaker = 'alloy';
+							} else {
+								getWebAPIVoices();
+								speaker = '';
+							}
+						}}
+					>
+						<option value="">Default (Web API)</option>
+						<option value="openai">Open AI</option>
+					</select>
+				</div>
+			</div>
 
 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">Auto-playback response</div>
@@ -177,7 +209,7 @@
 
 		<hr class=" dark:border-gray-700" />
 
-		{#if engine === ''}
+		{#if TTSEngine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
@@ -196,7 +228,7 @@
 					</div>
 				</div>
 			</div>
-		{:else if engine === 'openai'}
+		{:else if TTSEngine === 'openai'}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
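Across the three files, the commit converges on a single settings.voice object written by the save handler above. Its shape, as inferred from this diff (the interface name is illustrative, not part of the codebase):

// Fields set to '' in the form are saved as undefined, so absent values
// fall back to the browser defaults.
interface VoiceSettings {
	STTEngine?: string; // 'whisper-local' in the UI; undefined means Web Speech API
	TTSEngine?: string; // 'openai'; undefined means speechSynthesis (Web API)
	speaker?: string; // a Web API voice name, or an OpenAI voice such as 'alloy'
}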
Timothy J. Baek