feat: whisper voice input

This commit is contained in:
Timothy J. Baek 2024-02-11 02:12:49 -08:00
parent c6c69924d1
commit f245c62692
3 changed files with 219 additions and 76 deletions

View file

@ -35,7 +35,6 @@
export let fileUploadEnabled = true;
export let speechRecognitionEnabled = true;
export let speechRecognitionListening = false;
export let prompt = '';
export let messages = [];
@ -51,62 +50,170 @@
}
}
let mediaRecorder;
let audioChunks = [];
let isRecording = false;
const MIN_DECIBELS = -45;
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.onstart = () => {
isRecording = true;
console.log('Recording started');
};
mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
mediaRecorder.onstop = async () => {
isRecording = false;
console.log('Recording stopped');
// Create a blob from the audio chunks
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
const file = blobToFile(audioBlob, 'recording.wav');
const res = await transcribeAudio(localStorage.token, file).catch((error) => {
toast.error(error);
return null;
});
if (res) {
prompt = res.text;
await tick();
const inputElement = document.getElementById('chat-textarea');
inputElement?.focus();
if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user);
}
}
// saveRecording(audioBlob);
audioChunks = [];
};
// Start recording
mediaRecorder.start();
// Monitor silence
monitorSilence(stream);
};
const monitorSilence = (stream) => {
const audioContext = new AudioContext();
const audioStreamSource = audioContext.createMediaStreamSource(stream);
const analyser = audioContext.createAnalyser();
analyser.minDecibels = MIN_DECIBELS;
audioStreamSource.connect(analyser);
const bufferLength = analyser.frequencyBinCount;
const domainData = new Uint8Array(bufferLength);
let lastSoundTime = Date.now();
const detectSound = () => {
analyser.getByteFrequencyData(domainData);
if (domainData.some((value) => value > 0)) {
lastSoundTime = Date.now();
}
if (isRecording && Date.now() - lastSoundTime > 3000) {
mediaRecorder.stop();
audioContext.close();
return;
}
window.requestAnimationFrame(detectSound);
};
window.requestAnimationFrame(detectSound);
};
const saveRecording = (blob) => {
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
document.body.appendChild(a);
a.style = 'display: none';
a.href = url;
a.download = 'recording.wav';
a.click();
window.URL.revokeObjectURL(url);
};
const speechRecognitionHandler = () => {
// Check if SpeechRecognition is supported
if (speechRecognitionListening) {
speechRecognition.stop();
if (isRecording) {
if (speechRecognition) {
speechRecognition.stop();
}
if (mediaRecorder) {
mediaRecorder.stop();
}
} else {
if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
// Create a SpeechRecognition object
speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
isRecording = true;
// Set continuous to true for continuous recognition
speechRecognition.continuous = true;
// Set the timeout for turning off the recognition after inactivity (in milliseconds)
const inactivityTimeout = 3000; // 3 seconds
let timeoutId;
// Start recognition
speechRecognition.start();
speechRecognitionListening = true;
// Event triggered when speech is recognized
speechRecognition.onresult = function (event) {
// Clear the inactivity timeout
clearTimeout(timeoutId);
// Handle recognized speech
console.log(event);
const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
prompt = `${prompt}${transcript}`;
// Restart the inactivity timeout
timeoutId = setTimeout(() => {
console.log('Speech recognition turned off due to inactivity.');
speechRecognition.stop();
}, inactivityTimeout);
};
// Event triggered when recognition is ended
speechRecognition.onend = function () {
// Restart recognition after it ends
console.log('recognition ended');
speechRecognitionListening = false;
if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user);
}
};
// Event triggered when an error occurs
speechRecognition.onerror = function (event) {
console.log(event);
toast.error(`Speech recognition error: ${event.error}`);
speechRecognitionListening = false;
};
if ($settings?.voice?.STTEngine ?? '' !== '') {
startRecording();
} else {
toast.error('SpeechRecognition API is not supported in this browser.');
if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
// Create a SpeechRecognition object
speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
// Set continuous to true for continuous recognition
speechRecognition.continuous = true;
// Set the timeout for turning off the recognition after inactivity (in milliseconds)
const inactivityTimeout = 3000; // 3 seconds
let timeoutId;
// Start recognition
speechRecognition.start();
// Event triggered when speech is recognized
speechRecognition.onresult = async (event) => {
// Clear the inactivity timeout
clearTimeout(timeoutId);
// Handle recognized speech
console.log(event);
const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
prompt = `${prompt}${transcript}`;
await tick();
const inputElement = document.getElementById('chat-textarea');
inputElement?.focus();
// Restart the inactivity timeout
timeoutId = setTimeout(() => {
console.log('Speech recognition turned off due to inactivity.');
speechRecognition.stop();
}, inactivityTimeout);
};
// Event triggered when recognition is ended
speechRecognition.onend = function () {
// Restart recognition after it ends
console.log('recognition ended');
isRecording = false;
if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user);
}
};
// Event triggered when an error occurs
speechRecognition.onerror = function (event) {
console.log(event);
toast.error(`Speech recognition error: ${event.error}`);
isRecording = false;
};
} else {
toast.error('SpeechRecognition API is not supported in this browser.');
}
}
}
};
@ -550,7 +657,7 @@
: ' pl-4'} rounded-xl resize-none h-[48px]"
placeholder={chatInputPlaceholder !== ''
? chatInputPlaceholder
: speechRecognitionListening
: isRecording
? 'Listening...'
: 'Send a message'}
bind:value={prompt}
@ -659,6 +766,10 @@
e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
user = null;
}}
on:focus={(e) => {
e.target.style.height = '';
e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
}}
on:paste={(e) => {
const clipboardData = e.clipboardData || window.clipboardData;
@ -696,7 +807,7 @@
speechRecognitionHandler();
}}
>
{#if speechRecognitionListening}
{#if isRecording}
<svg
class=" w-5 h-5 translate-y-[0.5px]"
fill="currentColor"

View file

@ -148,7 +148,7 @@
} else {
speaking = true;
if ($settings?.speech?.engine === 'openai') {
if ($settings?.voice?.TTSEngine === 'openai') {
loadingSpeech = true;
const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@ -179,7 +179,7 @@
for (const [idx, sentence] of sentences.entries()) {
const res = await synthesizeOpenAISpeech(
localStorage.token,
$settings?.speech?.speaker,
$settings?.voice?.speaker,
sentence
).catch((error) => {
toast.error(error);
@ -204,7 +204,7 @@
clearInterval(getVoicesLoop);
const voice =
voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
voices?.filter((v) => v.name === $settings?.voice?.speaker)?.at(0) ?? undefined;
const speak = new SpeechSynthesisUtterance(message.content);

View file

@ -1,17 +1,21 @@
<script lang="ts">
import { createEventDispatcher, onMount } from 'svelte';
import toast from 'svelte-french-toast';
const dispatch = createEventDispatcher();
export let saveSettings: Function;
// Voice
let STTEngines = ['', 'openai'];
let STTEngine = '';
let conversationMode = false;
let speechAutoSend = false;
let responseAutoPlayback = false;
let engines = ['', 'openai'];
let engine = '';
let TTSEngines = ['', 'openai'];
let TTSEngine = '';
let voices = [];
let speaker = '';
@ -70,10 +74,11 @@
speechAutoSend = settings.speechAutoSend ?? false;
responseAutoPlayback = settings.responseAutoPlayback ?? false;
engine = settings?.speech?.engine ?? '';
speaker = settings?.speech?.speaker ?? '';
STTEngine = settings?.voice?.STTEngine ?? '';
TTSEngine = settings?.voice?.TTSEngine ?? '';
speaker = settings?.voice?.speaker ?? '';
if (engine === 'openai') {
if (TTSEngine === 'openai') {
getOpenAIVoices();
} else {
getWebAPIVoices();
@ -85,37 +90,37 @@
class="flex flex-col h-full justify-between space-y-3 text-sm"
on:submit|preventDefault={() => {
saveSettings({
speech: {
engine: engine !== '' ? engine : undefined,
voice: {
STTEngine: STTEngine !== '' ? STTEngine : undefined,
TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
speaker: speaker !== '' ? speaker : undefined
}
});
dispatch('save');
}}
>
<div class=" space-y-3">
<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
<div>
<div class=" mb-1 text-sm font-medium">TTS Settings</div>
<div class=" mb-1 text-sm font-medium">STT Settings</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Speech Engine</div>
<div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
<div class="flex items-center relative">
<select
class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={engine}
bind:value={STTEngine}
placeholder="Select a mode"
on:change={(e) => {
if (e.target.value === 'openai') {
getOpenAIVoices();
speaker = 'alloy';
} else {
getWebAPIVoices();
speaker = '';
if (e.target.value !== '') {
navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
toast.error(`Permission denied when accessing microphone: ${err}`);
STTEngine = '';
});
}
}}
>
<option value="">Default (Web API)</option>
<option value="openai">Open AI</option>
<option value="whisper-local">Whisper (Local)</option>
</select>
</div>
</div>
@ -155,6 +160,33 @@
{/if}
</button>
</div>
</div>
<div>
<div class=" mb-1 text-sm font-medium">TTS Settings</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
<div class="flex items-center relative">
<select
class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngine}
placeholder="Select a mode"
on:change={(e) => {
if (e.target.value === 'openai') {
getOpenAIVoices();
speaker = 'alloy';
} else {
getWebAPIVoices();
speaker = '';
}
}}
>
<option value="">Default (Web API)</option>
<option value="openai">Open AI</option>
</select>
</div>
</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Auto-playback response</div>
@ -177,7 +209,7 @@
<hr class=" dark:border-gray-700" />
{#if engine === ''}
{#if TTSEngine === ''}
<div>
<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
<div class="flex w-full">
@ -196,7 +228,7 @@
</div>
</div>
</div>
{:else if engine === 'openai'}
{:else if TTSEngine === 'openai'}
<div>
<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
<div class="flex w-full">