feat: whisper voice input

This commit is contained in:
Timothy J. Baek 2024-02-11 02:12:49 -08:00
parent c6c69924d1
commit f245c62692
3 changed files with 219 additions and 76 deletions

View file

@ -35,7 +35,6 @@
export let fileUploadEnabled = true; export let fileUploadEnabled = true;
export let speechRecognitionEnabled = true; export let speechRecognitionEnabled = true;
export let speechRecognitionListening = false;
export let prompt = ''; export let prompt = '';
export let messages = []; export let messages = [];
@ -51,11 +50,114 @@
} }
} }
// MediaRecorder for the current capture; assigned by startRecording.
let mediaRecorder;
// Data chunks pushed by the recorder's `dataavailable` handler until the recording is transcribed.
let audioChunks = [];
// True while voice input is active; drives the "Listening..." placeholder and the mic button icon.
let isRecording = false;
// Assigned to the analyser's `minDecibels` in monitorSilence (the lower bound of the dB range used
// when scaling frequency data), so quieter input reads as 0 and counts as silence.
const MIN_DECIBELS = -45;
/**
 * Capture microphone audio with MediaRecorder and, once the recording stops,
 * send it for transcription and place the resulting text into `prompt`.
 *
 * The recording ends either when the user toggles voice input off or when
 * `monitorSilence` detects a period of silence. When `speechAutoSend` is
 * enabled and the transcription is non-empty, the prompt is submitted
 * automatically.
 *
 * @returns {Promise<void>} Resolves once the recorder has been started, or
 *   immediately after reporting an error when the microphone is unavailable.
 */
const startRecording = async () => {
	let stream;
	try {
		stream = await navigator.mediaDevices.getUserMedia({ audio: true });
	} catch (error) {
		// The caller flips `isRecording` on before invoking us; undo that so the
		// UI does not stay stuck in the "Listening..." state.
		isRecording = false;
		toast.error(`Permission denied when accessing microphone: ${error}`);
		return;
	}

	mediaRecorder = new MediaRecorder(stream);

	mediaRecorder.onstart = () => {
		isRecording = true;
		console.log('Recording started');
	};

	mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);

	mediaRecorder.onstop = async () => {
		isRecording = false;
		console.log('Recording stopped');

		// Release the microphone; otherwise the device (and the browser's
		// recording indicator) stays active after the recording has ended.
		stream.getTracks().forEach((track) => track.stop());

		// Take ownership of the recorded chunks and reset the shared buffer
		// right away, so a recording started while the transcription request
		// is still in flight cannot have its data wiped afterwards.
		const recordedChunks = audioChunks;
		audioChunks = [];

		// NOTE(review): the blob is labelled audio/wav, but MediaRecorder is not
		// asked for a specific mimeType here — confirm the backend accepts the
		// browser's default container.
		const audioBlob = new Blob(recordedChunks, { type: 'audio/wav' });
		const file = blobToFile(audioBlob, 'recording.wav');

		const res = await transcribeAudio(localStorage.token, file).catch((error) => {
			toast.error(error);
			return null;
		});

		if (res) {
			prompt = res.text;

			// Wait for the DOM to reflect the new prompt before focusing the input.
			await tick();
			const inputElement = document.getElementById('chat-textarea');
			inputElement?.focus();

			if (prompt !== '' && $settings?.speechAutoSend === true) {
				submitPrompt(prompt, user);
			}
		}
	};

	// Start recording
	mediaRecorder.start();

	// Stop automatically once the user has been silent for a while
	monitorSilence(stream);
};
/**
 * Watch the audio level of `stream` and stop the active recording after a
 * sustained period of silence.
 *
 * The recorder that is current when this function is called is captured, and
 * the polling loop ends (closing its AudioContext) as soon as that recorder is
 * no longer recording — whether this function stopped it or something else did.
 *
 * @param {MediaStream} stream - The microphone stream being recorded.
 */
const monitorSilence = (stream) => {
	// How long the input must stay silent before the recording is stopped.
	const SILENCE_TIMEOUT_MS = 3000;

	// Bind to the recorder that owns this stream so a leftover loop can never
	// act on a recorder created by a later recording session.
	const recorder = mediaRecorder;

	const audioContext = new AudioContext();
	const audioStreamSource = audioContext.createMediaStreamSource(stream);
	const analyser = audioContext.createAnalyser();
	analyser.minDecibels = MIN_DECIBELS;
	audioStreamSource.connect(analyser);

	const domainData = new Uint8Array(analyser.frequencyBinCount);
	let lastSoundTime = Date.now();

	const detectSound = () => {
		// The recording was ended elsewhere (e.g. the user toggled voice input
		// off): release the audio context and stop polling instead of looping
		// forever.
		if (recorder?.state !== 'recording') {
			audioContext.close();
			return;
		}

		analyser.getByteFrequencyData(domainData);

		// Any non-zero bin means there is signal above the minDecibels floor.
		if (domainData.some((value) => value > 0)) {
			lastSoundTime = Date.now();
		}

		if (Date.now() - lastSoundTime > SILENCE_TIMEOUT_MS) {
			recorder.stop();
			audioContext.close();
			return;
		}

		window.requestAnimationFrame(detectSound);
	};

	window.requestAnimationFrame(detectSound);
};
/**
 * Trigger a browser download of `blob` as `recording.wav` (debugging aid).
 *
 * A temporary hidden anchor is used to start the download; both the anchor
 * and the object URL are cleaned up afterwards so repeated calls do not leak
 * DOM nodes or blob URLs.
 *
 * @param {Blob} blob - The recorded audio to save.
 */
const saveRecording = (blob) => {
	const url = URL.createObjectURL(blob);
	const a = document.createElement('a');
	a.style.display = 'none';
	a.href = url;
	a.download = 'recording.wav';
	document.body.appendChild(a);

	try {
		a.click();
	} finally {
		// Always detach the helper element and release the object URL.
		a.remove();
		URL.revokeObjectURL(url);
	}
};
const speechRecognitionHandler = () => { const speechRecognitionHandler = () => {
// Check if SpeechRecognition is supported // Check if SpeechRecognition is supported
if (speechRecognitionListening) { if (isRecording) {
if (speechRecognition) {
speechRecognition.stop(); speechRecognition.stop();
}
if (mediaRecorder) {
mediaRecorder.stop();
}
} else {
isRecording = true;
if ($settings?.voice?.STTEngine ?? '' !== '') {
startRecording();
} else { } else {
if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) { if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
// Create a SpeechRecognition object // Create a SpeechRecognition object
@ -70,18 +172,22 @@
let timeoutId; let timeoutId;
// Start recognition // Start recognition
speechRecognition.start(); speechRecognition.start();
speechRecognitionListening = true;
// Event triggered when speech is recognized // Event triggered when speech is recognized
speechRecognition.onresult = function (event) { speechRecognition.onresult = async (event) => {
// Clear the inactivity timeout // Clear the inactivity timeout
clearTimeout(timeoutId); clearTimeout(timeoutId);
// Handle recognized speech // Handle recognized speech
console.log(event); console.log(event);
const transcript = event.results[Object.keys(event.results).length - 1][0].transcript; const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
prompt = `${prompt}${transcript}`; prompt = `${prompt}${transcript}`;
await tick();
const inputElement = document.getElementById('chat-textarea');
inputElement?.focus();
// Restart the inactivity timeout // Restart the inactivity timeout
timeoutId = setTimeout(() => { timeoutId = setTimeout(() => {
console.log('Speech recognition turned off due to inactivity.'); console.log('Speech recognition turned off due to inactivity.');
@ -93,7 +199,7 @@
speechRecognition.onend = function () { speechRecognition.onend = function () {
// Restart recognition after it ends // Restart recognition after it ends
console.log('recognition ended'); console.log('recognition ended');
speechRecognitionListening = false; isRecording = false;
if (prompt !== '' && $settings?.speechAutoSend === true) { if (prompt !== '' && $settings?.speechAutoSend === true) {
submitPrompt(prompt, user); submitPrompt(prompt, user);
} }
@ -103,12 +209,13 @@
speechRecognition.onerror = function (event) { speechRecognition.onerror = function (event) {
console.log(event); console.log(event);
toast.error(`Speech recognition error: ${event.error}`); toast.error(`Speech recognition error: ${event.error}`);
speechRecognitionListening = false; isRecording = false;
}; };
} else { } else {
toast.error('SpeechRecognition API is not supported in this browser.'); toast.error('SpeechRecognition API is not supported in this browser.');
} }
} }
}
}; };
const uploadDoc = async (file) => { const uploadDoc = async (file) => {
@ -550,7 +657,7 @@
: ' pl-4'} rounded-xl resize-none h-[48px]" : ' pl-4'} rounded-xl resize-none h-[48px]"
placeholder={chatInputPlaceholder !== '' placeholder={chatInputPlaceholder !== ''
? chatInputPlaceholder ? chatInputPlaceholder
: speechRecognitionListening : isRecording
? 'Listening...' ? 'Listening...'
: 'Send a message'} : 'Send a message'}
bind:value={prompt} bind:value={prompt}
@ -659,6 +766,10 @@
e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px'; e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
user = null; user = null;
}} }}
on:focus={(e) => {
e.target.style.height = '';
e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
}}
on:paste={(e) => { on:paste={(e) => {
const clipboardData = e.clipboardData || window.clipboardData; const clipboardData = e.clipboardData || window.clipboardData;
@ -696,7 +807,7 @@
speechRecognitionHandler(); speechRecognitionHandler();
}} }}
> >
{#if speechRecognitionListening} {#if isRecording}
<svg <svg
class=" w-5 h-5 translate-y-[0.5px]" class=" w-5 h-5 translate-y-[0.5px]"
fill="currentColor" fill="currentColor"

View file

@ -148,7 +148,7 @@
} else { } else {
speaking = true; speaking = true;
if ($settings?.speech?.engine === 'openai') { if ($settings?.voice?.TTSEngine === 'openai') {
loadingSpeech = true; loadingSpeech = true;
const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => { const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@ -179,7 +179,7 @@
for (const [idx, sentence] of sentences.entries()) { for (const [idx, sentence] of sentences.entries()) {
const res = await synthesizeOpenAISpeech( const res = await synthesizeOpenAISpeech(
localStorage.token, localStorage.token,
$settings?.speech?.speaker, $settings?.voice?.speaker,
sentence sentence
).catch((error) => { ).catch((error) => {
toast.error(error); toast.error(error);
@ -204,7 +204,7 @@
clearInterval(getVoicesLoop); clearInterval(getVoicesLoop);
const voice = const voice =
voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined; voices?.filter((v) => v.name === $settings?.voice?.speaker)?.at(0) ?? undefined;
const speak = new SpeechSynthesisUtterance(message.content); const speak = new SpeechSynthesisUtterance(message.content);

View file

@ -1,17 +1,21 @@
<script lang="ts"> <script lang="ts">
import { createEventDispatcher, onMount } from 'svelte'; import { createEventDispatcher, onMount } from 'svelte';
import toast from 'svelte-french-toast';
const dispatch = createEventDispatcher(); const dispatch = createEventDispatcher();
export let saveSettings: Function; export let saveSettings: Function;
// Voice // Voice
let STTEngines = ['', 'openai'];
let STTEngine = '';
let conversationMode = false; let conversationMode = false;
let speechAutoSend = false; let speechAutoSend = false;
let responseAutoPlayback = false; let responseAutoPlayback = false;
let engines = ['', 'openai']; let TTSEngines = ['', 'openai'];
let engine = ''; let TTSEngine = '';
let voices = []; let voices = [];
let speaker = ''; let speaker = '';
@ -70,10 +74,11 @@
speechAutoSend = settings.speechAutoSend ?? false; speechAutoSend = settings.speechAutoSend ?? false;
responseAutoPlayback = settings.responseAutoPlayback ?? false; responseAutoPlayback = settings.responseAutoPlayback ?? false;
engine = settings?.speech?.engine ?? ''; STTEngine = settings?.voice?.STTEngine ?? '';
speaker = settings?.speech?.speaker ?? ''; TTSEngine = settings?.voice?.TTSEngine ?? '';
speaker = settings?.voice?.speaker ?? '';
if (engine === 'openai') { if (TTSEngine === 'openai') {
getOpenAIVoices(); getOpenAIVoices();
} else { } else {
getWebAPIVoices(); getWebAPIVoices();
@ -85,37 +90,37 @@
class="flex flex-col h-full justify-between space-y-3 text-sm" class="flex flex-col h-full justify-between space-y-3 text-sm"
on:submit|preventDefault={() => { on:submit|preventDefault={() => {
saveSettings({ saveSettings({
speech: { voice: {
engine: engine !== '' ? engine : undefined, STTEngine: STTEngine !== '' ? STTEngine : undefined,
TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
speaker: speaker !== '' ? speaker : undefined speaker: speaker !== '' ? speaker : undefined
} }
}); });
dispatch('save'); dispatch('save');
}} }}
> >
<div class=" space-y-3"> <div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
<div> <div>
<div class=" mb-1 text-sm font-medium">TTS Settings</div> <div class=" mb-1 text-sm font-medium">STT Settings</div>
<div class=" py-0.5 flex w-full justify-between"> <div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Speech Engine</div> <div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
<div class="flex items-center relative"> <div class="flex items-center relative">
<select <select
class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right" class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={engine} bind:value={STTEngine}
placeholder="Select a mode" placeholder="Select a mode"
on:change={(e) => { on:change={(e) => {
if (e.target.value === 'openai') { if (e.target.value !== '') {
getOpenAIVoices(); navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
speaker = 'alloy'; toast.error(`Permission denied when accessing microphone: ${err}`);
} else { STTEngine = '';
getWebAPIVoices(); });
speaker = '';
} }
}} }}
> >
<option value="">Default (Web API)</option> <option value="">Default (Web API)</option>
<option value="openai">Open AI</option> <option value="whisper-local">Whisper (Local)</option>
</select> </select>
</div> </div>
</div> </div>
@ -155,6 +160,33 @@
{/if} {/if}
</button> </button>
</div> </div>
</div>
<div>
<div class=" mb-1 text-sm font-medium">TTS Settings</div>
<div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
<div class="flex items-center relative">
<select
class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
bind:value={TTSEngine}
placeholder="Select a mode"
on:change={(e) => {
if (e.target.value === 'openai') {
getOpenAIVoices();
speaker = 'alloy';
} else {
getWebAPIVoices();
speaker = '';
}
}}
>
<option value="">Default (Web API)</option>
<option value="openai">Open AI</option>
</select>
</div>
</div>
<div class=" py-0.5 flex w-full justify-between"> <div class=" py-0.5 flex w-full justify-between">
<div class=" self-center text-xs font-medium">Auto-playback response</div> <div class=" self-center text-xs font-medium">Auto-playback response</div>
@ -177,7 +209,7 @@
<hr class=" dark:border-gray-700" /> <hr class=" dark:border-gray-700" />
{#if engine === ''} {#if TTSEngine === ''}
<div> <div>
<div class=" mb-2.5 text-sm font-medium">Set Voice</div> <div class=" mb-2.5 text-sm font-medium">Set Voice</div>
<div class="flex w-full"> <div class="flex w-full">
@ -196,7 +228,7 @@
</div> </div>
</div> </div>
</div> </div>
{:else if engine === 'openai'} {:else if TTSEngine === 'openai'}
<div> <div>
<div class=" mb-2.5 text-sm font-medium">Set Voice</div> <div class=" mb-2.5 text-sm font-medium">Set Voice</div>
<div class="flex w-full"> <div class="flex w-full">