forked from open-webui/open-webui

Merge pull request #707 from ollama-webui/whisper

feat: whisper support

commit e1a6ccd1aa by Timothy Jaeryang Baek
11 changed files with 374 additions and 85 deletions

backend/apps/audio/main.py (new file, +80)
@@ -0,0 +1,80 @@
+from fastapi import (
+    FastAPI,
+    Request,
+    Depends,
+    HTTPException,
+    status,
+    UploadFile,
+    File,
+    Form,
+)
+from fastapi.middleware.cors import CORSMiddleware
+from faster_whisper import WhisperModel
+
+from constants import ERROR_MESSAGES
+from utils.utils import (
+    decode_token,
+    get_current_user,
+    get_verified_user,
+    get_admin_user,
+)
+from utils.misc import calculate_sha256
+
+from config import CACHE_DIR, UPLOAD_DIR, WHISPER_MODEL_NAME
+
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.post("/transcribe")
+def transcribe(
+    file: UploadFile = File(...),
+    user=Depends(get_current_user),
+):
+    print(file.content_type)
+
+    if file.content_type not in ["audio/mpeg", "audio/wav"]:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
+        )
+
+    try:
+        filename = file.filename
+        file_path = f"{UPLOAD_DIR}/{filename}"
+        contents = file.file.read()
+        with open(file_path, "wb") as f:
+            f.write(contents)
+            f.close()
+
+        model_name = WHISPER_MODEL_NAME
+        model = WhisperModel(
+            model_name,
+            device="cpu",
+            compute_type="int8",
+            download_root=f"{CACHE_DIR}/whisper/models",
+        )
+
+        segments, info = model.transcribe(file_path, beam_size=5)
+        print(
+            "Detected language '%s' with probability %f"
+            % (info.language, info.language_probability)
+        )
+
+        transcript = "".join([segment.text for segment in list(segments)])
+
+        return {"text": transcript.strip()}
+
+    except Exception as e:
+        print(e)
+
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.DEFAULT(e),
+        )
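Note: with the router above mounted at /audio/api/v1 (see the backend/main.py hunk further down), the new endpoint can be exercised directly. A minimal sketch in Python, assuming a local deployment on port 8080; the base URL, token, and file name are placeholders, and requests is not a project dependency:

    import requests

    BASE_URL = "http://localhost:8080"  # assumption: default local deployment
    TOKEN = "<your-session-jwt>"        # placeholder: any valid bearer token

    # the handler rejects anything that is not audio/mpeg or audio/wav
    with open("recording.wav", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/audio/api/v1/transcribe",
            headers={"Authorization": f"Bearer {TOKEN}"},
            files={"file": ("recording.wav", f, "audio/wav")},
        )

    resp.raise_for_status()
    print(resp.json()["text"])  # the endpoint returns {"text": "..."}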
backend/config.py

@@ -132,3 +132,8 @@ CHROMA_CLIENT = chromadb.PersistentClient(
 )
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 100
+
+####################################
+# Transcribe
+####################################
+WHISPER_MODEL_NAME = "base"
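Note: WHISPER_MODEL_NAME is handed straight to faster-whisper, so the other model sizes that library downloads by name (for example "tiny", "small", "medium") should work in place of "base". A standalone sketch of what the handler does with it, using the same parameters; the audio path is a placeholder:

    from faster_whisper import WhisperModel

    # same settings as the /transcribe handler above
    model = WhisperModel("base", device="cpu", compute_type="int8")

    segments, info = model.transcribe("recording.wav", beam_size=5)
    print(f"Detected language '{info.language}' with probability {info.language_probability}")

    # segments is a lazy generator; joining it drives the actual decoding
    print("".join(segment.text for segment in segments).strip())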
backend/main.py

@@ -10,6 +10,8 @@ from starlette.exceptions import HTTPException as StarletteHTTPException

 from apps.ollama.main import app as ollama_app
 from apps.openai.main import app as openai_app
+from apps.audio.main import app as audio_app
+

 from apps.web.main import app as webui_app
 from apps.rag.main import app as rag_app
@@ -55,6 +57,8 @@ app.mount("/api/v1", webui_app)

 app.mount("/ollama/api", ollama_app)
 app.mount("/openai/api", openai_app)
+
+app.mount("/audio/api/v1", audio_app)
 app.mount("/rag/api/v1", rag_app)

backend/requirements.txt

@@ -30,6 +30,8 @@ openpyxl
 pyxlsb
 xlrd

+faster-whisper
+
 PyJWT
 pyjwt[crypto]

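Note: for existing installs, the new backend dependency can be pulled in with pip install faster-whisper before restarting; the requirements file above remains the authoritative list.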
							
								
								
									
src/lib/apis/audio/index.ts (new file, +31)
@@ -0,0 +1,31 @@
+import { AUDIO_API_BASE_URL } from '$lib/constants';
+
+export const transcribeAudio = async (token: string, file: File) => {
+	const data = new FormData();
+	data.append('file', file);
+
+	let error = null;
+	const res = await fetch(`${AUDIO_API_BASE_URL}/transcribe`, {
+		method: 'POST',
+		headers: {
+			Accept: 'application/json',
+			authorization: `Bearer ${token}`
+		},
+		body: data
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			error = err.detail;
+			console.log(err);
+			return null;
+		});
+
+	if (error) {
+		throw error;
+	}
+
+	return res;
+};
src/lib/components/chat/MessageInput.svelte

@@ -2,7 +2,7 @@
 	import toast from 'svelte-french-toast';
 	import { onMount, tick } from 'svelte';
 	import { settings } from '$lib/stores';
-	import { calculateSHA256, findWordIndices } from '$lib/utils';
+	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';

 	import Prompts from './MessageInput/PromptCommands.svelte';
 	import Suggestions from './MessageInput/Suggestions.svelte';
@@ -11,6 +11,7 @@
 	import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
 	import Documents from './MessageInput/Documents.svelte';
 	import Models from './MessageInput/Models.svelte';
+	import { transcribeAudio } from '$lib/apis/audio';

 	export let submitPrompt: Function;
 	export let stopResponse: Function;
@@ -34,7 +35,6 @@

 	export let fileUploadEnabled = true;
 	export let speechRecognitionEnabled = true;
-	export let speechRecognitionListening = false;

 	export let prompt = '';
 	export let messages = [];
@@ -50,62 +50,170 @@
 		}
 	}

+	let mediaRecorder;
+	let audioChunks = [];
+	let isRecording = false;
+	const MIN_DECIBELS = -45;
+
+	const startRecording = async () => {
+		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			isRecording = true;
+			console.log('Recording started');
+		};
+		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
+		mediaRecorder.onstop = async () => {
+			isRecording = false;
+			console.log('Recording stopped');
+
+			// Create a blob from the audio chunks
+			const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+
+			const file = blobToFile(audioBlob, 'recording.wav');
+
+			const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+				toast.error(error);
+				return null;
+			});
+
+			if (res) {
+				prompt = res.text;
+				await tick();
+
+				const inputElement = document.getElementById('chat-textarea');
+				inputElement?.focus();
+
+				if (prompt !== '' && $settings?.speechAutoSend === true) {
+					submitPrompt(prompt, user);
+				}
+			}
+
+			// saveRecording(audioBlob);
+			audioChunks = [];
+		};
+
+		// Start recording
+		mediaRecorder.start();
+
+		// Monitor silence
+		monitorSilence(stream);
+	};
+
+	const monitorSilence = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+		const domainData = new Uint8Array(bufferLength);
+
+		let lastSoundTime = Date.now();
+
+		const detectSound = () => {
+			analyser.getByteFrequencyData(domainData);
+
+			if (domainData.some((value) => value > 0)) {
+				lastSoundTime = Date.now();
+			}
+
+			if (isRecording && Date.now() - lastSoundTime > 3000) {
+				mediaRecorder.stop();
+				audioContext.close();
+				return;
+			}
+
+			window.requestAnimationFrame(detectSound);
+		};
+
+		window.requestAnimationFrame(detectSound);
+	};
+
+	const saveRecording = (blob) => {
+		const url = URL.createObjectURL(blob);
+		const a = document.createElement('a');
+		document.body.appendChild(a);
+		a.style = 'display: none';
+		a.href = url;
+		a.download = 'recording.wav';
+		a.click();
+		window.URL.revokeObjectURL(url);
+	};
+
 	const speechRecognitionHandler = () => {
 		// Check if SpeechRecognition is supported

-		if (speechRecognitionListening) {
-			speechRecognition.stop();
+		if (isRecording) {
+			if (speechRecognition) {
+				speechRecognition.stop();
+			}
+
+			if (mediaRecorder) {
+				mediaRecorder.stop();
+			}
 		} else {
-			if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
-				// Create a SpeechRecognition object
-				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+			isRecording = true;

-				// Set continuous to true for continuous recognition
-				speechRecognition.continuous = true;
-
-				// Set the timeout for turning off the recognition after inactivity (in milliseconds)
-				const inactivityTimeout = 3000; // 3 seconds
-
-				let timeoutId;
-				// Start recognition
-				speechRecognition.start();
-				speechRecognitionListening = true;
-
-				// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
-					// Clear the inactivity timeout
-					clearTimeout(timeoutId);
-
-					// Handle recognized speech
-					console.log(event);
-					const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
-					prompt = `${prompt}${transcript}`;
-
-					// Restart the inactivity timeout
-					timeoutId = setTimeout(() => {
-						console.log('Speech recognition turned off due to inactivity.');
-						speechRecognition.stop();
-					}, inactivityTimeout);
-				};
-
-				// Event triggered when recognition is ended
-				speechRecognition.onend = function () {
-					// Restart recognition after it ends
-					console.log('recognition ended');
-					speechRecognitionListening = false;
-					if (prompt !== '' && $settings?.speechAutoSend === true) {
-						submitPrompt(prompt, user);
-					}
-				};
-
-				// Event triggered when an error occurs
-				speechRecognition.onerror = function (event) {
-					console.log(event);
-					toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
-				};
+			if ($settings?.voice?.STTEngine ?? '' !== '') {
+				startRecording();
 			} else {
-				toast.error('SpeechRecognition API is not supported in this browser.');
+				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
+					// Create a SpeechRecognition object
+					speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+
+					// Set continuous to true for continuous recognition
+					speechRecognition.continuous = true;
+
+					// Set the timeout for turning off the recognition after inactivity (in milliseconds)
+					const inactivityTimeout = 3000; // 3 seconds
+
+					let timeoutId;
+					// Start recognition
+					speechRecognition.start();
+
+					// Event triggered when speech is recognized
+					speechRecognition.onresult = async (event) => {
+						// Clear the inactivity timeout
+						clearTimeout(timeoutId);
+
+						// Handle recognized speech
+						console.log(event);
+						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
+
+						prompt = `${prompt}${transcript}`;
+
+						await tick();
+						const inputElement = document.getElementById('chat-textarea');
+						inputElement?.focus();
+
+						// Restart the inactivity timeout
+						timeoutId = setTimeout(() => {
+							console.log('Speech recognition turned off due to inactivity.');
+							speechRecognition.stop();
+						}, inactivityTimeout);
+					};
+
+					// Event triggered when recognition is ended
+					speechRecognition.onend = function () {
+						// Restart recognition after it ends
+						console.log('recognition ended');
+						isRecording = false;
+						if (prompt !== '' && $settings?.speechAutoSend === true) {
+							submitPrompt(prompt, user);
+						}
+					};
+
+					// Event triggered when an error occurs
+					speechRecognition.onerror = function (event) {
+						console.log(event);
+						toast.error(`Speech recognition error: ${event.error}`);
+						isRecording = false;
+					};
+				} else {
+					toast.error('SpeechRecognition API is not supported in this browser.');
+				}
 			}
 		}
 	};
@@ -123,6 +231,20 @@

 		try {
 			files = [...files, doc];
+
+			if (['audio/mpeg', 'audio/wav'].includes(file['type'])) {
+				const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+					toast.error(error);
+					return null;
+				});
+
+				if (res) {
+					console.log(res);
+					const blob = new Blob([res.text], { type: 'text/plain' });
+					file = blobToFile(blob, `${file.name}.txt`);
+				}
+			}
+
 			const res = await uploadDocToVectorDB(localStorage.token, '', file);

 			if (res) {
@@ -535,7 +657,7 @@
 								: ' pl-4'} rounded-xl resize-none h-[48px]"
 							placeholder={chatInputPlaceholder !== ''
 								? chatInputPlaceholder
-								: speechRecognitionListening
+								: isRecording
 								? 'Listening...'
 								: 'Send a message'}
 							bind:value={prompt}
@@ -644,6 +766,10 @@
 								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 								user = null;
 							}}
+							on:focus={(e) => {
+								e.target.style.height = '';
+								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
+							}}
 							on:paste={(e) => {
 								const clipboardData = e.clipboardData || window.clipboardData;

@@ -681,7 +807,7 @@
 											speechRecognitionHandler();
 										}}
 									>
-										{#if speechRecognitionListening}
+										{#if isRecording}
 											<svg
 												class=" w-5 h-5 translate-y-[0.5px]"
 												fill="currentColor"
src/lib/components/chat/Messages/ResponseMessage.svelte

@@ -148,7 +148,7 @@
 		} else {
 			speaking = true;

-			if ($settings?.speech?.engine === 'openai') {
+			if ($settings?.audio?.TTSEngine === 'openai') {
 				loadingSpeech = true;

 				const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@@ -179,7 +179,7 @@
 				for (const [idx, sentence] of sentences.entries()) {
 					const res = await synthesizeOpenAISpeech(
 						localStorage.token,
-						$settings?.speech?.speaker,
+						$settings?.audio?.speaker,
 						sentence
 					).catch((error) => {
 						toast.error(error);
@@ -204,7 +204,7 @@
 						clearInterval(getVoicesLoop);

 						const voice =
-							voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+							voices?.filter((v) => v.name === $settings?.audio?.speaker)?.at(0) ?? undefined;

 						const speak = new SpeechSynthesisUtterance(message.content);

src/lib/components/chat/Settings/Audio.svelte (renamed from Voice.svelte)

@@ -1,17 +1,21 @@
 <script lang="ts">
 	import { createEventDispatcher, onMount } from 'svelte';
+	import toast from 'svelte-french-toast';
 	const dispatch = createEventDispatcher();

 	export let saveSettings: Function;

-	// Voice
+	// Audio
+
+	let STTEngines = ['', 'openai'];
+	let STTEngine = '';

 	let conversationMode = false;
 	let speechAutoSend = false;
 	let responseAutoPlayback = false;

-	let engines = ['', 'openai'];
-	let engine = '';
+	let TTSEngines = ['', 'openai'];
+	let TTSEngine = '';

 	let voices = [];
 	let speaker = '';
@@ -70,10 +74,11 @@
 		speechAutoSend = settings.speechAutoSend ?? false;
 		responseAutoPlayback = settings.responseAutoPlayback ?? false;

-		engine = settings?.speech?.engine ?? '';
-		speaker = settings?.speech?.speaker ?? '';
+		STTEngine = settings?.audio?.STTEngine ?? '';
+		TTSEngine = settings?.audio?.TTSEngine ?? '';
+		speaker = settings?.audio?.speaker ?? '';

-		if (engine === 'openai') {
+		if (TTSEngine === 'openai') {
 			getOpenAIVoices();
 		} else {
 			getWebAPIVoices();
@@ -85,37 +90,37 @@
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
 	on:submit|preventDefault={() => {
 		saveSettings({
-			speech: {
-				engine: engine !== '' ? engine : undefined,
+			audio: {
+				STTEngine: STTEngine !== '' ? STTEngine : undefined,
+				TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
 				speaker: speaker !== '' ? speaker : undefined
 			}
 		});
 		dispatch('save');
 	}}
 >
-	<div class=" space-y-3">
+	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
 		<div>
-			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+			<div class=" mb-1 text-sm font-medium">STT Settings</div>

 			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">Speech Engine</div>
+				<div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
 				<div class="flex items-center relative">
 					<select
 						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={engine}
+						bind:value={STTEngine}
 						placeholder="Select a mode"
 						on:change={(e) => {
-							if (e.target.value === 'openai') {
-								getOpenAIVoices();
-								speaker = 'alloy';
-							} else {
-								getWebAPIVoices();
-								speaker = '';
+							if (e.target.value !== '') {
+								navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
+									toast.error(`Permission denied when accessing microphone: ${err}`);
+									STTEngine = '';
+								});
 							}
 						}}
 					>
 						<option value="">Default (Web API)</option>
-						<option value="openai">Open AI</option>
+						<option value="whisper-local">Whisper (Local)</option>
 					</select>
 				</div>
 			</div>
@@ -155,6 +160,33 @@
 					{/if}
 				</button>
 			</div>
+		</div>
+
+		<div>
+			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+
+			<div class=" py-0.5 flex w-full justify-between">
+				<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
+				<div class="flex items-center relative">
+					<select
+						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+						bind:value={TTSEngine}
+						placeholder="Select a mode"
+						on:change={(e) => {
+							if (e.target.value === 'openai') {
+								getOpenAIVoices();
+								speaker = 'alloy';
+							} else {
+								getWebAPIVoices();
+								speaker = '';
+							}
+						}}
+					>
+						<option value="">Default (Web API)</option>
+						<option value="openai">Open AI</option>
+					</select>
+				</div>
+			</div>

 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">Auto-playback response</div>
@@ -177,7 +209,7 @@

 		<hr class=" dark:border-gray-700" />

-		{#if engine === ''}
+		{#if TTSEngine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
@@ -196,7 +228,7 @@
 					</div>
 				</div>
 			</div>
-		{:else if engine === 'openai'}
+		{:else if TTSEngine === 'openai'}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
src/lib/components/chat/SettingsModal.svelte

@@ -13,7 +13,7 @@
 	import General from './Settings/General.svelte';
 	import External from './Settings/External.svelte';
 	import Interface from './Settings/Interface.svelte';
-	import Voice from './Settings/Voice.svelte';
+	import Audio from './Settings/Audio.svelte';
 	import Chats from './Settings/Chats.svelte';

 	export let show = false;
@@ -206,11 +206,11 @@

 				<button
 					class="px-2.5 py-2.5 min-w-fit rounded-lg flex-1 md:flex-none flex text-right transition {selectedTab ===
-					'voice'
+					'audio'
 						? 'bg-gray-200 dark:bg-gray-700'
 						: ' hover:bg-gray-300 dark:hover:bg-gray-800'}"
 					on:click={() => {
-						selectedTab = 'voice';
+						selectedTab = 'audio';
 					}}
 				>
 					<div class=" self-center mr-2">
@@ -228,7 +228,7 @@
 							/>
 						</svg>
 					</div>
-					<div class=" self-center">Voice</div>
+					<div class=" self-center">Audio</div>
 				</button>

 				<button
@@ -341,8 +341,8 @@
 							show = false;
 						}}
 					/>
-				{:else if selectedTab === 'voice'}
-					<Voice
+				{:else if selectedTab === 'audio'}
+					<Audio
 						{saveSettings}
 						on:save={() => {
 							show = false;
src/lib/constants.ts

@@ -7,6 +7,7 @@ export const WEBUI_API_BASE_URL = `${WEBUI_BASE_URL}/api/v1`;
 export const OLLAMA_API_BASE_URL = `${WEBUI_BASE_URL}/ollama/api`;
 export const OPENAI_API_BASE_URL = `${WEBUI_BASE_URL}/openai/api`;
 export const RAG_API_BASE_URL = `${WEBUI_BASE_URL}/rag/api/v1`;
+export const AUDIO_API_BASE_URL = `${WEBUI_BASE_URL}/audio/api/v1`;

 export const WEB_UI_VERSION = 'v1.0.0-alpha-static';

@@ -23,7 +24,9 @@ export const SUPPORTED_FILE_TYPE = [
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 	'application/octet-stream',
 	'application/x-javascript',
-	'text/markdown'
+	'text/markdown',
+	'audio/mpeg',
+	'audio/wav'
 ];

 export const SUPPORTED_FILE_EXTENSIONS = [
src/lib/utils/index.ts

@@ -341,3 +341,9 @@ export const extractSentences = (text) => {
 		.map((sentence) => removeEmojis(sentence.trim()))
 		.filter((sentence) => sentence !== '');
 };
+
+export const blobToFile = (blob, fileName) => {
+	// Create a new File object from the Blob
+	const file = new File([blob], fileName, { type: blob.type });
+	return file;
+};