forked from open-webui/open-webui

Merge pull request #707 from ollama-webui/whisper

feat: whisper support

commit e1a6ccd1aa by Timothy Jaeryang Baek
11 changed files with 374 additions and 85 deletions

backend/apps/audio/main.py (new file, +80)
@@ -0,0 +1,80 @@
+from fastapi import (
+    FastAPI,
+    Request,
+    Depends,
+    HTTPException,
+    status,
+    UploadFile,
+    File,
+    Form,
+)
+from fastapi.middleware.cors import CORSMiddleware
+from faster_whisper import WhisperModel
+
+from constants import ERROR_MESSAGES
+from utils.utils import (
+    decode_token,
+    get_current_user,
+    get_verified_user,
+    get_admin_user,
+)
+from utils.misc import calculate_sha256
+
+from config import CACHE_DIR, UPLOAD_DIR, WHISPER_MODEL_NAME
+
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.post("/transcribe")
+def transcribe(
+    file: UploadFile = File(...),
+    user=Depends(get_current_user),
+):
+    print(file.content_type)
+
+    if file.content_type not in ["audio/mpeg", "audio/wav"]:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
+        )
+
+    try:
+        filename = file.filename
+        file_path = f"{UPLOAD_DIR}/{filename}"
+        contents = file.file.read()
+        with open(file_path, "wb") as f:
+            f.write(contents)
+            f.close()
+
+        model_name = WHISPER_MODEL_NAME
+        model = WhisperModel(
+            model_name,
+            device="cpu",
+            compute_type="int8",
+            download_root=f"{CACHE_DIR}/whisper/models",
+        )
+
+        segments, info = model.transcribe(file_path, beam_size=5)
+        print(
+            "Detected language '%s' with probability %f"
+            % (info.language, info.language_probability)
+        )
+
+        transcript = "".join([segment.text for segment in list(segments)])
+
+        return {"text": transcript.strip()}
+
+    except Exception as e:
+        print(e)
+
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.DEFAULT(e),
+        )
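Note: with the router above mounted at /audio/api/v1 (see the backend/main.py hunk further down), the new endpoint can be exercised directly. A minimal sketch in Python, assuming a local deployment on port 8080; the base URL, token, and file name are placeholders, and requests is not a project dependency:

    import requests

    BASE_URL = "http://localhost:8080"  # assumption: default local deployment
    TOKEN = "<your-session-jwt>"        # placeholder: any valid bearer token

    # the handler rejects anything that is not audio/mpeg or audio/wav
    with open("recording.wav", "rb") as f:
        resp = requests.post(
            f"{BASE_URL}/audio/api/v1/transcribe",
            headers={"Authorization": f"Bearer {TOKEN}"},
            files={"file": ("recording.wav", f, "audio/wav")},
        )

    resp.raise_for_status()
    print(resp.json()["text"])  # the endpoint returns {"text": "..."}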
backend/config.py

@@ -132,3 +132,8 @@ CHROMA_CLIENT = chromadb.PersistentClient(
 )
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 100
+
+####################################
+# Transcribe
+####################################
+WHISPER_MODEL_NAME = "base"
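Note: WHISPER_MODEL_NAME is handed straight to faster-whisper, so the other model sizes that library downloads by name (for example "tiny", "small", "medium") should work in place of "base". A standalone sketch of what the handler does with it, using the same parameters; the audio path is a placeholder:

    from faster_whisper import WhisperModel

    # same settings as the /transcribe handler above
    model = WhisperModel("base", device="cpu", compute_type="int8")

    segments, info = model.transcribe("recording.wav", beam_size=5)
    print(f"Detected language '{info.language}' with probability {info.language_probability}")

    # segments is a lazy generator; joining it drives the actual decoding
    print("".join(segment.text for segment in segments).strip())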
backend/main.py

@@ -10,6 +10,8 @@ from starlette.exceptions import HTTPException as StarletteHTTPException

 from apps.ollama.main import app as ollama_app
 from apps.openai.main import app as openai_app
+from apps.audio.main import app as audio_app
+

 from apps.web.main import app as webui_app
 from apps.rag.main import app as rag_app
@@ -55,6 +57,8 @@ app.mount("/api/v1", webui_app)

 app.mount("/ollama/api", ollama_app)
 app.mount("/openai/api", openai_app)
+
+app.mount("/audio/api/v1", audio_app)
 app.mount("/rag/api/v1", rag_app)

backend/requirements.txt

@@ -30,6 +30,8 @@ openpyxl
 pyxlsb
 xlrd

+faster-whisper
+
 PyJWT
 pyjwt[crypto]

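Note: for existing installs, the new backend dependency can be pulled in with pip install faster-whisper before restarting; the requirements file above remains the authoritative list.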
							
								
								
									
src/lib/apis/audio/index.ts (new file, +31)
@@ -0,0 +1,31 @@
+import { AUDIO_API_BASE_URL } from '$lib/constants';
+
+export const transcribeAudio = async (token: string, file: File) => {
+	const data = new FormData();
+	data.append('file', file);
+
+	let error = null;
+	const res = await fetch(`${AUDIO_API_BASE_URL}/transcribe`, {
+		method: 'POST',
+		headers: {
+			Accept: 'application/json',
+			authorization: `Bearer ${token}`
+		},
+		body: data
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			error = err.detail;
+			console.log(err);
+			return null;
+		});
+
+	if (error) {
+		throw error;
+	}
+
+	return res;
+};
src/lib/components/chat/MessageInput.svelte

@@ -2,7 +2,7 @@
 	import toast from 'svelte-french-toast';
 	import { onMount, tick } from 'svelte';
 	import { settings } from '$lib/stores';
-	import { calculateSHA256, findWordIndices } from '$lib/utils';
+	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';

 	import Prompts from './MessageInput/PromptCommands.svelte';
 	import Suggestions from './MessageInput/Suggestions.svelte';
@@ -11,6 +11,7 @@
 	import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
 	import Documents from './MessageInput/Documents.svelte';
 	import Models from './MessageInput/Models.svelte';
+	import { transcribeAudio } from '$lib/apis/audio';

 	export let submitPrompt: Function;
 	export let stopResponse: Function;
@@ -34,7 +35,6 @@

 	export let fileUploadEnabled = true;
 	export let speechRecognitionEnabled = true;
-	export let speechRecognitionListening = false;

 	export let prompt = '';
 	export let messages = [];
@@ -50,62 +50,170 @@
 		}
 	}

+	let mediaRecorder;
+	let audioChunks = [];
+	let isRecording = false;
+	const MIN_DECIBELS = -45;
+
+	const startRecording = async () => {
+		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			isRecording = true;
+			console.log('Recording started');
+		};
+		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
+		mediaRecorder.onstop = async () => {
+			isRecording = false;
+			console.log('Recording stopped');
+
+			// Create a blob from the audio chunks
+			const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+
+			const file = blobToFile(audioBlob, 'recording.wav');
+
+			const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+				toast.error(error);
+				return null;
+			});
+
+			if (res) {
+				prompt = res.text;
+				await tick();
+
+				const inputElement = document.getElementById('chat-textarea');
+				inputElement?.focus();
+
+				if (prompt !== '' && $settings?.speechAutoSend === true) {
+					submitPrompt(prompt, user);
+				}
+			}
+
+			// saveRecording(audioBlob);
+			audioChunks = [];
+		};
+
+		// Start recording
+		mediaRecorder.start();
+
+		// Monitor silence
+		monitorSilence(stream);
+	};
+
+	const monitorSilence = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+		const domainData = new Uint8Array(bufferLength);
+
+		let lastSoundTime = Date.now();
+
+		const detectSound = () => {
+			analyser.getByteFrequencyData(domainData);
+
+			if (domainData.some((value) => value > 0)) {
+				lastSoundTime = Date.now();
+			}
+
+			if (isRecording && Date.now() - lastSoundTime > 3000) {
+				mediaRecorder.stop();
+				audioContext.close();
+				return;
+			}
+
+			window.requestAnimationFrame(detectSound);
+		};
+
+		window.requestAnimationFrame(detectSound);
+	};
+
+	const saveRecording = (blob) => {
+		const url = URL.createObjectURL(blob);
+		const a = document.createElement('a');
+		document.body.appendChild(a);
+		a.style = 'display: none';
+		a.href = url;
+		a.download = 'recording.wav';
+		a.click();
+		window.URL.revokeObjectURL(url);
+	};
+
 	const speechRecognitionHandler = () => {
 		// Check if SpeechRecognition is supported

-		if (speechRecognitionListening) {
-			speechRecognition.stop();
+		if (isRecording) {
+			if (speechRecognition) {
+				speechRecognition.stop();
+			}
+
+			if (mediaRecorder) {
+				mediaRecorder.stop();
+			}
 		} else {
-			if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
-				// Create a SpeechRecognition object
-				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+			isRecording = true;

-				// Set continuous to true for continuous recognition
-				speechRecognition.continuous = true;
-
-				// Set the timeout for turning off the recognition after inactivity (in milliseconds)
-				const inactivityTimeout = 3000; // 3 seconds
-
-				let timeoutId;
-				// Start recognition
-				speechRecognition.start();
-				speechRecognitionListening = true;
-
-				// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
-					// Clear the inactivity timeout
-					clearTimeout(timeoutId);
-
-					// Handle recognized speech
-					console.log(event);
-					const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
-					prompt = `${prompt}${transcript}`;
-
-					// Restart the inactivity timeout
-					timeoutId = setTimeout(() => {
-						console.log('Speech recognition turned off due to inactivity.');
-						speechRecognition.stop();
-					}, inactivityTimeout);
-				};
-
-				// Event triggered when recognition is ended
-				speechRecognition.onend = function () {
-					// Restart recognition after it ends
-					console.log('recognition ended');
-					speechRecognitionListening = false;
-					if (prompt !== '' && $settings?.speechAutoSend === true) {
-						submitPrompt(prompt, user);
-					}
-				};
-
-				// Event triggered when an error occurs
-				speechRecognition.onerror = function (event) {
-					console.log(event);
-					toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
-				};
+			if ($settings?.voice?.STTEngine ?? '' !== '') {
+				startRecording();
 			} else {
-				toast.error('SpeechRecognition API is not supported in this browser.');
+				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
+					// Create a SpeechRecognition object
+					speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+
+					// Set continuous to true for continuous recognition
+					speechRecognition.continuous = true;
+
+					// Set the timeout for turning off the recognition after inactivity (in milliseconds)
+					const inactivityTimeout = 3000; // 3 seconds
+
+					let timeoutId;
+					// Start recognition
+					speechRecognition.start();
+
+					// Event triggered when speech is recognized
+					speechRecognition.onresult = async (event) => {
+						// Clear the inactivity timeout
+						clearTimeout(timeoutId);
+
+						// Handle recognized speech
+						console.log(event);
+						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
+
+						prompt = `${prompt}${transcript}`;
+
+						await tick();
+						const inputElement = document.getElementById('chat-textarea');
+						inputElement?.focus();
+
+						// Restart the inactivity timeout
+						timeoutId = setTimeout(() => {
+							console.log('Speech recognition turned off due to inactivity.');
+							speechRecognition.stop();
+						}, inactivityTimeout);
+					};
+
+					// Event triggered when recognition is ended
+					speechRecognition.onend = function () {
+						// Restart recognition after it ends
+						console.log('recognition ended');
+						isRecording = false;
+						if (prompt !== '' && $settings?.speechAutoSend === true) {
+							submitPrompt(prompt, user);
+						}
+					};
+
+					// Event triggered when an error occurs
+					speechRecognition.onerror = function (event) {
+						console.log(event);
+						toast.error(`Speech recognition error: ${event.error}`);
+						isRecording = false;
+					};
+				} else {
+					toast.error('SpeechRecognition API is not supported in this browser.');
+				}
 			}
 		}
 	};
@@ -123,6 +231,20 @@

 		try {
 			files = [...files, doc];
+
+			if (['audio/mpeg', 'audio/wav'].includes(file['type'])) {
+				const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+					toast.error(error);
+					return null;
+				});
+
+				if (res) {
+					console.log(res);
+					const blob = new Blob([res.text], { type: 'text/plain' });
+					file = blobToFile(blob, `${file.name}.txt`);
+				}
+			}
+
 			const res = await uploadDocToVectorDB(localStorage.token, '', file);

 			if (res) {
@@ -535,7 +657,7 @@
 								: ' pl-4'} rounded-xl resize-none h-[48px]"
 							placeholder={chatInputPlaceholder !== ''
 								? chatInputPlaceholder
-								: speechRecognitionListening
+								: isRecording
 								? 'Listening...'
 								: 'Send a message'}
 							bind:value={prompt}
@@ -644,6 +766,10 @@
 								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 								user = null;
 							}}
+							on:focus={(e) => {
+								e.target.style.height = '';
+								e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
+							}}
 							on:paste={(e) => {
 								const clipboardData = e.clipboardData || window.clipboardData;

@@ -681,7 +807,7 @@
 											speechRecognitionHandler();
 										}}
 									>
-										{#if speechRecognitionListening}
+										{#if isRecording}
 											<svg
 												class=" w-5 h-5 translate-y-[0.5px]"
 												fill="currentColor"
src/lib/components/chat/Messages/ResponseMessage.svelte

@@ -148,7 +148,7 @@
 		} else {
 			speaking = true;

-			if ($settings?.speech?.engine === 'openai') {
+			if ($settings?.audio?.TTSEngine === 'openai') {
 				loadingSpeech = true;

 				const sentences = extractSentences(message.content).reduce((mergedTexts, currentText) => {
@@ -179,7 +179,7 @@
 				for (const [idx, sentence] of sentences.entries()) {
 					const res = await synthesizeOpenAISpeech(
 						localStorage.token,
-						$settings?.speech?.speaker,
+						$settings?.audio?.speaker,
 						sentence
 					).catch((error) => {
 						toast.error(error);
@@ -204,7 +204,7 @@
 						clearInterval(getVoicesLoop);

 						const voice =
-							voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+							voices?.filter((v) => v.name === $settings?.audio?.speaker)?.at(0) ?? undefined;

 						const speak = new SpeechSynthesisUtterance(message.content);

src/lib/components/chat/Settings/Audio.svelte (renamed from Voice.svelte)

@@ -1,17 +1,21 @@
 <script lang="ts">
 	import { createEventDispatcher, onMount } from 'svelte';
+	import toast from 'svelte-french-toast';
 	const dispatch = createEventDispatcher();

 	export let saveSettings: Function;

-	// Voice
+	// Audio
+
+	let STTEngines = ['', 'openai'];
+	let STTEngine = '';

 	let conversationMode = false;
 	let speechAutoSend = false;
 	let responseAutoPlayback = false;

-	let engines = ['', 'openai'];
-	let engine = '';
+	let TTSEngines = ['', 'openai'];
+	let TTSEngine = '';

 	let voices = [];
 	let speaker = '';
@@ -70,10 +74,11 @@
 		speechAutoSend = settings.speechAutoSend ?? false;
 		responseAutoPlayback = settings.responseAutoPlayback ?? false;

-		engine = settings?.speech?.engine ?? '';
-		speaker = settings?.speech?.speaker ?? '';
+		STTEngine = settings?.audio?.STTEngine ?? '';
+		TTSEngine = settings?.audio?.TTSEngine ?? '';
+		speaker = settings?.audio?.speaker ?? '';

-		if (engine === 'openai') {
+		if (TTSEngine === 'openai') {
 			getOpenAIVoices();
 		} else {
 			getWebAPIVoices();
@@ -85,37 +90,37 @@
 	class="flex flex-col h-full justify-between space-y-3 text-sm"
 	on:submit|preventDefault={() => {
 		saveSettings({
-			speech: {
-				engine: engine !== '' ? engine : undefined,
+			audio: {
+				STTEngine: STTEngine !== '' ? STTEngine : undefined,
+				TTSEngine: TTSEngine !== '' ? TTSEngine : undefined,
 				speaker: speaker !== '' ? speaker : undefined
 			}
 		});
 		dispatch('save');
 	}}
 >
-	<div class=" space-y-3">
+	<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
 		<div>
-			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+			<div class=" mb-1 text-sm font-medium">STT Settings</div>

 			<div class=" py-0.5 flex w-full justify-between">
-				<div class=" self-center text-xs font-medium">Speech Engine</div>
+				<div class=" self-center text-xs font-medium">Speech-to-Text Engine</div>
 				<div class="flex items-center relative">
 					<select
 						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
-						bind:value={engine}
+						bind:value={STTEngine}
 						placeholder="Select a mode"
 						on:change={(e) => {
-							if (e.target.value === 'openai') {
-								getOpenAIVoices();
-								speaker = 'alloy';
-							} else {
-								getWebAPIVoices();
-								speaker = '';
+							if (e.target.value !== '') {
+								navigator.mediaDevices.getUserMedia({ audio: true }).catch(function (err) {
+									toast.error(`Permission denied when accessing microphone: ${err}`);
+									STTEngine = '';
+								});
 							}
 						}}
 					>
 						<option value="">Default (Web API)</option>
-						<option value="openai">Open AI</option>
+						<option value="whisper-local">Whisper (Local)</option>
 					</select>
 				</div>
 			</div>
@@ -155,6 +160,33 @@
 					{/if}
 				</button>
 			</div>
+		</div>
+
+		<div>
+			<div class=" mb-1 text-sm font-medium">TTS Settings</div>
+
+			<div class=" py-0.5 flex w-full justify-between">
+				<div class=" self-center text-xs font-medium">Text-to-Speech Engine</div>
+				<div class="flex items-center relative">
+					<select
+						class="w-fit pr-8 rounded px-2 p-1 text-xs bg-transparent outline-none text-right"
+						bind:value={TTSEngine}
+						placeholder="Select a mode"
+						on:change={(e) => {
+							if (e.target.value === 'openai') {
+								getOpenAIVoices();
+								speaker = 'alloy';
+							} else {
+								getWebAPIVoices();
+								speaker = '';
+							}
+						}}
+					>
+						<option value="">Default (Web API)</option>
+						<option value="openai">Open AI</option>
+					</select>
+				</div>
+			</div>

 			<div class=" py-0.5 flex w-full justify-between">
 				<div class=" self-center text-xs font-medium">Auto-playback response</div>
@@ -177,7 +209,7 @@

 		<hr class=" dark:border-gray-700" />

-		{#if engine === ''}
+		{#if TTSEngine === ''}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
@@ -196,7 +228,7 @@
 					</div>
 				</div>
 			</div>
-		{:else if engine === 'openai'}
+		{:else if TTSEngine === 'openai'}
 			<div>
 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div>
 				<div class="flex w-full">
src/lib/components/chat/SettingsModal.svelte

@@ -13,7 +13,7 @@
 	import General from './Settings/General.svelte';
 	import External from './Settings/External.svelte';
 	import Interface from './Settings/Interface.svelte';
-	import Voice from './Settings/Voice.svelte';
+	import Audio from './Settings/Audio.svelte';
 	import Chats from './Settings/Chats.svelte';

 	export let show = false;
@@ -206,11 +206,11 @@

 				<button
 					class="px-2.5 py-2.5 min-w-fit rounded-lg flex-1 md:flex-none flex text-right transition {selectedTab ===
-					'voice'
+					'audio'
 						? 'bg-gray-200 dark:bg-gray-700'
 						: ' hover:bg-gray-300 dark:hover:bg-gray-800'}"
 					on:click={() => {
-						selectedTab = 'voice';
+						selectedTab = 'audio';
 					}}
 				>
 					<div class=" self-center mr-2">
@@ -228,7 +228,7 @@
 							/>
 						</svg>
 					</div>
-					<div class=" self-center">Voice</div>
+					<div class=" self-center">Audio</div>
 				</button>

 				<button
@@ -341,8 +341,8 @@
 							show = false;
 						}}
 					/>
-				{:else if selectedTab === 'voice'}
-					<Voice
+				{:else if selectedTab === 'audio'}
+					<Audio
 						{saveSettings}
 						on:save={() => {
 							show = false;
src/lib/constants.ts

@@ -7,6 +7,7 @@ export const WEBUI_API_BASE_URL = `${WEBUI_BASE_URL}/api/v1`;
 export const OLLAMA_API_BASE_URL = `${WEBUI_BASE_URL}/ollama/api`;
 export const OPENAI_API_BASE_URL = `${WEBUI_BASE_URL}/openai/api`;
 export const RAG_API_BASE_URL = `${WEBUI_BASE_URL}/rag/api/v1`;
+export const AUDIO_API_BASE_URL = `${WEBUI_BASE_URL}/audio/api/v1`;

 export const WEB_UI_VERSION = 'v1.0.0-alpha-static';

@@ -23,7 +24,9 @@ export const SUPPORTED_FILE_TYPE = [
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 	'application/octet-stream',
 	'application/x-javascript',
-	'text/markdown'
+	'text/markdown',
+	'audio/mpeg',
+	'audio/wav'
 ];

 export const SUPPORTED_FILE_EXTENSIONS = [
src/lib/utils/index.ts

@@ -341,3 +341,9 @@ export const extractSentences = (text) => {
 		.map((sentence) => removeEmojis(sentence.trim()))
 		.filter((sentence) => sentence !== '');
 };
+
+export const blobToFile = (blob, fileName) => {
+	// Create a new File object from the Blob
+	const file = new File([blob], fileName, { type: blob.type });
+	return file;
+};