forked from open-webui/open-webui
		
	feat: openai tts support
This commit is contained in:
		
							parent
							
								
									ce31113abd
								
							
						
					
					
						commit
						0b8df52c97
					
				
					 5 changed files with 216 additions and 23 deletions
				
			
		|  | @ -1,15 +1,19 @@ | ||||||
| from fastapi import FastAPI, Request, Response, HTTPException, Depends | from fastapi import FastAPI, Request, Response, HTTPException, Depends | ||||||
| from fastapi.middleware.cors import CORSMiddleware | from fastapi.middleware.cors import CORSMiddleware | ||||||
| from fastapi.responses import StreamingResponse, JSONResponse | from fastapi.responses import StreamingResponse, JSONResponse, FileResponse | ||||||
| 
 | 
 | ||||||
| import requests | import requests | ||||||
| import json | import json | ||||||
| from pydantic import BaseModel | from pydantic import BaseModel | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| from apps.web.models.users import Users | from apps.web.models.users import Users | ||||||
| from constants import ERROR_MESSAGES | from constants import ERROR_MESSAGES | ||||||
| from utils.utils import decode_token, get_current_user | from utils.utils import decode_token, get_current_user | ||||||
| from config import OPENAI_API_BASE_URL, OPENAI_API_KEY | from config import OPENAI_API_BASE_URL, OPENAI_API_KEY, CACHE_DIR | ||||||
|  | 
 | ||||||
|  | import hashlib | ||||||
|  | from pathlib import Path | ||||||
| 
 | 
 | ||||||
| app = FastAPI() | app = FastAPI() | ||||||
| app.add_middleware( | app.add_middleware( | ||||||
|  | @ -66,6 +70,73 @@ async def update_openai_key(form_data: KeyUpdateForm, user=Depends(get_current_u | ||||||
|         raise HTTPException(status_code=401, detail=ERROR_MESSAGES.ACCESS_PROHIBITED) |         raise HTTPException(status_code=401, detail=ERROR_MESSAGES.ACCESS_PROHIBITED) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
@app.post("/audio/speech")
async def speech(request: Request, user=Depends(get_current_user)):
    """Proxy a text-to-speech request to OpenAI and cache the audio on disk.

    The raw request body (expected to be the OpenAI ``/audio/speech`` JSON
    payload: model/input/voice) is forwarded verbatim upstream.  The resulting
    MP3 is cached keyed by the SHA-256 of the request body, so an identical
    payload is served from cache without another API call.

    Raises:
        HTTPException: 401 when the user's role is not allowed or no API key
            is configured; otherwise the upstream status code, or 500 when
            the upstream server could not be reached at all.
    """
    target_url = f"{app.state.OPENAI_API_BASE_URL}/audio/speech"

    if user.role not in ["user", "admin"]:
        raise HTTPException(status_code=401, detail=ERROR_MESSAGES.ACCESS_PROHIBITED)
    if app.state.OPENAI_API_KEY == "":
        raise HTTPException(status_code=401, detail=ERROR_MESSAGES.API_KEY_NOT_FOUND)

    body = await request.body()

    # Cache key: identical payloads (same model/input/voice JSON) hash to the
    # same filename, so repeated requests are deduplicated.
    filename = hashlib.sha256(body).hexdigest() + ".mp3"
    SPEECH_CACHE_DIR = Path(CACHE_DIR).joinpath("./audio/speech/")
    SPEECH_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    file_path = SPEECH_CACHE_DIR.joinpath(filename)

    # Serve straight from the cache if we already synthesized this payload.
    if file_path.is_file():
        return FileResponse(file_path)

    headers = {
        "Authorization": f"Bearer {app.state.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    # Initialize before the try so the except block can inspect it safely:
    # previously a failed requests.post left `r` undefined and the error path
    # itself raised NameError instead of returning the intended HTTP error.
    r = None
    try:
        r = requests.post(
            url=target_url,
            data=body,
            headers=headers,
            stream=True,
        )
        r.raise_for_status()

        # Stream the upstream audio into the cache file, then serve it.
        with open(file_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

        return FileResponse(file_path)

    except Exception as e:
        print(e)
        error_detail = "Ollama WebUI: Server Connection Error"
        if r is not None:
            try:
                # Prefer the upstream-provided error message when available.
                res = r.json()
                if "error" in res:
                    error_detail = f"External: {res['error']}"
            except Exception:
                error_detail = f"External: {e}"

        raise HTTPException(
            # No response object means we never reached the upstream server.
            status_code=r.status_code if r is not None else 500,
            detail=error_detail,
        )
 | ||||||
|  | 
 | ||||||
| @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"]) | @app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE"]) | ||||||
| async def proxy(path: str, request: Request, user=Depends(get_current_user)): | async def proxy(path: str, request: Request, user=Depends(get_current_user)): | ||||||
|     target_url = f"{app.state.OPENAI_API_BASE_URL}/{path}" |     target_url = f"{app.state.OPENAI_API_BASE_URL}/{path}" | ||||||
|  | @ -129,8 +200,6 @@ async def proxy(path: str, request: Request, user=Depends(get_current_user)): | ||||||
| 
 | 
 | ||||||
|             response_data = r.json() |             response_data = r.json() | ||||||
| 
 | 
 | ||||||
|             print(type(response_data)) |  | ||||||
| 
 |  | ||||||
|             if "openai" in app.state.OPENAI_API_BASE_URL and path == "models": |             if "openai" in app.state.OPENAI_API_BASE_URL and path == "models": | ||||||
|                 response_data["data"] = list( |                 response_data["data"] = list( | ||||||
|                     filter(lambda model: "gpt" in model["id"], response_data["data"]) |                     filter(lambda model: "gpt" in model["id"], response_data["data"]) | ||||||
|  |  | ||||||
|  | @ -35,6 +35,14 @@ FRONTEND_BUILD_DIR = str(Path(os.getenv("FRONTEND_BUILD_DIR", "../build"))) | ||||||
| UPLOAD_DIR = f"{DATA_DIR}/uploads" | UPLOAD_DIR = f"{DATA_DIR}/uploads" | ||||||
| Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True) | Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True) | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | #################################### | ||||||
|  | # Cache DIR | ||||||
|  | #################################### | ||||||
|  | 
 | ||||||
|  | CACHE_DIR = f"{DATA_DIR}/cache" | ||||||
|  | Path(CACHE_DIR).mkdir(parents=True, exist_ok=True) | ||||||
|  | 
 | ||||||
| #################################### | #################################### | ||||||
| # OLLAMA_API_BASE_URL | # OLLAMA_API_BASE_URL | ||||||
| #################################### | #################################### | ||||||
|  |  | ||||||
|  | @ -229,3 +229,34 @@ export const generateOpenAIChatCompletion = async (token: string = '', body: obj | ||||||
| 
 | 
 | ||||||
| 	return res; | 	return res; | ||||||
| }; | }; | ||||||
|  | 
 | ||||||
|  | export const synthesizeOpenAISpeech = async ( | ||||||
|  | 	token: string = '', | ||||||
|  | 	speaker: string = 'alloy', | ||||||
|  | 	text: string = '' | ||||||
|  | ) => { | ||||||
|  | 	let error = null; | ||||||
|  | 
 | ||||||
|  | 	const res = await fetch(`${OPENAI_API_BASE_URL}/audio/speech`, { | ||||||
|  | 		method: 'POST', | ||||||
|  | 		headers: { | ||||||
|  | 			Authorization: `Bearer ${token}`, | ||||||
|  | 			'Content-Type': 'application/json' | ||||||
|  | 		}, | ||||||
|  | 		body: JSON.stringify({ | ||||||
|  | 			model: 'tts-1', | ||||||
|  | 			input: text, | ||||||
|  | 			voice: speaker | ||||||
|  | 		}) | ||||||
|  | 	}).catch((err) => { | ||||||
|  | 		console.log(err); | ||||||
|  | 		error = err; | ||||||
|  | 		return null; | ||||||
|  | 	}); | ||||||
|  | 
 | ||||||
|  | 	if (error) { | ||||||
|  | 		throw error; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return res; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | @ -1,4 +1,5 @@ | ||||||
| <script lang="ts"> | <script lang="ts"> | ||||||
|  | 	import toast from 'svelte-french-toast'; | ||||||
| 	import dayjs from 'dayjs'; | 	import dayjs from 'dayjs'; | ||||||
| 	import { marked } from 'marked'; | 	import { marked } from 'marked'; | ||||||
| 	import { settings } from '$lib/stores'; | 	import { settings } from '$lib/stores'; | ||||||
|  | @ -13,6 +14,8 @@ | ||||||
| 	import Skeleton from './Skeleton.svelte'; | 	import Skeleton from './Skeleton.svelte'; | ||||||
| 	import CodeBlock from './CodeBlock.svelte'; | 	import CodeBlock from './CodeBlock.svelte'; | ||||||
| 
 | 
 | ||||||
|  | 	import { synthesizeOpenAISpeech } from '$lib/apis/openai'; | ||||||
|  | 
 | ||||||
| 	export let modelfiles = []; | 	export let modelfiles = []; | ||||||
| 	export let message; | 	export let message; | ||||||
| 	export let siblings; | 	export let siblings; | ||||||
|  | @ -27,6 +30,8 @@ | ||||||
| 	export let copyToClipboard: Function; | 	export let copyToClipboard: Function; | ||||||
| 	export let regenerateResponse: Function; | 	export let regenerateResponse: Function; | ||||||
| 
 | 
 | ||||||
|  | 	let audioMap = {}; | ||||||
|  | 
 | ||||||
| 	let edit = false; | 	let edit = false; | ||||||
| 	let editedContent = ''; | 	let editedContent = ''; | ||||||
| 
 | 
 | ||||||
|  | @ -114,23 +119,56 @@ | ||||||
| 		if (speaking) { | 		if (speaking) { | ||||||
| 			speechSynthesis.cancel(); | 			speechSynthesis.cancel(); | ||||||
| 			speaking = null; | 			speaking = null; | ||||||
|  | 
 | ||||||
|  | 			audioMap[message.id].pause(); | ||||||
|  | 			audioMap[message.id].currentTime = 0; | ||||||
| 		} else { | 		} else { | ||||||
| 			speaking = true; | 			speaking = true; | ||||||
| 
 | 
 | ||||||
|  | 			if ($settings?.speech?.engine === 'openai') { | ||||||
|  | 				const res = await synthesizeOpenAISpeech( | ||||||
|  | 					localStorage.token, | ||||||
|  | 					$settings?.speech?.speaker, | ||||||
|  | 					message.content | ||||||
|  | 				).catch((error) => { | ||||||
|  | 					toast.error(error); | ||||||
|  | 					return null; | ||||||
|  | 				}); | ||||||
|  | 
 | ||||||
|  | 				if (res) { | ||||||
|  | 					const blob = await res.blob(); | ||||||
|  | 					const blobUrl = URL.createObjectURL(blob); | ||||||
|  | 					console.log(blobUrl); | ||||||
|  | 
 | ||||||
|  | 					const audio = new Audio(blobUrl); | ||||||
|  | 					audioMap[message.id] = audio; | ||||||
|  | 
 | ||||||
|  | 					audio.onended = () => { | ||||||
|  | 						speaking = null; | ||||||
|  | 					}; | ||||||
|  | 					audio.play().catch((e) => console.error('Error playing audio:', e)); | ||||||
|  | 				} | ||||||
|  | 			} else { | ||||||
| 				let voices = []; | 				let voices = []; | ||||||
| 				const getVoicesLoop = setInterval(async () => { | 				const getVoicesLoop = setInterval(async () => { | ||||||
| 					voices = await speechSynthesis.getVoices(); | 					voices = await speechSynthesis.getVoices(); | ||||||
| 					if (voices.length > 0) { | 					if (voices.length > 0) { | ||||||
| 						clearInterval(getVoicesLoop); | 						clearInterval(getVoicesLoop); | ||||||
| 
 | 
 | ||||||
| 					const voice = voices?.filter((v) => v.name === $settings?.speaker)?.at(0) ?? undefined; | 						const voice = | ||||||
|  | 							voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined; | ||||||
| 
 | 
 | ||||||
| 						const speak = new SpeechSynthesisUtterance(message.content); | 						const speak = new SpeechSynthesisUtterance(message.content); | ||||||
|  | 
 | ||||||
|  | 						speak.onend = () => { | ||||||
|  | 							speaking = null; | ||||||
|  | 						}; | ||||||
| 						speak.voice = voice; | 						speak.voice = voice; | ||||||
| 						speechSynthesis.speak(speak); | 						speechSynthesis.speak(speak); | ||||||
| 					} | 					} | ||||||
| 				}, 100); | 				}, 100); | ||||||
| 			} | 			} | ||||||
|  | 		} | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
| 	const editMessageHandler = async () => { | 	const editMessageHandler = async () => { | ||||||
|  |  | ||||||
|  | @ -6,16 +6,23 @@ | ||||||
| 
 | 
 | ||||||
| 	// Voice | 	// Voice | ||||||
| 	let engines = ['', 'openai']; | 	let engines = ['', 'openai']; | ||||||
| 	let selectedEngine = ''; | 	let engine = ''; | ||||||
| 
 | 
 | ||||||
| 	let voices = []; | 	let voices = []; | ||||||
| 	let speaker = ''; | 	let speaker = ''; | ||||||
| 
 | 
 | ||||||
| 	onMount(async () => { | 	const getOpenAIVoices = () => { | ||||||
| 		let settings = JSON.parse(localStorage.getItem('settings') ?? '{}'); | 		voices = [ | ||||||
| 
 | 			{ name: 'alloy' }, | ||||||
| 		speaker = settings.speaker ?? ''; | 			{ name: 'echo' }, | ||||||
|  | 			{ name: 'fable' }, | ||||||
|  | 			{ name: 'onyx' }, | ||||||
|  | 			{ name: 'nova' }, | ||||||
|  | 			{ name: 'shimmer' } | ||||||
|  | 		]; | ||||||
|  | 	}; | ||||||
| 
 | 
 | ||||||
|  | 	const getWebAPIVoices = () => { | ||||||
| 		const getVoicesLoop = setInterval(async () => { | 		const getVoicesLoop = setInterval(async () => { | ||||||
| 			voices = await speechSynthesis.getVoices(); | 			voices = await speechSynthesis.getVoices(); | ||||||
| 
 | 
 | ||||||
|  | @ -24,6 +31,19 @@ | ||||||
| 				clearInterval(getVoicesLoop); | 				clearInterval(getVoicesLoop); | ||||||
| 			} | 			} | ||||||
| 		}, 100); | 		}, 100); | ||||||
|  | 	}; | ||||||
|  | 
 | ||||||
|  | 	onMount(async () => { | ||||||
|  | 		let settings = JSON.parse(localStorage.getItem('settings') ?? '{}'); | ||||||
|  | 
 | ||||||
|  | 		engine = settings?.speech?.engine ?? ''; | ||||||
|  | 		speaker = settings?.speech?.speaker ?? ''; | ||||||
|  | 
 | ||||||
|  | 		if (engine === 'openai') { | ||||||
|  | 			getOpenAIVoices(); | ||||||
|  | 		} else { | ||||||
|  | 			getWebAPIVoices(); | ||||||
|  | 		} | ||||||
| 	}); | 	}); | ||||||
| </script> | </script> | ||||||
| 
 | 
 | ||||||
|  | @ -31,7 +51,10 @@ | ||||||
| 	class="flex flex-col h-full justify-between space-y-3 text-sm" | 	class="flex flex-col h-full justify-between space-y-3 text-sm" | ||||||
| 	on:submit|preventDefault={() => { | 	on:submit|preventDefault={() => { | ||||||
| 		saveSettings({ | 		saveSettings({ | ||||||
|  | 			speech: { | ||||||
|  | 				engine: engine !== '' ? engine : undefined, | ||||||
| 				speaker: speaker !== '' ? speaker : undefined | 				speaker: speaker !== '' ? speaker : undefined | ||||||
|  | 			} | ||||||
| 		}); | 		}); | ||||||
| 		dispatch('save'); | 		dispatch('save'); | ||||||
| 	}} | 	}} | ||||||
|  | @ -42,10 +65,16 @@ | ||||||
| 			<div class="flex items-center relative"> | 			<div class="flex items-center relative"> | ||||||
| 				<select | 				<select | ||||||
| 					class="w-fit pr-8 rounded py-2 px-2 text-xs bg-transparent outline-none text-right" | 					class="w-fit pr-8 rounded py-2 px-2 text-xs bg-transparent outline-none text-right" | ||||||
| 					bind:value={selectedEngine} | 					bind:value={engine} | ||||||
| 					placeholder="Select a mode" | 					placeholder="Select a mode" | ||||||
| 					on:change={(e) => { | 					on:change={(e) => { | ||||||
| 						console.log(e); | 						if (e.target.value === 'openai') { | ||||||
|  | 							getOpenAIVoices(); | ||||||
|  | 							speaker = 'alloy'; | ||||||
|  | 						} else { | ||||||
|  | 							getWebAPIVoices(); | ||||||
|  | 							speaker = ''; | ||||||
|  | 						} | ||||||
| 					}} | 					}} | ||||||
| 				> | 				> | ||||||
| 					<option value="">Default (Web API)</option> | 					<option value="">Default (Web API)</option> | ||||||
|  | @ -56,7 +85,7 @@ | ||||||
| 
 | 
 | ||||||
| 		<hr class=" dark:border-gray-700" /> | 		<hr class=" dark:border-gray-700" /> | ||||||
| 
 | 
 | ||||||
| 		{#if selectedEngine === ''} | 		{#if engine === ''} | ||||||
| 			<div> | 			<div> | ||||||
| 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div> | 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div> | ||||||
| 				<div class="flex w-full"> | 				<div class="flex w-full"> | ||||||
|  | @ -75,6 +104,24 @@ | ||||||
| 					</div> | 					</div> | ||||||
| 				</div> | 				</div> | ||||||
| 			</div> | 			</div> | ||||||
|  | 		{:else if engine === 'openai'} | ||||||
|  | 			<div> | ||||||
|  | 				<div class=" mb-2.5 text-sm font-medium">Set Voice</div> | ||||||
|  | 				<div class="flex w-full"> | ||||||
|  | 					<div class="flex-1"> | ||||||
|  | 						<select | ||||||
|  | 							class="w-full rounded py-2 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none" | ||||||
|  | 							bind:value={speaker} | ||||||
|  | 							placeholder="Select a voice" | ||||||
|  | 						> | ||||||
|  | 							{#each voices as voice} | ||||||
|  | 								<option value={voice.name} class="bg-gray-100 dark:bg-gray-700">{voice.name}</option | ||||||
|  | 								> | ||||||
|  | 							{/each} | ||||||
|  | 						</select> | ||||||
|  | 					</div> | ||||||
|  | 				</div> | ||||||
|  | 			</div> | ||||||
| 		{/if} | 		{/if} | ||||||
| 	</div> | 	</div> | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy J. Baek
						Timothy J. Baek