diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py
new file mode 100644
index 00000000..c03aaf37
--- /dev/null
+++ b/backend/apps/audio/main.py
@@ -0,0 +1,80 @@
+from fastapi import (
+    FastAPI,
+    Request,
+    Depends,
+    HTTPException,
+    status,
+    UploadFile,
+    File,
+    Form,
+)
+from fastapi.middleware.cors import CORSMiddleware
+from faster_whisper import WhisperModel
+
+from constants import ERROR_MESSAGES
+from utils.utils import (
+    decode_token,
+    get_current_user,
+    get_verified_user,
+    get_admin_user,
+)
+from utils.misc import calculate_sha256
+
+from config import CACHE_DIR, UPLOAD_DIR, WHISPER_MODEL_NAME
+
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+@app.post("/transcribe")
+def transcribe(
+    file: UploadFile = File(...),
+    user=Depends(get_current_user),
+):
+    print(file.content_type)
+
+    if file.content_type not in ["audio/mpeg", "audio/wav"]:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
+        )
+
+    try:
+        filename = file.filename
+        file_path = f"{UPLOAD_DIR}/{filename}"
+        contents = file.file.read()
+        with open(file_path, "wb") as f:
+            f.write(contents)
+            f.close()
+
+        model_name = WHISPER_MODEL_NAME
+        model = WhisperModel(
+            model_name,
+            device="cpu",
+            compute_type="int8",
+            download_root=f"{CACHE_DIR}/whisper/models",
+        )
+
+        segments, info = model.transcribe(file_path, beam_size=5)
+        print(
+            "Detected language '%s' with probability %f"
+            % (info.language, info.language_probability)
+        )
+
+        transcript = "".join([segment.text for segment in list(segments)])
+
+        return {"text": transcript.strip()}
+
+    except Exception as e:
+        print(e)
+
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=ERROR_MESSAGES.DEFAULT(e),
+        )
diff --git a/backend/config.py b/backend/config.py
index 65ee2298..81b90084 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -132,3 +132,8 @@ CHROMA_CLIENT = chromadb.PersistentClient(
 )
 CHUNK_SIZE = 1500
 CHUNK_OVERLAP = 100
+
+####################################
+# Transcribe
+####################################
+WHISPER_MODEL_NAME = "base"
diff --git a/backend/main.py b/backend/main.py
index f7a82b66..3a28670e 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -10,6 +10,8 @@ from starlette.exceptions import HTTPException as StarletteHTTPException
 
 from apps.ollama.main import app as ollama_app
 from apps.openai.main import app as openai_app
 
+from apps.audio.main import app as audio_app
+
 from apps.web.main import app as webui_app
 from apps.rag.main import app as rag_app
@@ -55,6 +57,8 @@
 app.mount("/api/v1", webui_app)
 
 app.mount("/ollama/api", ollama_app)
 app.mount("/openai/api", openai_app)
+
+app.mount("/audio/api/v1", audio_app)
 app.mount("/rag/api/v1", rag_app)
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 68cba254..56e1d36e 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -30,6 +30,8 @@
 openpyxl
 pyxlsb
 xlrd
 
+faster-whisper
+
 PyJWT
 pyjwt[crypto]
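Taken together, the backend pieces above expose the new route at `/audio/api/v1/transcribe`: the audio sub-app defines `POST /transcribe` and `backend/main.py` mounts it under `/audio/api/v1`. A minimal sketch of exercising it from a script, assuming a locally running instance at `http://localhost:8080`, a valid session JWT in `TOKEN`, and Node 18+ globals (none of which are part of this diff):

```ts
// Hypothetical smoke test for the new endpoint; not part of the PR.
// Assumes Node 18+ globals (fetch, FormData, Blob), ESM top-level await,
// and a server at http://localhost:8080 with a valid JWT in TOKEN.
import { readFile } from 'node:fs/promises';

const transcribeSample = async (path: string, token: string) => {
	const form = new FormData();
	// Field name "file" and the accepted types (audio/mpeg, audio/wav) come from apps/audio/main.py.
	form.append('file', new Blob([await readFile(path)], { type: 'audio/wav' }), 'sample.wav');

	const res = await fetch('http://localhost:8080/audio/api/v1/transcribe', {
		method: 'POST',
		headers: { authorization: `Bearer ${token}` },
		body: form
	});

	if (!res.ok) throw await res.json();
	return (await res.json()) as { text: string };
};

console.log(await transcribeSample('./sample.wav', process.env.TOKEN ?? ''));
```

The `{ text: ... }` response shape is the same one the `transcribeAudio` client below returns to its Svelte callers.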
diff --git a/src/lib/apis/audio/index.ts b/src/lib/apis/audio/index.ts
new file mode 100644
index 00000000..d2848339
--- /dev/null
+++ b/src/lib/apis/audio/index.ts
@@ -0,0 +1,31 @@
+import { AUDIO_API_BASE_URL } from '$lib/constants';
+
+export const transcribeAudio = async (token: string, file: File) => {
+	const data = new FormData();
+	data.append('file', file);
+
+	let error = null;
+	const res = await fetch(`${AUDIO_API_BASE_URL}/transcribe`, {
+		method: 'POST',
+		headers: {
+			Accept: 'application/json',
+			authorization: `Bearer ${token}`
+		},
+		body: data
+	})
+		.then(async (res) => {
+			if (!res.ok) throw await res.json();
+			return res.json();
+		})
+		.catch((err) => {
+			error = err.detail;
+			console.log(err);
+			return null;
+		});
+
+	if (error) {
+		throw error;
+	}
+
+	return res;
+};
diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte
index 0844c489..29354610 100644
--- a/src/lib/components/chat/MessageInput.svelte
+++ b/src/lib/components/chat/MessageInput.svelte
@@ -2,7 +2,7 @@
 	import toast from 'svelte-french-toast';
 	import { onMount, tick } from 'svelte';
 	import { settings } from '$lib/stores';
-	import { calculateSHA256, findWordIndices } from '$lib/utils';
+	import { blobToFile, calculateSHA256, findWordIndices } from '$lib/utils';
 
 	import Prompts from './MessageInput/PromptCommands.svelte';
 	import Suggestions from './MessageInput/Suggestions.svelte';
@@ -11,6 +11,7 @@
 	import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
 	import Documents from './MessageInput/Documents.svelte';
 	import Models from './MessageInput/Models.svelte';
+	import { transcribeAudio } from '$lib/apis/audio';
 
 	export let submitPrompt: Function;
 	export let stopResponse: Function;
@@ -34,7 +35,6 @@
 
 	export let fileUploadEnabled = true;
 	export let speechRecognitionEnabled = true;
-	export let speechRecognitionListening = false;
 
 	export let prompt = '';
 	export let messages = [];
@@ -50,62 +50,170 @@
 		}
 	}
 
+	let mediaRecorder;
+	let audioChunks = [];
+	let isRecording = false;
+	const MIN_DECIBELS = -45;
+
+	const startRecording = async () => {
+		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+		mediaRecorder = new MediaRecorder(stream);
+		mediaRecorder.onstart = () => {
+			isRecording = true;
+			console.log('Recording started');
+		};
+		mediaRecorder.ondataavailable = (event) => audioChunks.push(event.data);
+		mediaRecorder.onstop = async () => {
+			isRecording = false;
+			console.log('Recording stopped');
+
+			// Create a blob from the audio chunks
+			const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+
+			const file = blobToFile(audioBlob, 'recording.wav');
+
+			const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+				toast.error(error);
+				return null;
+			});
+
+			if (res) {
+				prompt = res.text;
+				await tick();
+
+				const inputElement = document.getElementById('chat-textarea');
+				inputElement?.focus();
+
+				if (prompt !== '' && $settings?.speechAutoSend === true) {
+					submitPrompt(prompt, user);
+				}
+			}
+
+			// saveRecording(audioBlob);
+			audioChunks = [];
+		};
+
+		// Start recording
+		mediaRecorder.start();
+
+		// Monitor silence
+		monitorSilence(stream);
+	};
+
+	const monitorSilence = (stream) => {
+		const audioContext = new AudioContext();
+		const audioStreamSource = audioContext.createMediaStreamSource(stream);
+		const analyser = audioContext.createAnalyser();
+		analyser.minDecibels = MIN_DECIBELS;
+		audioStreamSource.connect(analyser);
+
+		const bufferLength = analyser.frequencyBinCount;
+		const domainData = new Uint8Array(bufferLength);
+
+		let lastSoundTime = Date.now();
+
+		const detectSound = () => {
+			analyser.getByteFrequencyData(domainData);
+
+			if (domainData.some((value) => value > 0)) {
+				lastSoundTime = Date.now();
+			}
+
+			if (isRecording && Date.now() - lastSoundTime > 3000) {
+				mediaRecorder.stop();
+				audioContext.close();
+				return;
+			}
+
+			window.requestAnimationFrame(detectSound);
+		};
+
+		window.requestAnimationFrame(detectSound);
+	};
+
+	const saveRecording = (blob) => {
+		const url = URL.createObjectURL(blob);
+		const a = document.createElement('a');
+		document.body.appendChild(a);
+		a.style = 'display: none';
+		a.href = url;
+		a.download = 'recording.wav';
+		a.click();
+		window.URL.revokeObjectURL(url);
+	};
+
 	const speechRecognitionHandler = () => {
 		// Check if SpeechRecognition is supported
 
-		if (speechRecognitionListening) {
-			speechRecognition.stop();
+		if (isRecording) {
+			if (speechRecognition) {
+				speechRecognition.stop();
+			}
+
+			if (mediaRecorder) {
+				mediaRecorder.stop();
+			}
 		} else {
-			if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
-				// Create a SpeechRecognition object
-				speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+			isRecording = true;
 
-				// Set continuous to true for continuous recognition
-				speechRecognition.continuous = true;
-
-				// Set the timeout for turning off the recognition after inactivity (in milliseconds)
-				const inactivityTimeout = 3000; // 3 seconds
-
-				let timeoutId;
-				// Start recognition
-				speechRecognition.start();
-				speechRecognitionListening = true;
-
-				// Event triggered when speech is recognized
-				speechRecognition.onresult = function (event) {
-					// Clear the inactivity timeout
-					clearTimeout(timeoutId);
-
-					// Handle recognized speech
-					console.log(event);
-					const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
-					prompt = `${prompt}${transcript}`;
-
-					// Restart the inactivity timeout
-					timeoutId = setTimeout(() => {
-						console.log('Speech recognition turned off due to inactivity.');
-						speechRecognition.stop();
-					}, inactivityTimeout);
-				};
-
-				// Event triggered when recognition is ended
-				speechRecognition.onend = function () {
-					// Restart recognition after it ends
-					console.log('recognition ended');
-					speechRecognitionListening = false;
-					if (prompt !== '' && $settings?.speechAutoSend === true) {
-						submitPrompt(prompt, user);
-					}
-				};
-
-				// Event triggered when an error occurs
-				speechRecognition.onerror = function (event) {
-					console.log(event);
-					toast.error(`Speech recognition error: ${event.error}`);
-					speechRecognitionListening = false;
-				};
+			if (($settings?.voice?.STTEngine ?? '') !== '') {
+				startRecording();
 			} else {
-				toast.error('SpeechRecognition API is not supported in this browser.');
+				if ('SpeechRecognition' in window || 'webkitSpeechRecognition' in window) {
+					// Create a SpeechRecognition object
+					speechRecognition = new (window.SpeechRecognition || window.webkitSpeechRecognition)();
+
+					// Set continuous to true for continuous recognition
+					speechRecognition.continuous = true;
+
+					// Set the timeout for turning off the recognition after inactivity (in milliseconds)
+					const inactivityTimeout = 3000; // 3 seconds
+
+					let timeoutId;
+					// Start recognition
+					speechRecognition.start();
+
+					// Event triggered when speech is recognized
+					speechRecognition.onresult = async (event) => {
+						// Clear the inactivity timeout
+						clearTimeout(timeoutId);
+
+						// Handle recognized speech
+						console.log(event);
+						const transcript = event.results[Object.keys(event.results).length - 1][0].transcript;
+
+						prompt = `${prompt}${transcript}`;
+
+						await tick();
+						const inputElement = document.getElementById('chat-textarea');
+						inputElement?.focus();
+
+						// Restart the inactivity timeout
+						timeoutId = setTimeout(() => {
+							console.log('Speech recognition turned off due to inactivity.');
+							speechRecognition.stop();
+						}, inactivityTimeout);
+					};
+
+					// Event triggered when recognition is ended
+					speechRecognition.onend = function () {
+						// Restart recognition after it ends
+						console.log('recognition ended');
+						isRecording = false;
+						if (prompt !== '' && $settings?.speechAutoSend === true) {
+							submitPrompt(prompt, user);
+						}
+					};
+
+					// Event triggered when an error occurs
+					speechRecognition.onerror = function (event) {
+						console.log(event);
+						toast.error(`Speech recognition error: ${event.error}`);
+						isRecording = false;
+					};
+				} else {
+					toast.error('SpeechRecognition API is not supported in this browser.');
+				}
 			}
 		}
 	};
@@ -123,6 +231,20 @@
 					try {
 						files = [...files, doc];
+
+						if (['audio/mpeg', 'audio/wav'].includes(file['type'])) {
+							const res = await transcribeAudio(localStorage.token, file).catch((error) => {
+								toast.error(error);
+								return null;
+							});
+
+							if (res) {
+								console.log(res);
+								const blob = new Blob([res.text], { type: 'text/plain' });
+								file = blobToFile(blob, `${file.name}.txt`);
+							}
+						}
+
 						const res = await uploadDocToVectorDB(localStorage.token, '', file);
 
 						if (res) {
@@ -535,7 +657,7 @@
 						: ' pl-4'} rounded-xl resize-none h-[48px]"
 					placeholder={chatInputPlaceholder !== ''
 						? chatInputPlaceholder
-						: speechRecognitionListening
+						: isRecording
 						? 'Listening...'
 						: 'Send a message'}
 					bind:value={prompt}
@@ -644,6 +766,10 @@
 						e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
 						user = null;
 					}}
+					on:focus={(e) => {
+						e.target.style.height = '';
+						e.target.style.height = Math.min(e.target.scrollHeight, 200) + 'px';
+					}}
 					on:paste={(e) => {
 						const clipboardData = e.clipboardData || window.clipboardData;
@@ -681,7 +807,7 @@
 							speechRecognitionHandler();
 						}}
 					>
-						{#if speechRecognitionListening}
+						{#if isRecording}
@@ -179,7 +179,7 @@
 			for (const [idx, sentence] of sentences.entries()) {
 				const res = await synthesizeOpenAISpeech(
 					localStorage.token,
-					$settings?.speech?.speaker,
+					$settings?.audio?.speaker,
 					sentence
 				).catch((error) => {
 					toast.error(error);
@@ -204,7 +204,7 @@
 				clearInterval(getVoicesLoop);
 
 				const voice =
-					voices?.filter((v) => v.name === $settings?.speech?.speaker)?.at(0) ?? undefined;
+					voices?.filter((v) => v.name === $settings?.audio?.speaker)?.at(0) ?? undefined;
 
 				const speak = new SpeechSynthesisUtterance(message.content);
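The recording flow above hands its `Blob` to a `blobToFile` helper imported from `$lib/utils`. That helper is not shown in this diff, but it presumably just wraps the blob in a `File` so the multipart upload carries a filename, roughly:

```ts
// Assumed shape of the $lib/utils helper referenced above (not included in this diff):
// wrap a Blob in a File so FormData uploads get a filename and keep the MIME type.
export const blobToFile = (blob: Blob, fileName: string): File => {
	const file = new File([blob], fileName, { type: blob.type });
	return file;
};
```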
diff --git a/src/lib/components/chat/Settings/Voice.svelte b/src/lib/components/chat/Settings/Audio.svelte
similarity index 77%
rename from src/lib/components/chat/Settings/Voice.svelte
rename to src/lib/components/chat/Settings/Audio.svelte
index 0dc7f0c5..289ec0bb 100644
--- a/src/lib/components/chat/Settings/Voice.svelte
+++ b/src/lib/components/chat/Settings/Audio.svelte
@@ -1,17 +1,21 @@
-			TTS Settings
+			STT Settings
-			Speech Engine
+			Speech-to-Text Engine
@@ -155,6 +160,33 @@
 {/if}
+
+			TTS Settings
+
+			Text-to-Speech Engine
+
+			Auto-playback response
@@ -177,7 +209,7 @@
-	{#if engine === ''}
+	{#if TTSEngine === ''}
 			Set Voice
@@ -196,7 +228,7 @@
-	{:else if engine === 'openai'}
+	{:else if TTSEngine === 'openai'}
 			Set Voice
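Across these components the renamed settings converge on a shared shape: `MessageInput` checks an STT engine value to decide between the browser's `SpeechRecognition` and the new Whisper endpoint, the renamed `Audio` pane writes a `TTSEngine` value (`''` for browser speech synthesis, `'openai'` for the OpenAI speech API), and the playback code reads `speaker`. A rough sketch of that shape, inferred from usage in this diff rather than from any definitive schema:

```ts
// Inferred settings fields touched by this PR; names come from usage above,
// optionality and the exact persisted structure are assumptions.
interface AudioSettings {
	STTEngine?: string; // '' or unset: browser SpeechRecognition; any other value: record and POST to /audio/api/v1/transcribe
	TTSEngine?: string; // '': browser SpeechSynthesis; 'openai': synthesizeOpenAISpeech
	speaker?: string; // voice name used by SpeechSynthesis or passed to the OpenAI speech call
}
```

Note that `MessageInput.svelte` reads the STT engine from `$settings?.voice?.STTEngine` while the playback code reads `$settings?.audio?.speaker`, so the parent key differs between call sites in this diff.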
diff --git a/src/lib/components/chat/SettingsModal.svelte b/src/lib/components/chat/SettingsModal.svelte
index 83b2e307..3e5cf066 100644
--- a/src/lib/components/chat/SettingsModal.svelte
+++ b/src/lib/components/chat/SettingsModal.svelte
@@ -13,7 +13,7 @@
 	import General from './Settings/General.svelte';
 	import External from './Settings/External.svelte';
 	import Interface from './Settings/Interface.svelte';
-	import Voice from './Settings/Voice.svelte';
+	import Audio from './Settings/Audio.svelte';
 	import Chats from './Settings/Chats.svelte';
 
 	export let show = false;
@@ -206,11 +206,11 @@