feat: youtube rag

This commit is contained in:
Timothy J. Baek 2024-05-01 17:17:00 -07:00
parent e60c87d750
commit 0595c04909
7 changed files with 180 additions and 30 deletions

View file

@ -28,6 +28,7 @@ from langchain_community.document_loaders import (
UnstructuredXMLLoader,
UnstructuredRSTLoader,
UnstructuredExcelLoader,
YoutubeLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
@ -181,7 +182,7 @@ class CollectionNameForm(BaseModel):
collection_name: Optional[str] = "test"
class StoreWebForm(CollectionNameForm):
class UrlForm(CollectionNameForm):
url: str
@ -456,8 +457,32 @@ def query_collection_handler(
)
@app.post("/youtube")
def store_youtube_video(form_data: UrlForm, user=Depends(get_current_user)):
    """Load a YouTube video through ``YoutubeLoader`` and store it in the vector DB.

    Args:
        form_data: Request body carrying the video ``url`` and an optional
            ``collection_name``.
        user: Resolved by ``get_current_user``; not read in the body, it only
            gates the route behind that dependency.

    Returns:
        A dict with ``status``, the ``collection_name`` that was written, and
        the source URL under ``filename``.

    Raises:
        HTTPException: 400 with the wrapped error message when loading or
            storing fails for any reason.
    """
    try:
        loader = YoutubeLoader.from_youtube_url(form_data.url, add_video_info=False)
        data = loader.load()

        collection_name = form_data.collection_name
        # Falsy check rather than `== ""`: the field is Optional, so an explicit
        # null must also fall back to a name derived from the URL.
        if not collection_name:
            # Truncated to 63 characters -- presumably the vector store's
            # collection-name length limit (TODO confirm).
            collection_name = calculate_sha256_string(form_data.url)[:63]

        # overwrite=True: storing again under the same name replaces the
        # previous contents of that collection.
        store_data_in_vector_db(data, collection_name, overwrite=True)
        return {
            "status": True,
            "collection_name": collection_name,
            "filename": form_data.url,
        }
    except Exception as e:
        log.exception(e)
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.DEFAULT(e),
        )
@app.post("/web")
def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
def store_web(form_data: UrlForm, user=Depends(get_current_user)):
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
try:
loader = get_web_loader(form_data.url)

View file

@ -59,3 +59,4 @@ PyJWT[crypto]==2.8.0
black==24.4.2
langfuse==2.27.3
youtube-transcript-api

View file

@ -221,6 +221,37 @@ export const uploadWebToVectorDB = async (token: string, collection_name: string
return res;
};
/**
 * Ask the RAG backend to ingest a YouTube video via `POST {RAG_API_BASE_URL}/youtube`.
 *
 * @param token - Bearer token sent in the `authorization` header.
 * @param url - URL of the YouTube video to ingest.
 * @returns The parsed JSON body of a successful response.
 * @throws The `detail` of the backend's error payload, or the failure's
 *   message when no `detail` is available (e.g. the request never reached
 *   the server).
 */
export const uploadYoutubeTranscriptionToVectorDB = async (token: string, url: string) => {
	let error = null;

	const res = await fetch(`${RAG_API_BASE_URL}/youtube`, {
		method: 'POST',
		headers: {
			Accept: 'application/json',
			'Content-Type': 'application/json',
			authorization: `Bearer ${token}`
		},
		body: JSON.stringify({
			// Send an explicit empty name so the backend derives one from the URL.
			// Omitting the field makes the backend use its default collection name,
			// which every upload would then share and overwrite.
			collection_name: '',
			url: url
		})
	})
		.then(async (res) => {
			if (!res.ok) throw await res.json();
			return res.json();
		})
		.catch((err) => {
			console.log(err);
			// Network-level failures carry no `detail`; fall back to the message so
			// the failure is still reported instead of silently returning null.
			error = err?.detail ?? err?.message ?? 'Failed to upload YouTube transcription';
			return null;
		});

	if (error) {
		throw error;
	}

	return res;
};
export const queryDoc = async (
token: string,
collection_name: string,

View file

@ -6,7 +6,11 @@
import Prompts from './MessageInput/PromptCommands.svelte';
import Suggestions from './MessageInput/Suggestions.svelte';
import { uploadDocToVectorDB, uploadWebToVectorDB } from '$lib/apis/rag';
import {
uploadDocToVectorDB,
uploadWebToVectorDB,
uploadYoutubeTranscriptionToVectorDB
} from '$lib/apis/rag';
import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte';
import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
import Documents from './MessageInput/Documents.svelte';
@ -290,6 +294,34 @@
}
};
/**
 * Attach a YouTube video to the message: add a pending entry to `files`,
 * upload the URL through `uploadYoutubeTranscriptionToVectorDB`, then mark the
 * entry as uploaded or remove it again if the upload fails.
 *
 * @param url - URL of the YouTube video; also used as the entry's display name.
 */
const uploadYoutubeTranscription = async (url) => {
	console.log(url);

	const doc = {
		type: 'doc',
		name: url,
		collection_name: '',
		upload_status: false,
		url: url,
		error: ''
	};

	// Remove exactly this entry. Matching by object identity (not by name)
	// keeps earlier attachments of the same URL in place.
	const removePendingDoc = () => {
		files = files.filter((f) => f !== doc);
	};

	try {
		files = [...files, doc];
		const res = await uploadYoutubeTranscriptionToVectorDB(localStorage.token, url);

		if (res) {
			doc.upload_status = true;
			doc.collection_name = res.collection_name;
			// Reassign so the in-place changes to `doc` are picked up.
			files = files;
		} else {
			// No result and no error: drop the entry rather than leaving it
			// stuck with upload_status === false.
			removePendingDoc();
		}
	} catch (e) {
		// Remove the failed doc from the files array
		removePendingDoc();
		toast.error(e);
	}
};
onMount(() => {
console.log(document.getElementById('sidebar'));
window.setTimeout(() => chatTextAreaElement?.focus(), 0);
@ -428,6 +460,10 @@
<Documents
bind:this={documentsElement}
bind:prompt
on:youtube={(e) => {
console.log(e);
uploadYoutubeTranscription(e.detail);
}}
on:url={(e) => {
console.log(e);
uploadWeb(e.detail);

View file

@ -87,6 +87,17 @@
chatInputElement?.focus();
await tick();
};
/**
 * Confirm a YouTube URL typed into the prompt: dispatch a `youtube` event
 * carrying the URL, replace `prompt` with `removeFirstHashWord(prompt)`, and
 * return focus to the chat textarea.
 *
 * @param url - The URL passed along as the event detail.
 */
const confirmSelectYoutube = async (url) => {
	dispatch('youtube', url);
	prompt = removeFirstHashWord(prompt);

	// The element is looked up before the first tick and focused after it.
	const textarea = document.getElementById('chat-textarea');

	await tick();
	textarea?.focus();
	await tick();
};
</script>
{#if filteredItems.length > 0 || prompt.split(' ')?.at(0)?.substring(1).startsWith('http')}
@ -132,7 +143,30 @@
</button>
{/each}
{#if prompt.split(' ')?.at(0)?.substring(1).startsWith('http')}
{#if prompt.split(' ')?.at(0)?.substring(1).startsWith('https://www.youtube.com')}
<button
class="px-3 py-1.5 rounded-xl w-full text-left bg-gray-100 selected-command-option-button"
type="button"
on:click={() => {
const url = prompt.split(' ')?.at(0)?.substring(1);
if (isValidHttpUrl(url)) {
confirmSelectYoutube(url);
} else {
toast.error(
$i18n.t(
'Oops! Looks like the URL is invalid. Please double-check and try again.'
)
);
}
}}
>
<div class=" font-medium text-black line-clamp-1">
{prompt.split(' ')?.at(0)?.substring(1)}
</div>
<div class=" text-xs text-gray-600 line-clamp-1">{$i18n.t('Youtube')}</div>
</button>
{:else if prompt.split(' ')?.at(0)?.substring(1).startsWith('http')}
<button
class="px-3 py-1.5 rounded-xl w-full text-left bg-gray-100 selected-command-option-button"
type="button"

View file

@ -30,6 +30,7 @@
let page = 1;
let showSettingsModal = false;
let showAddUserModal = false;
let showUserChatsModal = false;
let showEditUserModal = false;
@ -100,12 +101,13 @@
<div class=" mx-auto w-full">
<div class="w-full">
<div class=" flex flex-col justify-center">
<div class=" px-5 pt-3">
<div class=" px-6 pt-4">
<div class=" flex justify-between items-center">
<div class="flex items-center text-2xl font-semibold">Dashboard</div>
<div>
<Tooltip content={$i18n.t('Admin Settings')}>
<button
class="flex items-center space-x-1 px-3 py-1.5 rounded-xl bg-gray-50 hover:bg-gray-100 dark:bg-gray-800 dark:hover:bg-gray-700 transition"
class="flex items-center space-x-1 p-2 md:px-3 md:py-1.5 rounded-xl bg-gray-50 hover:bg-gray-100 dark:bg-gray-800 dark:hover:bg-gray-700 transition"
type="button"
on:click={() => {
showSettingsModal = !showSettingsModal;
@ -124,21 +126,22 @@
/>
</svg>
<div class=" text-xs">{$i18n.t('Admin Settings')}</div>
<div class="hidden md:inline text-xs">{$i18n.t('Admin Settings')}</div>
</button>
</Tooltip>
</div>
</div>
</div>
<div class="px-5 flex text-sm gap-2.5">
<div class="px-6 flex text-sm gap-2.5">
<div class="py-3 border-b font-medium text-gray-100 cursor-pointer">Overview</div>
<!-- <div class="py-3 text-gray-300 cursor-pointer">Users</div> -->
</div>
<hr class=" mb-3 dark:border-gray-800" />
<div class="px-5">
<div class="mt-0.5 mb-3 flex justify-between">
<div class="px-6">
<div class="mt-0.5 mb-3 gap-1 flex flex-col md:flex-row justify-between">
<div class="flex text-lg font-medium px-0.5">
{$i18n.t('All Users')}
<div class="flex self-center w-[1px] h-6 mx-2.5 bg-gray-200 dark:bg-gray-700" />
@ -147,12 +150,32 @@
>
</div>
<div class="">
<div class="flex gap-1">
<input
class=" w-60 rounded-lg py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
class="w-full md:w-60 rounded-xl py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-850 outline-none"
placeholder={$i18n.t('Search')}
bind:value={search}
/>
<div>
<button
class=" px-2 py-2 rounded-xl border border-gray-200 dark:border-gray-600 dark:border-0 hover:bg-gray-100 dark:bg-gray-850 dark:hover:bg-gray-800 transition font-medium text-sm flex items-center space-x-1"
on:click={() => {
showAddUserModal = !showAddUserModal;
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 16 16"
fill="currentColor"
class="w-4 h-4"
>
<path
d="M8.75 3.75a.75.75 0 0 0-1.5 0v3.5h-3.5a.75.75 0 0 0 0 1.5h3.5v3.5a.75.75 0 0 0 1.5 0v-3.5h3.5a.75.75 0 0 0 0-1.5h-3.5v-3.5Z"
/>
</svg>
</button>
</div>
</div>
</div>