Merge pull request #778 from open-webui/rag-folder

feat: rag folder scan support
This commit is contained in:
Timothy Jaeryang Baek 2024-02-18 00:34:47 -05:00 committed by GitHub
commit a32ab5afbd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 390 additions and 12 deletions

View file

@ -10,6 +10,8 @@ from fastapi import (
) )
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
import os, shutil import os, shutil
from pathlib import Path
from typing import List from typing import List
# from chromadb.utils import embedding_functions # from chromadb.utils import embedding_functions
@ -28,19 +30,39 @@ from langchain_community.document_loaders import (
UnstructuredExcelLoader, UnstructuredExcelLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Chroma
from pydantic import BaseModel from pydantic import BaseModel
from typing import Optional from typing import Optional
import mimetypes
import uuid import uuid
import json
import time import time
from utils.misc import calculate_sha256, calculate_sha256_string
from apps.web.models.documents import (
Documents,
DocumentForm,
DocumentResponse,
)
from utils.misc import (
calculate_sha256,
calculate_sha256_string,
sanitize_filename,
extract_folders_after_data_docs,
)
from utils.utils import get_current_user, get_admin_user from utils.utils import get_current_user, get_admin_user
from config import UPLOAD_DIR, EMBED_MODEL, CHROMA_CLIENT, CHUNK_SIZE, CHUNK_OVERLAP from config import (
UPLOAD_DIR,
DOCS_DIR,
EMBED_MODEL,
CHROMA_CLIENT,
CHUNK_SIZE,
CHUNK_OVERLAP,
)
from constants import ERROR_MESSAGES from constants import ERROR_MESSAGES
# EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction( # EMBEDDING_FUNC = embedding_functions.SentenceTransformerEmbeddingFunction(
@ -220,8 +242,8 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
) )
def get_loader(file, file_path): def get_loader(filename: str, file_content_type: str, file_path: str):
file_ext = file.filename.split(".")[-1].lower() file_ext = filename.split(".")[-1].lower()
known_type = True known_type = True
known_source_ext = [ known_source_ext = [
@ -279,20 +301,20 @@ def get_loader(file, file_path):
loader = UnstructuredXMLLoader(file_path) loader = UnstructuredXMLLoader(file_path)
elif file_ext == "md": elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path) loader = UnstructuredMarkdownLoader(file_path)
elif file.content_type == "application/epub+zip": elif file_content_type == "application/epub+zip":
loader = UnstructuredEPubLoader(file_path) loader = UnstructuredEPubLoader(file_path)
elif ( elif (
file.content_type file_content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or file_ext in ["doc", "docx"] or file_ext in ["doc", "docx"]
): ):
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file.content_type in [ elif file_content_type in [
"application/vnd.ms-excel", "application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
] or file_ext in ["xls", "xlsx"]: ] or file_ext in ["xls", "xlsx"]:
loader = UnstructuredExcelLoader(file_path) loader = UnstructuredExcelLoader(file_path)
elif file_ext in known_source_ext or file.content_type.find("text/") >= 0: elif file_ext in known_source_ext or file_content_type.find("text/") >= 0:
loader = TextLoader(file_path) loader = TextLoader(file_path)
else: else:
loader = TextLoader(file_path) loader = TextLoader(file_path)
@ -323,7 +345,7 @@ def store_doc(
collection_name = calculate_sha256(f)[:63] collection_name = calculate_sha256(f)[:63]
f.close() f.close()
loader, known_type = get_loader(file, file_path) loader, known_type = get_loader(file.filename, file.content_type, file_path)
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name) result = store_data_in_vector_db(data, collection_name)
@ -353,6 +375,63 @@ def store_doc(
) )
@app.get("/scan")
def scan_docs_dir(user=Depends(get_admin_user)):
try:
for path in Path(DOCS_DIR).rglob("./**/*"):
if path.is_file() and not path.name.startswith("."):
tags = extract_folders_after_data_docs(path)
filename = path.name
file_content_type = mimetypes.guess_type(path)
f = open(path, "rb")
collection_name = calculate_sha256(f)[:63]
f.close()
loader, known_type = get_loader(
filename, file_content_type[0], str(path)
)
data = loader.load()
result = store_data_in_vector_db(data, collection_name)
if result:
sanitized_filename = sanitize_filename(filename)
doc = Documents.get_doc_by_name(sanitized_filename)
if doc == None:
doc = Documents.insert_new_doc(
user.id,
DocumentForm(
**{
"name": sanitized_filename,
"title": filename,
"collection_name": collection_name,
"filename": filename,
"content": (
json.dumps(
{
"tags": list(
map(
lambda name: {"name": name},
tags,
)
)
}
)
if len(tags)
else "{}"
),
}
),
)
except Exception as e:
print(e)
return True
@app.get("/reset/db") @app.get("/reset/db")
def reset_vector_db(user=Depends(get_admin_user)): def reset_vector_db(user=Depends(get_admin_user)):
CHROMA_CLIENT.reset() CHROMA_CLIENT.reset()

View file

@ -96,6 +96,10 @@ async def get_doc_by_name(name: str, user=Depends(get_current_user)):
############################ ############################
class TagItem(BaseModel):
name: str
class TagDocumentForm(BaseModel): class TagDocumentForm(BaseModel):
name: str name: str
tags: List[dict] tags: List[dict]

View file

@ -43,6 +43,14 @@ Path(UPLOAD_DIR).mkdir(parents=True, exist_ok=True)
CACHE_DIR = f"{DATA_DIR}/cache" CACHE_DIR = f"{DATA_DIR}/cache"
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True) Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
####################################
# Docs DIR
####################################
DOCS_DIR = f"{DATA_DIR}/docs"
Path(DOCS_DIR).mkdir(parents=True, exist_ok=True)
#################################### ####################################
# OLLAMA_API_BASE_URL # OLLAMA_API_BASE_URL
#################################### ####################################

View file

@ -1,3 +1,4 @@
from pathlib import Path
import hashlib import hashlib
import re import re
@ -38,3 +39,40 @@ def validate_email_format(email: str) -> bool:
if not re.match(r"[^@]+@[^@]+\.[^@]+", email): if not re.match(r"[^@]+@[^@]+\.[^@]+", email):
return False return False
return True return True
def sanitize_filename(file_name):
# Convert to lowercase
lower_case_file_name = file_name.lower()
# Remove special characters using regular expression
sanitized_file_name = re.sub(r"[^\w\s]", "", lower_case_file_name)
# Replace spaces with dashes
final_file_name = re.sub(r"\s+", "-", sanitized_file_name)
return final_file_name
def extract_folders_after_data_docs(path):
# Convert the path to a Path object if it's not already
path = Path(path)
# Extract parts of the path
parts = path.parts
# Find the index of '/data/docs' in the path
try:
index_data_docs = parts.index("data") + 1
index_docs = parts.index("docs", index_data_docs) + 1
except ValueError:
return []
# Exclude the filename and accumulate folder names
tags = []
folders = parts[index_docs:-1]
for idx, part in enumerate(folders):
tags.append("/".join(folders[: idx + 1]))
return tags

View file

@ -138,6 +138,32 @@ export const queryCollection = async (
return res; return res;
}; };
export const scanDocs = async (token: string) => {
let error = null;
const res = await fetch(`${RAG_API_BASE_URL}/scan`, {
method: 'GET',
headers: {
Accept: 'application/json',
authorization: `Bearer ${token}`
}
})
.then(async (res) => {
if (!res.ok) throw await res.json();
return res.json();
})
.catch((err) => {
error = err.detail;
return null;
});
if (error) {
throw error;
}
return res;
};
export const resetVectorDB = async (token: string) => { export const resetVectorDB = async (token: string) => {
let error = null; let error = null;

View file

@ -366,7 +366,7 @@
{#if message.done} {#if message.done}
<div <div
class=" flex justify-start space-x-1 -mt-1 overflow-x-auto buttons text-gray-700 dark:text-gray-500" class=" flex justify-start space-x-1 overflow-x-auto buttons text-gray-700 dark:text-gray-500"
> >
{#if siblings.length > 1} {#if siblings.length > 1}
<div class="flex self-center min-w-fit"> <div class="flex self-center min-w-fit">

View file

@ -0,0 +1,106 @@
<script lang="ts">
import { getDocs } from '$lib/apis/documents';
import { scanDocs } from '$lib/apis/rag';
import { documents } from '$lib/stores';
import { onMount } from 'svelte';
import toast from 'svelte-french-toast';
export let saveHandler: Function;
let loading = false;
const scanHandler = async () => {
loading = true;
const res = await scanDocs(localStorage.token);
loading = false;
if (res) {
await documents.set(await getDocs(localStorage.token));
toast.success('Scan complete!');
}
};
onMount(async () => {});
</script>
<form
class="flex flex-col h-full justify-between space-y-3 text-sm"
on:submit|preventDefault={() => {
// console.log('submit');
saveHandler();
}}
>
<div class=" space-y-3 pr-1.5 overflow-y-scroll max-h-80">
<div>
<div class=" mb-2 text-sm font-medium">General Settings</div>
<div class=" flex w-full justify-between">
<div class=" self-center text-xs font-medium">Scan for documents from '/data/docs'</div>
<button
class=" self-center text-xs p-1 px-3 bg-gray-100 dark:bg-gray-800 dark:hover:bg-gray-700 rounded flex flex-row space-x-1 items-center {loading
? ' cursor-not-allowed'
: ''}"
on:click={() => {
scanHandler();
console.log('check');
}}
type="button"
disabled={loading}
>
<div class="self-center font-medium">Scan</div>
<!-- <svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 16 16"
fill="currentColor"
class="w-3 h-3"
>
<path
fill-rule="evenodd"
d="M13.836 2.477a.75.75 0 0 1 .75.75v3.182a.75.75 0 0 1-.75.75h-3.182a.75.75 0 0 1 0-1.5h1.37l-.84-.841a4.5 4.5 0 0 0-7.08.932.75.75 0 0 1-1.3-.75 6 6 0 0 1 9.44-1.242l.842.84V3.227a.75.75 0 0 1 .75-.75Zm-.911 7.5A.75.75 0 0 1 13.199 11a6 6 0 0 1-9.44 1.241l-.84-.84v1.371a.75.75 0 0 1-1.5 0V9.591a.75.75 0 0 1 .75-.75H5.35a.75.75 0 0 1 0 1.5H3.98l.841.841a4.5 4.5 0 0 0 7.08-.932.75.75 0 0 1 1.025-.273Z"
clip-rule="evenodd"
/>
</svg> -->
{#if loading}
<div class="ml-3 self-center">
<svg
class=" w-3 h-3"
viewBox="0 0 24 24"
fill="currentColor"
xmlns="http://www.w3.org/2000/svg"
><style>
.spinner_ajPY {
transform-origin: center;
animation: spinner_AtaB 0.75s infinite linear;
}
@keyframes spinner_AtaB {
100% {
transform: rotate(360deg);
}
}
</style><path
d="M12,1A11,11,0,1,0,23,12,11,11,0,0,0,12,1Zm0,19a8,8,0,1,1,8-8A8,8,0,0,1,12,20Z"
opacity=".25"
/><path
d="M10.14,1.16a11,11,0,0,0-9,8.92A1.59,1.59,0,0,0,2.46,12,1.52,1.52,0,0,0,4.11,10.7a8,8,0,0,1,6.66-6.61A1.42,1.42,0,0,0,12,2.69h0A1.57,1.57,0,0,0,10.14,1.16Z"
class="spinner_ajPY"
/></svg
>
</div>
{/if}
</button>
</div>
</div>
</div>
<!-- <div class="flex justify-end pt-3 text-sm font-medium">
<button
class=" px-4 py-2 bg-emerald-600 hover:bg-emerald-700 text-gray-100 transition rounded"
type="submit"
>
Save
</button>
</div> -->
</form>

View file

@ -0,0 +1,86 @@
<script>
import Modal from '../common/Modal.svelte';
import General from './Settings/General.svelte';
export let show = false;
let selectedTab = 'general';
</script>
<Modal bind:show>
<div>
<div class=" flex justify-between dark:text-gray-300 px-5 py-4">
<div class=" text-lg font-medium self-center">Document Settings</div>
<button
class="self-center"
on:click={() => {
show = false;
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 20 20"
fill="currentColor"
class="w-5 h-5"
>
<path
d="M6.28 5.22a.75.75 0 00-1.06 1.06L8.94 10l-3.72 3.72a.75.75 0 101.06 1.06L10 11.06l3.72 3.72a.75.75 0 101.06-1.06L11.06 10l3.72-3.72a.75.75 0 00-1.06-1.06L10 8.94 6.28 5.22z"
/>
</svg>
</button>
</div>
<hr class=" dark:border-gray-800" />
<div class="flex flex-col md:flex-row w-full p-4 md:space-x-4">
<div
class="tabs flex flex-row overflow-x-auto space-x-1 md:space-x-0 md:space-y-1 md:flex-col flex-1 md:flex-none md:w-40 dark:text-gray-200 text-xs text-left mb-3 md:mb-0"
>
<button
class="px-2.5 py-2.5 min-w-fit rounded-lg flex-1 md:flex-none flex text-right transition {selectedTab ===
'general'
? 'bg-gray-200 dark:bg-gray-700'
: ' hover:bg-gray-300 dark:hover:bg-gray-800'}"
on:click={() => {
selectedTab = 'general';
}}
>
<div class=" self-center mr-2">
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 16 16"
fill="currentColor"
class="w-4 h-4"
>
<path
fill-rule="evenodd"
d="M6.955 1.45A.5.5 0 0 1 7.452 1h1.096a.5.5 0 0 1 .497.45l.17 1.699c.484.12.94.312 1.356.562l1.321-1.081a.5.5 0 0 1 .67.033l.774.775a.5.5 0 0 1 .034.67l-1.08 1.32c.25.417.44.873.561 1.357l1.699.17a.5.5 0 0 1 .45.497v1.096a.5.5 0 0 1-.45.497l-1.699.17c-.12.484-.312.94-.562 1.356l1.082 1.322a.5.5 0 0 1-.034.67l-.774.774a.5.5 0 0 1-.67.033l-1.322-1.08c-.416.25-.872.44-1.356.561l-.17 1.699a.5.5 0 0 1-.497.45H7.452a.5.5 0 0 1-.497-.45l-.17-1.699a4.973 4.973 0 0 1-1.356-.562L4.108 13.37a.5.5 0 0 1-.67-.033l-.774-.775a.5.5 0 0 1-.034-.67l1.08-1.32a4.971 4.971 0 0 1-.561-1.357l-1.699-.17A.5.5 0 0 1 1 8.548V7.452a.5.5 0 0 1 .45-.497l1.699-.17c.12-.484.312-.94.562-1.356L2.629 4.107a.5.5 0 0 1 .034-.67l.774-.774a.5.5 0 0 1 .67-.033L5.43 3.71a4.97 4.97 0 0 1 1.356-.561l.17-1.699ZM6 8c0 .538.212 1.026.558 1.385l.057.057a2 2 0 0 0 2.828-2.828l-.058-.056A2 2 0 0 0 6 8Z"
clip-rule="evenodd"
/>
</svg>
</div>
<div class=" self-center">General</div>
</button>
</div>
<div class="flex-1 md:min-h-[380px]">
{#if selectedTab === 'general'}
<General
saveHandler={() => {
show = false;
}}
/>
<!-- <General
saveHandler={() => {
show = false;
}}
/> -->
<!-- {:else if selectedTab === 'users'}
<Users
saveHandler={() => {
show = false;
}}
/> -->
{/if}
</div>
</div>
</div>
</Modal>

View file

@ -13,6 +13,7 @@
import EditDocModal from '$lib/components/documents/EditDocModal.svelte'; import EditDocModal from '$lib/components/documents/EditDocModal.svelte';
import AddFilesPlaceholder from '$lib/components/AddFilesPlaceholder.svelte'; import AddFilesPlaceholder from '$lib/components/AddFilesPlaceholder.svelte';
import SettingsModal from '$lib/components/documents/SettingsModal.svelte';
let importFiles = ''; let importFiles = '';
let inputFiles = ''; let inputFiles = '';
@ -20,6 +21,7 @@
let tags = []; let tags = [];
let showSettingsModal = false;
let showEditDocModal = false; let showEditDocModal = false;
let selectedDoc; let selectedDoc;
let selectedTag = ''; let selectedTag = '';
@ -179,11 +181,38 @@
}} }}
/> />
<SettingsModal bind:show={showSettingsModal} />
<div class="min-h-screen max-h-[100dvh] w-full flex justify-center dark:text-white"> <div class="min-h-screen max-h-[100dvh] w-full flex justify-center dark:text-white">
<div class=" py-2.5 flex flex-col justify-between w-full overflow-y-auto"> <div class=" py-2.5 flex flex-col justify-between w-full overflow-y-auto">
<div class="max-w-2xl mx-auto w-full px-3 md:px-0 my-10"> <div class="max-w-2xl mx-auto w-full px-3 md:px-0 my-10">
<div class="mb-6 flex justify-between items-center"> <div class="mb-6 flex justify-between items-center">
<div class=" text-2xl font-semibold self-center">My Documents</div> <div class=" text-2xl font-semibold self-center">My Documents</div>
<div>
<button
class="flex items-center space-x-1 border border-gray-200 dark:border-gray-600 px-3 py-1 rounded-lg"
type="button"
on:click={() => {
showSettingsModal = !showSettingsModal;
}}
>
<svg
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 16 16"
fill="currentColor"
class="w-4 h-4"
>
<path
fill-rule="evenodd"
d="M6.955 1.45A.5.5 0 0 1 7.452 1h1.096a.5.5 0 0 1 .497.45l.17 1.699c.484.12.94.312 1.356.562l1.321-1.081a.5.5 0 0 1 .67.033l.774.775a.5.5 0 0 1 .034.67l-1.08 1.32c.25.417.44.873.561 1.357l1.699.17a.5.5 0 0 1 .45.497v1.096a.5.5 0 0 1-.45.497l-1.699.17c-.12.484-.312.94-.562 1.356l1.082 1.322a.5.5 0 0 1-.034.67l-.774.774a.5.5 0 0 1-.67.033l-1.322-1.08c-.416.25-.872.44-1.356.561l-.17 1.699a.5.5 0 0 1-.497.45H7.452a.5.5 0 0 1-.497-.45l-.17-1.699a4.973 4.973 0 0 1-1.356-.562L4.108 13.37a.5.5 0 0 1-.67-.033l-.774-.775a.5.5 0 0 1-.034-.67l1.08-1.32a4.971 4.971 0 0 1-.561-1.357l-1.699-.17A.5.5 0 0 1 1 8.548V7.452a.5.5 0 0 1 .45-.497l1.699-.17c.12-.484.312-.94.562-1.356L2.629 4.107a.5.5 0 0 1 .034-.67l.774-.774a.5.5 0 0 1 .67-.033L5.43 3.71a4.97 4.97 0 0 1 1.356-.561l.17-1.699ZM6 8c0 .538.212 1.026.558 1.385l.057.057a2 2 0 0 0 2.828-2.828l-.058-.056A2 2 0 0 0 6 8Z"
clip-rule="evenodd"
/>
</svg>
<div class=" text-xs">Document Settings</div>
</button>
</div>
</div> </div>
<div class=" flex w-full space-x-2"> <div class=" flex w-full space-x-2">
@ -419,6 +448,8 @@
</button> </button>
</div> </div>
</div> </div>
<div class=" my-2.5" />
{/each} {/each}
{#if $documents.length > 0} {#if $documents.length > 0}