feat: web rag support

This commit is contained in:
Timothy J. Baek 2024-01-26 22:17:28 -08:00
parent 5e672d9f79
commit 28226a6f97
5 changed files with 131 additions and 33 deletions

View file

@ -37,7 +37,7 @@ from typing import Optional
import uuid
import time
from utils.misc import calculate_sha256
from utils.misc import calculate_sha256, calculate_sha256_string
from utils.utils import get_current_user
from config import UPLOAD_DIR, EMBED_MODEL, CHROMA_CLIENT, CHUNK_SIZE, CHUNK_OVERLAP
from constants import ERROR_MESSAGES
@ -124,10 +124,15 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
try:
loader = WebBaseLoader(form_data.url)
data = loader.load()
store_data_in_vector_db(data, form_data.collection_name)
collection_name = form_data.collection_name
if collection_name == "":
collection_name = calculate_sha256_string(form_data.url)[:63]
store_data_in_vector_db(data, collection_name)
return {
"status": True,
"collection_name": form_data.collection_name,
"collection_name": collection_name,
"filename": form_data.url,
}
except Exception as e:

View file

@ -24,6 +24,16 @@ def calculate_sha256(file):
return sha256.hexdigest()
def calculate_sha256_string(string: str) -> str:
    """Return the SHA-256 hex digest of *string*.

    The text is encoded as UTF-8 before being hashed.

    Args:
        string: Text to hash.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    return hashlib.sha256(string.encode("utf-8")).hexdigest()
def validate_email_format(email: str) -> bool:
if not re.match(r"[^@]+@[^@]+\.[^@]+", email):
return False