Merge pull request #1292 from ddanat-smm/dev

Add htm/html support for RAG documents
This commit is contained in:
Timothy Jaeryang Baek 2024-03-25 23:51:20 -07:00 committed by GitHub
commit a1fc2f4df0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 61 additions and 40 deletions

View file

@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
TextLoader, TextLoader,
PyPDFLoader, PyPDFLoader,
CSVLoader, CSVLoader,
BSHTMLLoader,
Docx2txtLoader, Docx2txtLoader,
UnstructuredEPubLoader, UnstructuredEPubLoader,
UnstructuredWordDocumentLoader, UnstructuredWordDocumentLoader,
@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel):
class StoreWebForm(CollectionNameForm): class StoreWebForm(CollectionNameForm):
url: str url: str
@app.get("/") @app.get("/")
async def get_status(): async def get_status():
return { return {
@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
text_splitter = RecursiveCharacterTextSplitter( text_splitter = RecursiveCharacterTextSplitter(
chunk_size=app.state.CHUNK_SIZE, chunk_size=app.state.CHUNK_SIZE,
chunk_overlap=app.state.CHUNK_OVERLAP, chunk_overlap=app.state.CHUNK_OVERLAP,
add_start_index=True, add_start_index=True,
) )
docs = text_splitter.split_documents(data) docs = text_splitter.split_documents(data)
return store_docs_in_vector_db(docs, collection_name, overwrite)
if len(docs) > 0:
return store_docs_in_vector_db(docs, collection_name, overwrite), None
else:
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
def store_text_in_vector_db( def store_text_in_vector_db(
@ -318,6 +325,7 @@ def store_text_in_vector_db(
def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
texts = [doc.page_content for doc in docs] texts = [doc.page_content for doc in docs]
metadatas = [doc.metadata for doc in docs] metadatas = [doc.metadata for doc in docs]
@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
loader = UnstructuredRSTLoader(file_path, mode="elements") loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext == "xml": elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path) loader = UnstructuredXMLLoader(file_path)
elif file_ext in ["htm", "html"]:
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
elif file_ext == "md": elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path) loader = UnstructuredMarkdownLoader(file_path)
elif file_content_type == "application/epub+zip": elif file_content_type == "application/epub+zip":
@ -452,19 +462,21 @@ def store_doc(
loader, known_type = get_loader(file.filename, file.content_type, file_path) loader, known_type = get_loader(file.filename, file.content_type, file_path)
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name)
if result: try:
return { result = store_data_in_vector_db(data, collection_name)
"status": True,
"collection_name": collection_name, if result:
"filename": filename, return {
"known_type": known_type, "status": True,
} "collection_name": collection_name,
else: "filename": filename,
"known_type": known_type,
}
except Exception as e:
raise HTTPException( raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=ERROR_MESSAGES.DEFAULT(), detail=e,
) )
except Exception as e: except Exception as e:
log.exception(e) log.exception(e)
@ -529,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)):
) )
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name) try:
result = store_data_in_vector_db(data, collection_name)
if result: if result:
sanitized_filename = sanitize_filename(filename) sanitized_filename = sanitize_filename(filename)
doc = Documents.get_doc_by_name(sanitized_filename) doc = Documents.get_doc_by_name(sanitized_filename)
if doc == None: if doc == None:
doc = Documents.insert_new_doc( doc = Documents.insert_new_doc(
user.id, user.id,
DocumentForm( DocumentForm(
**{ **{
"name": sanitized_filename, "name": sanitized_filename,
"title": filename, "title": filename,
"collection_name": collection_name, "collection_name": collection_name,
"filename": filename, "filename": filename,
"content": ( "content": (
json.dumps( json.dumps(
{ {
"tags": list( "tags": list(
map( map(
lambda name: {"name": name}, lambda name: {"name": name},
tags, tags,
)
) )
) }
} )
) if len(tags)
if len(tags) else "{}"
else "{}" ),
), }
} ),
), )
) except Exception as e:
print(e)
pass
except Exception as e: except Exception as e:
log.exception(e) log.exception(e)

View file

@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum):
MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found" MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found"
OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found" OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found"
OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama" OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding."

View file

@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [
'text/plain', 'text/plain',
'text/csv', 'text/csv',
'text/xml', 'text/xml',
'text/html',
'text/x-python', 'text/x-python',
'text/css', 'text/css',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
'h', 'h',
'c', 'c',
'cs', 'cs',
'htm',
'html',
'sql', 'sql',
'log', 'log',
'ini', 'ini',