forked from open-webui/open-webui
Merge pull request #1292 from ddanat-smm/dev
Add htm/html support for RAG documents
This commit is contained in:
commit
a1fc2f4df0
3 changed files with 61 additions and 40 deletions
|
@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
|
|||
TextLoader,
|
||||
PyPDFLoader,
|
||||
CSVLoader,
|
||||
BSHTMLLoader,
|
||||
Docx2txtLoader,
|
||||
UnstructuredEPubLoader,
|
||||
UnstructuredWordDocumentLoader,
|
||||
|
@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel):
|
|||
class StoreWebForm(CollectionNameForm):
|
||||
url: str
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def get_status():
|
||||
return {
|
||||
|
@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
|
|||
|
||||
|
||||
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
|
||||
|
||||
text_splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=app.state.CHUNK_SIZE,
|
||||
chunk_overlap=app.state.CHUNK_OVERLAP,
|
||||
add_start_index=True,
|
||||
)
|
||||
docs = text_splitter.split_documents(data)
|
||||
return store_docs_in_vector_db(docs, collection_name, overwrite)
|
||||
|
||||
if len(docs) > 0:
|
||||
return store_docs_in_vector_db(docs, collection_name, overwrite), None
|
||||
else:
|
||||
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
||||
|
||||
|
||||
def store_text_in_vector_db(
|
||||
|
@ -318,6 +325,7 @@ def store_text_in_vector_db(
|
|||
|
||||
|
||||
def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
|
||||
|
||||
texts = [doc.page_content for doc in docs]
|
||||
metadatas = [doc.metadata for doc in docs]
|
||||
|
||||
|
@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
|
|||
loader = UnstructuredRSTLoader(file_path, mode="elements")
|
||||
elif file_ext == "xml":
|
||||
loader = UnstructuredXMLLoader(file_path)
|
||||
elif file_ext in ["htm", "html"]:
|
||||
loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
|
||||
elif file_ext == "md":
|
||||
loader = UnstructuredMarkdownLoader(file_path)
|
||||
elif file_content_type == "application/epub+zip":
|
||||
|
@ -452,19 +462,21 @@ def store_doc(
|
|||
|
||||
loader, known_type = get_loader(file.filename, file.content_type, file_path)
|
||||
data = loader.load()
|
||||
result = store_data_in_vector_db(data, collection_name)
|
||||
|
||||
if result:
|
||||
return {
|
||||
"status": True,
|
||||
"collection_name": collection_name,
|
||||
"filename": filename,
|
||||
"known_type": known_type,
|
||||
}
|
||||
else:
|
||||
try:
|
||||
result = store_data_in_vector_db(data, collection_name)
|
||||
|
||||
if result:
|
||||
return {
|
||||
"status": True,
|
||||
"collection_name": collection_name,
|
||||
"filename": filename,
|
||||
"known_type": known_type,
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=ERROR_MESSAGES.DEFAULT(),
|
||||
detail=e,
|
||||
)
|
||||
except Exception as e:
|
||||
log.exception(e)
|
||||
|
@ -529,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)):
|
|||
)
|
||||
data = loader.load()
|
||||
|
||||
result = store_data_in_vector_db(data, collection_name)
|
||||
try:
|
||||
result = store_data_in_vector_db(data, collection_name)
|
||||
|
||||
if result:
|
||||
sanitized_filename = sanitize_filename(filename)
|
||||
doc = Documents.get_doc_by_name(sanitized_filename)
|
||||
if result:
|
||||
sanitized_filename = sanitize_filename(filename)
|
||||
doc = Documents.get_doc_by_name(sanitized_filename)
|
||||
|
||||
if doc == None:
|
||||
doc = Documents.insert_new_doc(
|
||||
user.id,
|
||||
DocumentForm(
|
||||
**{
|
||||
"name": sanitized_filename,
|
||||
"title": filename,
|
||||
"collection_name": collection_name,
|
||||
"filename": filename,
|
||||
"content": (
|
||||
json.dumps(
|
||||
{
|
||||
"tags": list(
|
||||
map(
|
||||
lambda name: {"name": name},
|
||||
tags,
|
||||
if doc == None:
|
||||
doc = Documents.insert_new_doc(
|
||||
user.id,
|
||||
DocumentForm(
|
||||
**{
|
||||
"name": sanitized_filename,
|
||||
"title": filename,
|
||||
"collection_name": collection_name,
|
||||
"filename": filename,
|
||||
"content": (
|
||||
json.dumps(
|
||||
{
|
||||
"tags": list(
|
||||
map(
|
||||
lambda name: {"name": name},
|
||||
tags,
|
||||
)
|
||||
)
|
||||
)
|
||||
}
|
||||
)
|
||||
if len(tags)
|
||||
else "{}"
|
||||
),
|
||||
}
|
||||
),
|
||||
)
|
||||
}
|
||||
)
|
||||
if len(tags)
|
||||
else "{}"
|
||||
),
|
||||
}
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
log.exception(e)
|
||||
|
|
|
@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum):
|
|||
MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found"
|
||||
OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found"
|
||||
OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
|
||||
|
||||
EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding."
|
||||
|
|
|
@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [
|
|||
'text/plain',
|
||||
'text/csv',
|
||||
'text/xml',
|
||||
'text/html',
|
||||
'text/x-python',
|
||||
'text/css',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
|
@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
|
|||
'h',
|
||||
'c',
|
||||
'cs',
|
||||
'htm',
|
||||
'html',
|
||||
'sql',
|
||||
'log',
|
||||
'ini',
|
||||
|
|
Loading…
Reference in a new issue