Merge pull request #1292 from ddanat-smm/dev

Add htm/html support for RAG documents
2024-03-25 23:51:20 -07:00 · 2024-03-25 23:51:20 -07:00 · a1fc2f4df0
commit a1fc2f4df0
parent 4f78acaa5c 3688955c77
3 changed files with 61 additions and 40 deletions
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
    TextLoader,
    PyPDFLoader,
    CSVLoader,
    BSHTMLLoader,
    Docx2txtLoader,
    UnstructuredEPubLoader,
    UnstructuredWordDocumentLoader,
@ -114,6 +115,7 @@ class CollectionNameForm(BaseModel):
 class StoreWebForm(CollectionNameForm):
    # Extends CollectionNameForm with one additional required string field, `url`.
    url: str
@app.get("/")
 async def get_status():
    return {
@ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
 def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
    """Split loaded documents into chunks and store the chunks.

    Args:
        data: Documents handed to the text splitter's ``split_documents``.
        collection_name: Forwarded unchanged to ``store_docs_in_vector_db``.
        overwrite: Forwarded unchanged to ``store_docs_in_vector_db``.

    Returns:
        Whatever ``store_docs_in_vector_db`` returns for the chunks.

    Raises:
        ValueError: With ``ERROR_MESSAGES.EMPTY_CONTENT`` when splitting
            produces no chunks at all.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=app.state.CHUNK_SIZE,
        chunk_overlap=app.state.CHUNK_OVERLAP,
        add_start_index=True,
    )
    docs = text_splitter.split_documents(data)

    # Nothing to index: report it explicitly instead of storing an empty set.
    if not docs:
        raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)

    # Return the result itself. Wrapping it as `(result, None)` yields a
    # non-empty tuple, which is always truthy, so a caller's `if result:`
    # check could never detect a failed store.
    return store_docs_in_vector_db(docs, collection_name, overwrite)
 def store_text_in_vector_db(
@ -318,6 +325,7 @@ def store_text_in_vector_db(
 def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
    texts = [doc.page_content for doc in docs]
    metadatas = [doc.metadata for doc in docs]
@ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
        loader = UnstructuredRSTLoader(file_path, mode="elements")
    elif file_ext == "xml":
        loader = UnstructuredXMLLoader(file_path)
    elif file_ext in ["htm", "html"]:
        loader = BSHTMLLoader(file_path, open_encoding="unicode_escape")
    elif file_ext == "md":
        loader = UnstructuredMarkdownLoader(file_path)
    elif file_content_type == "application/epub+zip":
@ -452,6 +462,8 @@ def store_doc(
        loader, known_type = get_loader(file.filename, file.content_type, file_path)
        data = loader.load()
        try:
            result = store_data_in_vector_db(data, collection_name)
            if result:
@ -461,10 +473,10 @@ def store_doc(
                    "filename": filename,
                    "known_type": known_type,
                }
-        else:
+        except Exception as e:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-                detail=ERROR_MESSAGES.DEFAULT(),
+                detail=e,
            )
    except Exception as e:
        log.exception(e)
@ -529,6 +541,7 @@ def scan_docs_dir(user=Depends(get_admin_user)):
                )
                data = loader.load()
                try:
                    result = store_data_in_vector_db(data, collection_name)
                    if result:
@ -561,6 +574,9 @@ def scan_docs_dir(user=Depends(get_admin_user)):
                                    }
                                ),
                            )
                except Exception as e:
                    print(e)
                    pass
        except Exception as e:
            log.exception(e)
--- a/backend/constants.py
+++ b/backend/constants.py
@ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum):
    MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found"
    OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found"
    OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama"
    EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding."
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@ -22,6 +22,7 @@ export const SUPPORTED_FILE_TYPE = [
 	'text/plain',
 	'text/csv',
 	'text/xml',
 	'text/html',
 	'text/x-python',
 	'text/css',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
@ -50,6 +51,8 @@ export const SUPPORTED_FILE_EXTENSIONS = [
 	'h',
 	'c',
 	'cs',
 	'htm',
 	'html',
 	'sql',
 	'log',
 	'ini',