forked from open-webui/open-webui
Merge pull request #466 from baumandm/feat/epub-support
feat: Add epub support
commit 7054f02891
6 changed files with 37 additions and 11 deletions
@@ -45,6 +45,13 @@ COPY ./backend/requirements.txt ./requirements.txt
 RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
 RUN pip3 install -r requirements.txt --no-cache-dir

+
+# Install pandoc
+# RUN python -c "import pypandoc; pypandoc.download_pandoc()"
+RUN apt-get update \
+    && apt-get install -y pandoc \
+    && rm -rf /var/lib/apt/lists/*
+
 # RUN python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('all-MiniLM-L6-v2')"

 COPY ./backend .

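The commented-out line in the hunk above points at an alternative to the apt-get install: letting pypandoc fetch its own pandoc binary. A minimal sketch of that alternative, assuming pypandoc (added to requirements.txt below) is importable; the guard around the download is illustrative and not part of this PR:

    # Sketch: make sure pypandoc can find a pandoc binary.
    # This mirrors the commented-out "pypandoc.download_pandoc()" line,
    # not the apt-get route the Dockerfile actually takes.
    import pypandoc

    try:
        # Raises OSError when no pandoc binary is on PATH.
        print("pandoc version:", pypandoc.get_pandoc_version())
    except OSError:
        # Downloads a pandoc binary into pypandoc's data directory.
        pypandoc.download_pandoc()

The PR opts for the apt-get route, which bakes pandoc into the image at build time instead of downloading it when the app first needs it.
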
@@ -19,6 +19,7 @@ from langchain_community.document_loaders import (
     PyPDFLoader,
     CSVLoader,
     Docx2txtLoader,
+    UnstructuredEPubLoader,
     UnstructuredWordDocumentLoader,
     UnstructuredMarkdownLoader,
     UnstructuredXMLLoader,

@@ -187,6 +188,8 @@ def store_doc(
         loader = TextLoader(file_path)
     elif file_ext in octet_markdown:
         loader = UnstructuredMarkdownLoader(file_path)
+    elif file.content_type == "application/epub+zip":
+        loader = UnstructuredEPubLoader(file_path)
     else:
         loader = TextLoader(file_path)
         known_type=False

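With the new elif branch, uploads whose content type is application/epub+zip are routed to UnstructuredEPubLoader instead of falling through to the plain-text loader. A minimal sketch of what that loader does with a file path, shown standalone rather than as it appears in store_doc; the file path is hypothetical:

    # Sketch: load an epub into LangChain Document objects.
    # UnstructuredEPubLoader goes through the unstructured package, which
    # converts the epub with pandoc -- hence the Dockerfile change above.
    from langchain_community.document_loaders import UnstructuredEPubLoader

    loader = UnstructuredEPubLoader("/tmp/example-book.epub")  # hypothetical path
    docs = loader.load()  # list of Documents with page_content and metadata
    print(len(docs), docs[0].metadata if docs else {})
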
@@ -209,10 +212,16 @@ def store_doc(
         )
     except Exception as e:
         print(e)
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.DEFAULT(e),
-        )
+        if "No pandoc was found" in str(e):
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
+            )
+        else:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail=ERROR_MESSAGES.DEFAULT(e),
+            )


 @app.get("/reset/db")

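The except block now looks for the error pypandoc raises when no pandoc binary is installed and returns the new PANDOC_NOT_INSTALLED message instead of the generic default. A standalone sketch of the same substring check, with a hypothetical helper name and without the FastAPI plumbing:

    # Sketch: map a missing-pandoc failure to the friendlier message,
    # mirroring the check added in store_doc above.
    PANDOC_NOT_INSTALLED = (
        "Pandoc is not installed on the server. "
        "Please contact your administrator for assistance."
    )

    def describe_loader_error(e: Exception) -> str:  # hypothetical helper
        if "No pandoc was found" in str(e):
            return PANDOC_NOT_INSTALLED
        return str(e)  # stand-in for ERROR_MESSAGES.DEFAULT(e)
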
@@ -42,3 +42,5 @@ class ERROR_MESSAGES(str, Enum):
     USER_NOT_FOUND = "We could not find what you're looking for :/"
     API_KEY_NOT_FOUND = "Oops! It looks like there's a hiccup. The API key is missing. Please make sure to provide a valid API key to access this feature."
     MALICIOUS = "Unusual activities detected, please try again in a few minutes."
+
+    PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance."

@@ -24,6 +24,7 @@ pypdf
 docx2txt
 unstructured
 markdown
+pypandoc

 PyJWT
 pyjwt[crypto]

@@ -121,13 +121,19 @@
             error: ''
         };

-        files = [...files, doc];
-        const res = await uploadDocToVectorDB(localStorage.token, '', file);
-
-        if (res) {
-            doc.upload_status = true;
-            doc.collection_name = res.collection_name;
-            files = files;
-        }
+        try {
+            files = [...files, doc];
+            const res = await uploadDocToVectorDB(localStorage.token, '', file);
+
+            if (res) {
+                doc.upload_status = true;
+                doc.collection_name = res.collection_name;
+                files = files;
+            }
+        } catch (e) {
+            // Remove the failed doc from the files array
+            files = files.filter((f) => f.name !== file.name);
+            toast.error(e);
+        }
     };

@@ -12,6 +12,7 @@ export const WEB_UI_VERSION = 'v1.0.0-alpha-static';
 export const REQUIRED_OLLAMA_VERSION = '0.1.16';

 export const SUPPORTED_FILE_TYPE = [
+    'application/epub+zip',
     'application/pdf',
     'text/plain',
     'text/csv',