From f559068186f333ec6d73a87e693a950d9bf47442 Mon Sep 17 00:00:00 2001 From: Dave Bauman Date: Sat, 13 Jan 2024 08:46:56 -0500 Subject: [PATCH 1/4] feat: Add epub support --- Dockerfile | 5 +++++ backend/apps/rag/main.py | 17 +++++++++++++---- backend/constants.py | 2 ++ backend/requirements.txt | 3 ++- src/lib/components/chat/MessageInput.svelte | 18 ++++++++++++------ src/lib/constants.ts | 1 + 6 files changed, 35 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index fbfc9ae2..0fd14985 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,6 +28,11 @@ ENV WEBUI_JWT_SECRET_KEY "SECRET_KEY" WORKDIR /app +# Install pandoc +RUN apt-get update \ + && apt-get install -y pandoc \ + && rm -rf /var/lib/apt/lists/* + # copy embedding weight from build RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 COPY --from=build /app/onnx.tar.gz /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 65dde89a..aa6906f0 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -19,6 +19,7 @@ from langchain_community.document_loaders import ( PyPDFLoader, CSVLoader, Docx2txtLoader, + UnstructuredEPubLoader, UnstructuredWordDocumentLoader, UnstructuredMarkdownLoader, UnstructuredXMLLoader, @@ -184,6 +185,8 @@ def store_doc( loader = TextLoader(file_path) elif file_ext in octet_markdown: loader = UnstructuredMarkdownLoader(file_path) + elif file.content_type == "application/epub+zip": + loader = UnstructuredEPubLoader(file_path) else: loader = TextLoader(file_path) known_type=False @@ -206,10 +209,16 @@ def store_doc( ) except Exception as e: print(e) - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.DEFAULT(e), - ) + if "No pandoc was found" in str(e): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED, + ) + else: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=ERROR_MESSAGES.DEFAULT(e), + ) @app.get("/reset/db") diff --git a/backend/constants.py b/backend/constants.py index c9bfaec5..580db9c5 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -42,3 +42,5 @@ class ERROR_MESSAGES(str, Enum): USER_NOT_FOUND = "We could not find what you're looking for :/" API_KEY_NOT_FOUND = "Oops! It looks like there's a hiccup. The API key is missing. Please make sure to provide a valid API key to access this feature." MALICIOUS = "Unusual activities detected, please try again in a few minutes." + + PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance." diff --git a/backend/requirements.txt b/backend/requirements.txt index 76a20824..c28fcb68 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -24,8 +24,9 @@ pypdf docx2txt unstructured markdown +pypandoc PyJWT pyjwt[crypto] -black \ No newline at end of file +black diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index ff82d606..adf74561 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -121,13 +121,19 @@ error: '' }; - files = [...files, doc]; - const res = await uploadDocToVectorDB(localStorage.token, '', file); + try { + files = [...files, doc]; + const res = await uploadDocToVectorDB(localStorage.token, '', file); - if (res) { - doc.upload_status = true; - doc.collection_name = res.collection_name; - files = files; + if (res) { + doc.upload_status = true; + doc.collection_name = res.collection_name; + files = files; + } + } catch (e) { + // Remove the failed doc from the files array + files = files.filter((f) => f.name !== file.name); + toast.error(e); } }; diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 1d54dae1..91d9dd59 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -12,6 +12,7 @@ export const WEB_UI_VERSION = 'v1.0.0-alpha-static'; export const REQUIRED_OLLAMA_VERSION = '0.1.16'; export const SUPPORTED_FILE_TYPE = [ + 'application/epub+zip', 'application/pdf', 'text/plain', 'text/csv', From 5188bab560dfba1bff0003f9f088d8b5e17c2379 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Mon, 22 Jan 2024 23:11:50 -0800 Subject: [PATCH 2/4] Update Dockerfile --- Dockerfile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0fd14985..a6260a81 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,11 +28,6 @@ ENV WEBUI_JWT_SECRET_KEY "SECRET_KEY" WORKDIR /app -# Install pandoc -RUN apt-get update \ - && apt-get install -y pandoc \ - && rm -rf /var/lib/apt/lists/* - # copy embedding weight from build RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 COPY --from=build /app/onnx.tar.gz /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 @@ -50,6 +45,13 @@ COPY ./backend/requirements.txt ./requirements.txt RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir RUN pip3 install -r requirements.txt --no-cache-dir + +# Install pandoc +# RUN python -c "import pypandoc; pypandoc.download_pandoc()" +RUN apt-get update \ + && apt-get install -y pandoc \ + && rm -rf /var/lib/apt/lists/* + # RUN python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('all-MiniLM-L6-v2')" COPY ./backend . From 255fbb3c968d57c0c938ac8b31cd0ab3109d9325 Mon Sep 17 00:00:00 2001 From: Dave Bauman Date: Tue, 23 Jan 2024 07:48:27 -0500 Subject: [PATCH 3/4] chore: optimize dockerfile order for caching --- .dockerignore | 3 +++ Dockerfile | 33 ++++++++++++++++----------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/.dockerignore b/.dockerignore index 419f53fb..58cf1f0f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,7 @@ +.github .DS_Store +docs +kubernetes node_modules /.svelte-kit /package diff --git a/Dockerfile b/Dockerfile index a6260a81..4a23e914 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,7 +26,21 @@ ENV OPENAI_API_KEY "" ENV WEBUI_JWT_SECRET_KEY "SECRET_KEY" -WORKDIR /app +WORKDIR /app/backend + +# install python dependencies +COPY ./backend/requirements.txt ./requirements.txt + +RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir +RUN pip3 install -r requirements.txt --no-cache-dir + +# Install pandoc +# RUN python -c "import pypandoc; pypandoc.download_pandoc()" +RUN apt-get update \ + && apt-get install -y pandoc \ + && rm -rf /var/lib/apt/lists/* + +# RUN python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('all-MiniLM-L6-v2')" # copy embedding weight from build RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 @@ -38,22 +52,7 @@ RUN cd /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 &&\ # copy built frontend files COPY --from=build /app/build /app/build -WORKDIR /app/backend - -COPY ./backend/requirements.txt ./requirements.txt - -RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir -RUN pip3 install -r requirements.txt --no-cache-dir - - -# Install pandoc -# RUN python -c "import pypandoc; pypandoc.download_pandoc()" -RUN apt-get update \ - && apt-get install -y pandoc \ - && rm -rf /var/lib/apt/lists/* - -# RUN python -c "from sentence_transformers import SentenceTransformer; model = SentenceTransformer('all-MiniLM-L6-v2')" - +# copy backend files COPY ./backend . CMD [ "sh", "start.sh"] \ No newline at end of file From 3848b2c8edda3de0769d7cc7eae7b2d2b5e20e29 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Wed, 24 Jan 2024 23:51:22 -0800 Subject: [PATCH 4/4] fix: search --- src/lib/components/layout/Sidebar.svelte | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lib/components/layout/Sidebar.svelte b/src/lib/components/layout/Sidebar.svelte index afd4b0e5..7eaf4f61 100644 --- a/src/lib/components/layout/Sidebar.svelte +++ b/src/lib/components/layout/Sidebar.svelte @@ -321,8 +321,9 @@ return true; } else { let title = chat.title.toLowerCase(); + const query = search.toLowerCase(); - if (title.includes(search)) { + if (title.includes(query)) { return true; } else { return false;