From 57c050326c31684c511995d4863a064963fddd55 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Sun, 7 Jan 2024 13:56:01 -0800 Subject: [PATCH] feat: docx support --- backend/apps/rag/main.py | 13 ++++++++++++- backend/requirements.txt | 2 +- src/lib/components/chat/MessageInput.svelte | 18 ++++++++++++++++-- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 4cde679e..6e4f5c09 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -18,6 +18,7 @@ from langchain_community.document_loaders import ( TextLoader, PyPDFLoader, CSVLoader, + Docx2txtLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -135,7 +136,12 @@ def store_doc( ): # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" - if file.content_type not in ["application/pdf", "text/plain", "text/csv"]: + if file.content_type not in [ + "application/pdf", + "text/plain", + "text/csv", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ]: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, @@ -156,6 +162,11 @@ def store_doc( if file.content_type == "application/pdf": loader = PyPDFLoader(file_path) + elif ( + file.content_type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ): + loader = Docx2txtLoader(file_path) elif file.content_type == "text/plain": loader = TextLoader(file_path) elif file.content_type == "text/csv": diff --git a/backend/requirements.txt b/backend/requirements.txt index d3355b5f..06af0efc 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -16,12 +16,12 @@ aiohttp peewee bcrypt - langchain langchain-community chromadb sentence_transformers pypdf +docx2txt PyJWT pyjwt[crypto] diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index f76a74d6..54ccc8f4 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -143,7 +143,14 @@ const file = inputFiles[0]; if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { reader.readAsDataURL(file); - } else if (['application/pdf', 'text/plain', 'text/csv'].includes(file['type'])) { + } else if ( + [ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'text/plain', + 'text/csv' + ].includes(file['type']) + ) { uploadDoc(file); } else { toast.error(`Unsupported File Type '${file['type']}'.`); @@ -249,7 +256,14 @@ const file = inputFiles[0]; if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { reader.readAsDataURL(file); - } else if (['application/pdf', 'text/plain', 'text/csv'].includes(file['type'])) { + } else if ( + [ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'text/plain', + 'text/csv' + ].includes(file['type']) + ) { uploadDoc(file); filesInputElement.value = ''; } else {