feat: docx support

This commit is contained in:
Timothy J. Baek 2024-01-07 13:56:01 -08:00
parent 537a7f5f00
commit 57c050326c
3 changed files with 29 additions and 4 deletions

View file

@ -18,6 +18,7 @@ from langchain_community.document_loaders import (
TextLoader, TextLoader,
PyPDFLoader, PyPDFLoader,
CSVLoader, CSVLoader,
Docx2txtLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
@ -135,7 +136,12 @@ def store_doc(
): ):
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
if file.content_type not in ["application/pdf", "text/plain", "text/csv"]: if file.content_type not in [
"application/pdf",
"text/plain",
"text/csv",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]:
raise HTTPException( raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
@ -156,6 +162,11 @@ def store_doc(
if file.content_type == "application/pdf": if file.content_type == "application/pdf":
loader = PyPDFLoader(file_path) loader = PyPDFLoader(file_path)
elif (
file.content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
loader = Docx2txtLoader(file_path)
elif file.content_type == "text/plain": elif file.content_type == "text/plain":
loader = TextLoader(file_path) loader = TextLoader(file_path)
elif file.content_type == "text/csv": elif file.content_type == "text/csv":

View file

@ -16,12 +16,12 @@ aiohttp
peewee peewee
bcrypt bcrypt
langchain langchain
langchain-community langchain-community
chromadb chromadb
sentence_transformers sentence_transformers
pypdf pypdf
docx2txt
PyJWT PyJWT
pyjwt[crypto] pyjwt[crypto]

View file

@ -143,7 +143,14 @@
const file = inputFiles[0]; const file = inputFiles[0];
if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
reader.readAsDataURL(file); reader.readAsDataURL(file);
} else if (['application/pdf', 'text/plain', 'text/csv'].includes(file['type'])) { } else if (
[
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'text/csv'
].includes(file['type'])
) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unsupported File Type '${file['type']}'.`);
@ -249,7 +256,14 @@
const file = inputFiles[0]; const file = inputFiles[0];
if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
reader.readAsDataURL(file); reader.readAsDataURL(file);
} else if (['application/pdf', 'text/plain', 'text/csv'].includes(file['type'])) { } else if (
[
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/plain',
'text/csv'
].includes(file['type'])
) {
uploadDoc(file); uploadDoc(file);
filesInputElement.value = ''; filesInputElement.value = '';
} else { } else {