From 8bfda730d9dd99eae2147bcb8207efe09016d165 Mon Sep 17 00:00:00 2001 From: Marclass Date: Tue, 23 Jan 2024 14:03:22 -0700 Subject: [PATCH] add excel document support --- backend/apps/rag/main.py | 6 ++++++ backend/requirements.txt | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 4ceae2a8..e6bb02a4 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -23,6 +23,7 @@ from langchain_community.document_loaders import ( UnstructuredMarkdownLoader, UnstructuredXMLLoader, UnstructuredRSTLoader, + UnstructuredExcelLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -157,6 +158,9 @@ def store_doc( ] docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" known_doc_ext=["doc","docx"] + excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"] + known_excel_ext=["xls", "xlsx"] + file_ext=file.filename.split(".")[-1].lower() known_type=True @@ -179,6 +183,8 @@ def store_doc( loader = Docx2txtLoader(file_path) elif file_ext=="csv": loader = CSVLoader(file_path) + elif (file.content_type in excel_types or file_ext in known_excel_ext): + loader = UnstructuredExcelLoader(file_path) elif file_ext=="rst": loader = UnstructuredRSTLoader(file_path, mode="elements") elif file_ext in text_xml: diff --git a/backend/requirements.txt b/backend/requirements.txt index 76a20824..07ea0ea3 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -28,4 +28,9 @@ markdown PyJWT pyjwt[crypto] -black \ No newline at end of file +black + +pandas +openpyxl +pyxlsb +xlrd \ No newline at end of file