Add Excel document support

This commit is contained in:
Marclass 2024-01-23 14:03:22 -07:00
parent 7eea3ef313
commit 8bfda730d9
2 changed files with 12 additions and 1 deletion

View file

@@ -23,6 +23,7 @@ from langchain_community.document_loaders import (
UnstructuredMarkdownLoader,
UnstructuredXMLLoader,
UnstructuredRSTLoader,
UnstructuredExcelLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
@@ -157,6 +158,9 @@ def store_doc(
]
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
known_doc_ext=["doc","docx"]
excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
known_excel_ext=["xls", "xlsx"]
file_ext=file.filename.split(".")[-1].lower()
known_type=True
@@ -179,6 +183,8 @@ def store_doc(
loader = Docx2txtLoader(file_path)
elif file_ext=="csv":
loader = CSVLoader(file_path)
elif (file.content_type in excel_types or file_ext in known_excel_ext):
loader = UnstructuredExcelLoader(file_path)
elif file_ext=="rst":
loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext in text_xml:

View file

@@ -28,4 +28,9 @@ markdown
PyJWT
pyjwt[crypto]
black
black
pandas
openpyxl
pyxlsb
xlrd