Add Excel document support

This commit is contained in:
Marclass 2024-01-23 14:03:22 -07:00
parent 7eea3ef313
commit 8bfda730d9
2 changed files with 12 additions and 1 deletion

View file

@@ -23,6 +23,7 @@ from langchain_community.document_loaders import (
UnstructuredMarkdownLoader, UnstructuredMarkdownLoader,
UnstructuredXMLLoader, UnstructuredXMLLoader,
UnstructuredRSTLoader, UnstructuredRSTLoader,
UnstructuredExcelLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
@@ -157,6 +158,9 @@ def store_doc(
] ]
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
known_doc_ext=["doc","docx"] known_doc_ext=["doc","docx"]
excel_types=["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
known_excel_ext=["xls", "xlsx"]
file_ext=file.filename.split(".")[-1].lower() file_ext=file.filename.split(".")[-1].lower()
known_type=True known_type=True
@@ -179,6 +183,8 @@ def store_doc(
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file_ext=="csv": elif file_ext=="csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif (file.content_type in excel_types or file_ext in known_excel_ext):
loader = UnstructuredExcelLoader(file_path)
elif file_ext=="rst": elif file_ext=="rst":
loader = UnstructuredRSTLoader(file_path, mode="elements") loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext in text_xml: elif file_ext in text_xml:

View file

@@ -29,3 +29,8 @@ PyJWT
pyjwt[crypto] pyjwt[crypto]
black black
pandas
openpyxl
pyxlsb
xlrd