Switch to the BeautifulSoup HTML loader (BSHTMLLoader) so that the document title is also captured

This commit is contained in:
Doug Danat 2024-03-25 11:26:18 +01:00
parent 77f4ffddc1
commit c91a5d8b1f

View file

@@ -21,7 +21,7 @@ from langchain_community.document_loaders import (
TextLoader,
PyPDFLoader,
CSVLoader,
- UnstructuredHTMLLoader,
+ BSHTMLLoader,
Docx2txtLoader,
UnstructuredEPubLoader,
UnstructuredWordDocumentLoader,
@@ -404,7 +404,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
elif file_ext == "xml":
loader = UnstructuredXMLLoader(file_path)
elif file_ext in ["htm", "html"]:
- loader = UnstructuredHTMLLoader(file_path)
+ loader = BSHTMLLoader(file_path)
elif file_ext == "md":
loader = UnstructuredMarkdownLoader(file_path)
elif file_content_type == "application/epub+zip":