feat: RAG Markdown (.md) file support

This commit is contained in:
Timothy J. Baek 2024-01-09 15:24:53 -08:00
parent 358f79f533
commit c1ec604f21
4 changed files with 22 additions and 1 deletion

View file

@ -19,6 +19,8 @@ from langchain_community.document_loaders import (
PyPDFLoader, PyPDFLoader,
CSVLoader, CSVLoader,
Docx2txtLoader, Docx2txtLoader,
UnstructuredWordDocumentLoader,
UnstructuredMarkdownLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
@ -140,17 +142,27 @@ def store_doc(
): ):
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
print(file.content_type)
if file.content_type not in [ if file.content_type not in [
"application/pdf", "application/pdf",
"text/plain", "text/plain",
"text/csv", "text/csv",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/octet-stream",
]: ]:
raise HTTPException( raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST, status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
) )
if file.content_type == "application/octet-stream" and file.filename.split(".")[
-1
] not in ["md"]:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
try: try:
filename = file.filename filename = file.filename
file_path = f"{UPLOAD_DIR}/{filename}" file_path = f"{UPLOAD_DIR}/{filename}"
@ -175,6 +187,9 @@ def store_doc(
loader = TextLoader(file_path) loader = TextLoader(file_path)
elif file.content_type == "text/csv": elif file.content_type == "text/csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file.content_type == "application/octet-stream":
if file.filename.split(".")[-1] == "md":
loader = UnstructuredMarkdownLoader(file_path)
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name) result = store_data_in_vector_db(data, collection_name)

View file

@ -22,6 +22,7 @@ chromadb
sentence_transformers sentence_transformers
pypdf pypdf
docx2txt docx2txt
unstructured
PyJWT PyJWT
pyjwt[crypto] pyjwt[crypto]

View file

@ -149,9 +149,13 @@
if (inputFiles && inputFiles.length > 0) { if (inputFiles && inputFiles.length > 0) {
const file = inputFiles[0]; const file = inputFiles[0];
console.log(file, file.name.split('.').at(-1));
if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) { if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
reader.readAsDataURL(file); reader.readAsDataURL(file);
} else if (SUPPORTED_FILE_TYPE.includes(file['type'])) { } else if (
SUPPORTED_FILE_TYPE.includes(file['type']) ||
['md'].includes(file.name.split('.').at(-1))
) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unsupported File Type '${file['type']}'.`);

View file

@ -14,6 +14,7 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
export const SUPPORTED_FILE_TYPE = [ export const SUPPORTED_FILE_TYPE = [
'application/pdf', 'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/markdown',
'text/plain', 'text/plain',
'text/csv' 'text/csv'
]; ];