forked from open-webui/open-webui
feat: rag md support
This commit is contained in:
parent
358f79f533
commit
c1ec604f21
4 changed files with 22 additions and 1 deletions
|
@ -19,6 +19,8 @@ from langchain_community.document_loaders import (
|
||||||
PyPDFLoader,
|
PyPDFLoader,
|
||||||
CSVLoader,
|
CSVLoader,
|
||||||
Docx2txtLoader,
|
Docx2txtLoader,
|
||||||
|
UnstructuredWordDocumentLoader,
|
||||||
|
UnstructuredMarkdownLoader,
|
||||||
)
|
)
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
|
@ -140,17 +142,27 @@ def store_doc(
|
||||||
):
|
):
|
||||||
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
||||||
|
|
||||||
|
print(file.content_type)
|
||||||
if file.content_type not in [
|
if file.content_type not in [
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
"text/plain",
|
"text/plain",
|
||||||
"text/csv",
|
"text/csv",
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
"application/octet-stream",
|
||||||
]:
|
]:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=status.HTTP_400_BAD_REQUEST,
|
status_code=status.HTTP_400_BAD_REQUEST,
|
||||||
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
|
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if file.content_type == "application/octet-stream" and file.filename.split(".")[
|
||||||
|
-1
|
||||||
|
] not in ["md"]:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
filename = file.filename
|
filename = file.filename
|
||||||
file_path = f"{UPLOAD_DIR}/{filename}"
|
file_path = f"{UPLOAD_DIR}/{filename}"
|
||||||
|
@ -175,6 +187,9 @@ def store_doc(
|
||||||
loader = TextLoader(file_path)
|
loader = TextLoader(file_path)
|
||||||
elif file.content_type == "text/csv":
|
elif file.content_type == "text/csv":
|
||||||
loader = CSVLoader(file_path)
|
loader = CSVLoader(file_path)
|
||||||
|
elif file.content_type == "application/octet-stream":
|
||||||
|
if file.filename.split(".")[-1] == "md":
|
||||||
|
loader = UnstructuredMarkdownLoader(file_path)
|
||||||
|
|
||||||
data = loader.load()
|
data = loader.load()
|
||||||
result = store_data_in_vector_db(data, collection_name)
|
result = store_data_in_vector_db(data, collection_name)
|
||||||
|
|
|
@ -22,6 +22,7 @@ chromadb
|
||||||
sentence_transformers
|
sentence_transformers
|
||||||
pypdf
|
pypdf
|
||||||
docx2txt
|
docx2txt
|
||||||
|
unstructured
|
||||||
|
|
||||||
PyJWT
|
PyJWT
|
||||||
pyjwt[crypto]
|
pyjwt[crypto]
|
||||||
|
|
|
@ -149,9 +149,13 @@
|
||||||
|
|
||||||
if (inputFiles && inputFiles.length > 0) {
|
if (inputFiles && inputFiles.length > 0) {
|
||||||
const file = inputFiles[0];
|
const file = inputFiles[0];
|
||||||
|
console.log(file, file.name.split('.').at(-1));
|
||||||
if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
|
if (['image/gif', 'image/jpeg', 'image/png'].includes(file['type'])) {
|
||||||
reader.readAsDataURL(file);
|
reader.readAsDataURL(file);
|
||||||
} else if (SUPPORTED_FILE_TYPE.includes(file['type'])) {
|
} else if (
|
||||||
|
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
||||||
|
['md'].includes(file.name.split('.').at(-1))
|
||||||
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unsupported File Type '${file['type']}'.`);
|
toast.error(`Unsupported File Type '${file['type']}'.`);
|
||||||
|
|
|
@ -14,6 +14,7 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
|
||||||
export const SUPPORTED_FILE_TYPE = [
|
export const SUPPORTED_FILE_TYPE = [
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'text/markdown',
|
||||||
'text/plain',
|
'text/plain',
|
||||||
'text/csv'
|
'text/csv'
|
||||||
];
|
];
|
||||||
|
|
Loading…
Reference in a new issue