forked from open-webui/open-webui
Merge pull request #524 from Marclass/rag-arbitrary-files
feat: Allow RAG on XML and arbitrary text files including source code
This commit is contained in:
commit
f079cb6b56
4 changed files with 47 additions and 39 deletions
|
@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
|
||||||
Docx2txtLoader,
|
Docx2txtLoader,
|
||||||
UnstructuredWordDocumentLoader,
|
UnstructuredWordDocumentLoader,
|
||||||
UnstructuredMarkdownLoader,
|
UnstructuredMarkdownLoader,
|
||||||
|
UnstructuredXMLLoader,
|
||||||
)
|
)
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
|
@ -143,26 +144,21 @@ def store_doc(
|
||||||
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
||||||
|
|
||||||
print(file.content_type)
|
print(file.content_type)
|
||||||
if file.content_type not in [
|
|
||||||
"application/pdf",
|
text_xml=["xml"]
|
||||||
"text/plain",
|
octet_markdown=["md"]
|
||||||
"text/csv",
|
known_source_ext=[
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"go", "py", "java", "sh", "bat", "ps1", "cmd", "js",
|
||||||
"application/octet-stream",
|
"css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
|
||||||
]:
|
"pl", "pm", "r", "dart", "dockerfile", "env", "php", "hs",
|
||||||
raise HTTPException(
|
"hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
|
||||||
status_code=status.HTTP_400_BAD_REQUEST,
|
"rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
|
||||||
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
|
]
|
||||||
)
|
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
known_doc_ext=["doc","docx"]
|
||||||
if file.content_type == "application/octet-stream" and file.filename.split(".")[
|
file_ext=file.filename.split(".")[-1].lower()
|
||||||
-1
|
known_type=True
|
||||||
] not in ["md"]:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=status.HTTP_400_BAD_REQUEST,
|
|
||||||
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
filename = file.filename
|
filename = file.filename
|
||||||
file_path = f"{UPLOAD_DIR}/{filename}"
|
file_path = f"{UPLOAD_DIR}/{filename}"
|
||||||
|
@ -176,20 +172,22 @@ def store_doc(
|
||||||
collection_name = calculate_sha256(f)[:63]
|
collection_name = calculate_sha256(f)[:63]
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
if file.content_type == "application/pdf":
|
if file_ext=="pdf":
|
||||||
loader = PyPDFLoader(file_path)
|
loader = PyPDFLoader(file_path)
|
||||||
elif (
|
elif (file.content_type ==docx_type or file_ext in known_doc_ext):
|
||||||
file.content_type
|
|
||||||
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
):
|
|
||||||
loader = Docx2txtLoader(file_path)
|
loader = Docx2txtLoader(file_path)
|
||||||
elif file.content_type == "text/plain":
|
elif file_ext=="csv":
|
||||||
loader = TextLoader(file_path)
|
|
||||||
elif file.content_type == "text/csv":
|
|
||||||
loader = CSVLoader(file_path)
|
loader = CSVLoader(file_path)
|
||||||
elif file.content_type == "application/octet-stream":
|
elif file_ext in text_xml:
|
||||||
if file.filename.split(".")[-1] == "md":
|
loader=UnstructuredXMLLoader(file_path)
|
||||||
loader = UnstructuredMarkdownLoader(file_path)
|
elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
|
||||||
|
loader = TextLoader(file_path)
|
||||||
|
elif file_ext in octet_markdown:
|
||||||
|
loader = UnstructuredMarkdownLoader(file_path)
|
||||||
|
else:
|
||||||
|
loader = TextLoader(file_path)
|
||||||
|
known_type=False
|
||||||
|
|
||||||
|
|
||||||
data = loader.load()
|
data = loader.load()
|
||||||
result = store_data_in_vector_db(data, collection_name)
|
result = store_data_in_vector_db(data, collection_name)
|
||||||
|
@ -199,6 +197,7 @@ def store_doc(
|
||||||
"status": True,
|
"status": True,
|
||||||
"collection_name": collection_name,
|
"collection_name": collection_name,
|
||||||
"filename": filename,
|
"filename": filename,
|
||||||
|
"known_type":known_type,
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
|
|
|
@ -173,7 +173,8 @@
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unsupported File Type '${file['type']}'.`);
|
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
||||||
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
toast.error(`File not found.`);
|
toast.error(`File not found.`);
|
||||||
|
@ -308,8 +309,9 @@
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
filesInputElement.value = '';
|
filesInputElement.value = '';
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unsupported File Type '${file['type']}'.`);
|
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
||||||
inputFiles = null;
|
uploadDoc(file);
|
||||||
|
filesInputElement.value = '';
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
toast.error(`File not found.`);
|
toast.error(`File not found.`);
|
||||||
|
|
|
@ -13,10 +13,15 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
|
||||||
|
|
||||||
export const SUPPORTED_FILE_TYPE = [
|
export const SUPPORTED_FILE_TYPE = [
|
||||||
'application/pdf',
|
'application/pdf',
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
||||||
'text/markdown',
|
|
||||||
'text/plain',
|
'text/plain',
|
||||||
'text/csv'
|
'text/csv',
|
||||||
|
'text/xml',
|
||||||
|
'text/x-python',
|
||||||
|
'text/css',
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
|
'application/octet-stream',
|
||||||
|
'application/x-javascript',
|
||||||
|
'text/markdown',
|
||||||
];
|
];
|
||||||
|
|
||||||
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
||||||
|
|
|
@ -73,7 +73,8 @@
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unsupported File Type '${file['type']}'.`);
|
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
||||||
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
toast.error(`File not found.`);
|
toast.error(`File not found.`);
|
||||||
|
@ -153,7 +154,8 @@
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unsupported File Type '${file['type']}'.`);
|
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
||||||
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
inputFiles = null;
|
inputFiles = null;
|
||||||
|
|
Loading…
Reference in a new issue