Merge pull request #524 from Marclass/rag-arbitrary-files

feat: Allow RAG on XML and arbitrary text files including source code
This commit is contained in:
Timothy Jaeryang Baek 2024-01-19 00:09:04 -08:00 committed by GitHub
commit f079cb6b56
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 47 additions and 39 deletions

View file

@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
Docx2txtLoader, Docx2txtLoader,
UnstructuredWordDocumentLoader, UnstructuredWordDocumentLoader,
UnstructuredMarkdownLoader, UnstructuredMarkdownLoader,
UnstructuredXMLLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
@ -143,25 +144,20 @@ def store_doc(
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
print(file.content_type) print(file.content_type)
if file.content_type not in [
"application/pdf",
"text/plain",
"text/csv",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/octet-stream",
]:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
)
if file.content_type == "application/octet-stream" and file.filename.split(".")[ text_xml=["xml"]
-1 octet_markdown=["md"]
] not in ["md"]: known_source_ext=[
raise HTTPException( "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",
status_code=status.HTTP_400_BAD_REQUEST, "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
) "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
"rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
]
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
known_doc_ext=["doc","docx"]
file_ext=file.filename.split(".")[-1].lower()
known_type=True
try: try:
filename = file.filename filename = file.filename
@ -176,20 +172,22 @@ def store_doc(
collection_name = calculate_sha256(f)[:63] collection_name = calculate_sha256(f)[:63]
f.close() f.close()
if file.content_type == "application/pdf": if file_ext=="pdf":
loader = PyPDFLoader(file_path) loader = PyPDFLoader(file_path)
elif ( elif (file.content_type ==docx_type or file_ext in known_doc_ext):
file.content_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file.content_type == "text/plain": elif file_ext=="csv":
loader = TextLoader(file_path)
elif file.content_type == "text/csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file.content_type == "application/octet-stream": elif file_ext in text_xml:
if file.filename.split(".")[-1] == "md": loader=UnstructuredXMLLoader(file_path)
loader = UnstructuredMarkdownLoader(file_path) elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
loader = TextLoader(file_path)
elif file_ext in octet_markdown:
loader = UnstructuredMarkdownLoader(file_path)
else:
loader = TextLoader(file_path)
known_type=False
data = loader.load() data = loader.load()
result = store_data_in_vector_db(data, collection_name) result = store_data_in_vector_db(data, collection_name)
@ -199,6 +197,7 @@ def store_doc(
"status": True, "status": True,
"collection_name": collection_name, "collection_name": collection_name,
"filename": filename, "filename": filename,
"known_type":known_type,
} }
else: else:
raise HTTPException( raise HTTPException(

View file

@ -173,7 +173,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);
@ -308,8 +309,9 @@
uploadDoc(file); uploadDoc(file);
filesInputElement.value = ''; filesInputElement.value = '';
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
inputFiles = null; uploadDoc(file);
filesInputElement.value = '';
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);

View file

@ -13,10 +13,15 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
export const SUPPORTED_FILE_TYPE = [ export const SUPPORTED_FILE_TYPE = [
'application/pdf', 'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'text/markdown',
'text/plain', 'text/plain',
'text/csv' 'text/csv',
'text/xml',
'text/x-python',
'text/css',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/octet-stream',
'application/x-javascript',
'text/markdown',
]; ];
// Source: https://kit.svelte.dev/docs/modules#$env-static-public // Source: https://kit.svelte.dev/docs/modules#$env-static-public

View file

@ -73,7 +73,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
} else { } else {
toast.error(`File not found.`); toast.error(`File not found.`);
@ -153,7 +154,8 @@
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unsupported File Type '${file['type']}'.`); toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
uploadDoc(file);
} }
inputFiles = null; inputFiles = null;