Merge pull request #524 from Marclass/rag-arbitrary-files

feat: Allow RAG on XML and arbitrary text files including source code
2024-01-19 00:09:04 -08:00 · 2024-01-19 00:09:04 -08:00 · f079cb6b56
commit f079cb6b56
parent ff33aa37ae aa1d386042
4 changed files with 47 additions and 39 deletions
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
    Docx2txtLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredMarkdownLoader,
    UnstructuredXMLLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -143,26 +144,21 @@ def store_doc(
    # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
    print(file.content_type)
-    if file.content_type not in [
+    
-        "application/pdf",
+    text_xml=["xml"]
-        "text/plain",
+    octet_markdown=["md"]
-        "text/csv",
+    known_source_ext=[
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", 
-        "application/octet-stream",
+        "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
-    ]:
+        "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs",
-        raise HTTPException(
+        "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
-            status_code=status.HTTP_400_BAD_REQUEST,
+        "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
+        ]
-        )
+    docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-
+    known_doc_ext=["doc","docx"]
-    if file.content_type == "application/octet-stream" and file.filename.split(".")[
+    file_ext=file.filename.split(".")[-1].lower()
-        -1
+    known_type=True
-    ] not in ["md"]:
+    
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
        )
    try:
        filename = file.filename
        file_path = f"{UPLOAD_DIR}/{filename}"
@@ -176,20 +172,22 @@ def store_doc(
            collection_name = calculate_sha256(f)[:63]
        f.close()
-        if file.content_type == "application/pdf":
+        if file_ext=="pdf":
            loader = PyPDFLoader(file_path)
-        elif (
+        elif (file.content_type ==docx_type or file_ext in known_doc_ext):
            file.content_type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            loader = Docx2txtLoader(file_path)
-        elif file.content_type == "text/plain":
+        elif file_ext=="csv":
            loader = TextLoader(file_path)
        elif file.content_type == "text/csv":
            loader = CSVLoader(file_path)
-        elif file.content_type == "application/octet-stream":
+        elif file_ext in text_xml:
-            if file.filename.split(".")[-1] == "md":
+            loader=UnstructuredXMLLoader(file_path)
-                loader = UnstructuredMarkdownLoader(file_path)
+        elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
            loader = TextLoader(file_path)
        elif file_ext in octet_markdown:
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            loader = TextLoader(file_path)
            known_type=False
        data = loader.load()
        result = store_data_in_vector_db(data, collection_name)
@@ -199,6 +197,7 @@ def store_doc(
                "status": True,
                "collection_name": collection_name,
                "filename": filename,
                "known_type":known_type,
            }
        else:
            raise HTTPException(
--- a/src/lib/components/chat/MessageInput.svelte
+++ b/src/lib/components/chat/MessageInput.svelte
@@ -173,7 +173,8 @@
 					) {
 						uploadDoc(file);
 					} else {
-						toast.error(`Unsupported File Type '${file['type']}'.`);
+						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
 						uploadDoc(file);
 					}
 				} else {
 					toast.error(`File not found.`);
@@ -308,8 +309,9 @@
 								uploadDoc(file);
 								filesInputElement.value = '';
 							} else {
-								toast.error(`Unsupported File Type '${file['type']}'.`);
+								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
-								inputFiles = null;
+								uploadDoc(file);
 								filesInputElement.value = '';
 							}
 						} else {
 							toast.error(`File not found.`);
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -13,10 +13,15 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
 export const SUPPORTED_FILE_TYPE = [
 	'application/pdf',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 	'text/markdown',
 	'text/plain',
-	'text/csv'
+	'text/csv',
 	'text/xml',
 	'text/x-python',
 	'text/css',
 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 	'application/octet-stream',
 	'application/x-javascript',
 	'text/markdown',
 ];
 // Source: https://kit.svelte.dev/docs/modules#$env-static-public
--- a/src/routes/(app)/documents/+page.svelte
+++ b/src/routes/(app)/documents/+page.svelte
@@ -73,7 +73,8 @@
 				) {
 					uploadDoc(file);
 				} else {
-					toast.error(`Unsupported File Type '${file['type']}'.`);
+					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
 					uploadDoc(file);
 				}
 			} else {
 				toast.error(`File not found.`);
@@ -153,7 +154,8 @@
 						) {
 							uploadDoc(file);
 						} else {
-							toast.error(`Unsupported File Type '${file['type']}'.`);
+							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
 							uploadDoc(file);
 						}
 						inputFiles = null;