From 43d8466677e54a2492b58ef7049fc98d93871515 Mon Sep 17 00:00:00 2001 From: Marclass Date: Wed, 17 Jan 2024 00:09:47 -0700 Subject: [PATCH 1/6] feat: Add RAG support for various programming languages Enables RAG for golang, python, java, sh, bat, powershell, cmd, js, css, c/c++/c#, sql, logs, ini, perl, r, dart, docker, env, php, haskell, lua, conf, plsql, ruby, db2, scala, bash, swift, vue, html, xml, and other arbitrary text files. --- backend/apps/rag/main.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index a4776691..11bbbbe8 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -21,6 +21,7 @@ from langchain_community.document_loaders import ( Docx2txtLoader, UnstructuredWordDocumentLoader, UnstructuredMarkdownLoader, + UnstructuredXMLLoader, ) from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.vectorstores import Chroma @@ -147,6 +148,9 @@ def store_doc( "application/pdf", "text/plain", "text/csv", + "text/xml", + "text/html", + "text/x-python", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/octet-stream", ]: @@ -154,10 +158,17 @@ def store_doc( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, ) - - if file.content_type == "application/octet-stream" and file.filename.split(".")[ - -1 - ] not in ["md"]: + text_xml=["text/html", "text/xml"] + octet_markdown=["md"] + octet_plain=[ + "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", + "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", + "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", + "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", + "rb", "rs", "db2", "scala", "bash", "swift", "vue" + ] + file_ext=file.filename.split(".")[-1].lower() + if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): 
raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, @@ -183,13 +194,18 @@ def store_doc( == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ): loader = Docx2txtLoader(file_path) - elif file.content_type == "text/plain": - loader = TextLoader(file_path) + elif file.content_type == "text/csv": loader = CSVLoader(file_path) + elif file.content_type in text_xml: + loader=UnstructuredXMLLoader(file_path) + elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: + loader = TextLoader(file_path) elif file.content_type == "application/octet-stream": - if file.filename.split(".")[-1] == "md": + if file_ext in octet_markdown: loader = UnstructuredMarkdownLoader(file_path) + if file_ext in octet_plain: + loader = TextLoader(file_path) data = loader.load() result = store_data_in_vector_db(data, collection_name) From cf6b3fa48aa11142f5df2ee0dda28391f326514b Mon Sep 17 00:00:00 2001 From: Marclass Date: Wed, 17 Jan 2024 00:34:22 -0700 Subject: [PATCH 2/6] remove html type and add js/css --- backend/apps/rag/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 11bbbbe8..820ec195 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -149,16 +149,17 @@ def store_doc( "text/plain", "text/csv", "text/xml", - "text/html", "text/x-python", + "text/css", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/octet-stream", + "application/x-javascript", ]: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, ) - text_xml=["text/html", "text/xml"] + text_xml=["text/xml"] octet_markdown=["md"] octet_plain=[ "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", @@ -206,6 +207,8 @@ def store_doc( loader = UnstructuredMarkdownLoader(file_path) if file_ext in octet_plain: loader = TextLoader(file_path) + elif 
file.content_type == "application/x-javascript": + loader = TextLoader(file_path) data = loader.load() result = store_data_in_vector_db(data, collection_name) From 9b6378813535291f78d0694e239b71d949823370 Mon Sep 17 00:00:00 2001 From: Marclass Date: Wed, 17 Jan 2024 16:28:52 -0700 Subject: [PATCH 3/6] Update constants.ts include new upload types in front end --- src/lib/constants.ts | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 5d77834b..3d1cfb05 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -13,10 +13,14 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16'; export const SUPPORTED_FILE_TYPE = [ 'application/pdf', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'text/markdown', 'text/plain', - 'text/csv' + 'text/csv', + 'text/xml', + 'text/x-python', + 'text/css', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/octet-stream', + 'application/x-javascript', ]; // Source: https://kit.svelte.dev/docs/modules#$env-static-public From c7cee3ec250131722ad27b242e5ccf40cc3571fd Mon Sep 17 00:00:00 2001 From: Marclass Date: Wed, 17 Jan 2024 16:34:19 -0700 Subject: [PATCH 4/6] Update constants.ts was missing markdown --- src/lib/constants.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/constants.ts b/src/lib/constants.ts index 3d1cfb05..1d54dae1 100644 --- a/src/lib/constants.ts +++ b/src/lib/constants.ts @@ -21,6 +21,7 @@ export const SUPPORTED_FILE_TYPE = [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/octet-stream', 'application/x-javascript', + 'text/markdown', ]; // Source: https://kit.svelte.dev/docs/modules#$env-static-public From 6070e6bcd1cb18dc7387389a2394ec464f1353d9 Mon Sep 17 00:00:00 2001 From: Marclass Date: Wed, 17 Jan 2024 20:10:34 -0700 Subject: [PATCH 5/6] add svelte type to RAG --- backend/apps/rag/main.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 820ec195..36a12e4c 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -166,7 +166,7 @@ def store_doc( "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", - "rb", "rs", "db2", "scala", "bash", "swift", "vue" + "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" ] file_ext=file.filename.split(".")[-1].lower() if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): From aa1d386042cc6e30dea92ddd99684eecf7edc7c1 Mon Sep 17 00:00:00 2001 From: Marclass Date: Thu, 18 Jan 2024 20:41:14 -0700 Subject: [PATCH 6/6] Allow any file to be used for RAG. Changed RAG parser to prefer file extensions over MIME content types. If the type of file is not recognized assume it's a text file. --- backend/apps/rag/main.py | 56 +++++++-------------- src/lib/components/chat/MessageInput.svelte | 8 +-- src/routes/(app)/documents/+page.svelte | 6 ++- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 36a12e4c..65dde89a 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -144,37 +144,21 @@ def store_doc( # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" print(file.content_type) - if file.content_type not in [ - "application/pdf", - "text/plain", - "text/csv", - "text/xml", - "text/x-python", - "text/css", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/octet-stream", - "application/x-javascript", - ]: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, - ) - text_xml=["text/xml"] + + text_xml=["xml"] octet_markdown=["md"] - octet_plain=[ + known_source_ext=[ "go", "py", "java", "sh", "bat", "ps1", 
"cmd", "js", "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" ] + docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" + known_doc_ext=["doc","docx"] file_ext=file.filename.split(".")[-1].lower() - if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, - ) - + known_type=True + try: filename = file.filename file_path = f"{UPLOAD_DIR}/{filename}" @@ -188,27 +172,22 @@ def store_doc( collection_name = calculate_sha256(f)[:63] f.close() - if file.content_type == "application/pdf": + if file_ext=="pdf": loader = PyPDFLoader(file_path) - elif ( - file.content_type - == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ): + elif (file.content_type ==docx_type or file_ext in known_doc_ext): loader = Docx2txtLoader(file_path) - - elif file.content_type == "text/csv": + elif file_ext=="csv": loader = CSVLoader(file_path) - elif file.content_type in text_xml: + elif file_ext in text_xml: loader=UnstructuredXMLLoader(file_path) - elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: + elif file_ext in known_source_ext or file.content_type.find("text/")>=0: loader = TextLoader(file_path) - elif file.content_type == "application/octet-stream": - if file_ext in octet_markdown: - loader = UnstructuredMarkdownLoader(file_path) - if file_ext in octet_plain: - loader = TextLoader(file_path) - elif file.content_type == "application/x-javascript": + elif file_ext in octet_markdown: + loader = UnstructuredMarkdownLoader(file_path) + else: loader = TextLoader(file_path) + known_type=False + data = loader.load() result = store_data_in_vector_db(data, collection_name) 
@@ -218,6 +197,7 @@ def store_doc( "status": True, "collection_name": collection_name, "filename": filename, + "known_type":known_type, } else: raise HTTPException( diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index 4830b98c..ff82d606 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -173,7 +173,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } } else { toast.error(`File not found.`); @@ -308,8 +309,9 @@ uploadDoc(file); filesInputElement.value = ''; } else { - toast.error(`Unsupported File Type '${file['type']}'.`); - inputFiles = null; + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); + filesInputElement.value = ''; } } else { toast.error(`File not found.`); diff --git a/src/routes/(app)/documents/+page.svelte b/src/routes/(app)/documents/+page.svelte index 9a0aa130..597220b5 100644 --- a/src/routes/(app)/documents/+page.svelte +++ b/src/routes/(app)/documents/+page.svelte @@ -73,7 +73,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } } else { toast.error(`File not found.`); @@ -153,7 +154,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } inputFiles = null;