From aa1d386042cc6e30dea92ddd99684eecf7edc7c1 Mon Sep 17 00:00:00 2001 From: Marclass Date: Thu, 18 Jan 2024 20:41:14 -0700 Subject: [PATCH] Allow any file to be used for RAG. Changed RAG parser to prefer file extensions over MIME content types: Markdown is matched by extension before the generic text/* fallback, and a missing content type is tolerated instead of raising. If the type of file is not recognized, assume it's a text file and report known_type=False in the response. --- backend/apps/rag/main.py | 56 +++++++-------------- src/lib/components/chat/MessageInput.svelte | 8 +-- src/routes/(app)/documents/+page.svelte | 6 ++- 3 files changed, 27 insertions(+), 43 deletions(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 36a12e4c..65dde89a 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -144,37 +144,21 @@ def store_doc( # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" print(file.content_type) - if file.content_type not in [ - "application/pdf", - "text/plain", - "text/csv", - "text/xml", - "text/x-python", - "text/css", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/octet-stream", - "application/x-javascript", - ]: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, - ) - text_xml=["text/xml"] + + text_xml=["xml"] octet_markdown=["md"] - octet_plain=[ + known_source_ext=[ "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" ] + docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" + known_doc_ext=["doc","docx"] file_ext=file.filename.split(".")[-1].lower() - if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, - ) - + known_type=True + try: filename = 
file.filename file_path = f"{UPLOAD_DIR}/{filename}" @@ -188,27 +172,22 @@ def store_doc( collection_name = calculate_sha256(f)[:63] f.close() - if file.content_type == "application/pdf": + if file_ext=="pdf": loader = PyPDFLoader(file_path) - elif ( - file.content_type - == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - ): + elif (file.content_type ==docx_type or file_ext in known_doc_ext): loader = Docx2txtLoader(file_path) - - elif file.content_type == "text/csv": + elif file_ext=="csv": loader = CSVLoader(file_path) - elif file.content_type in text_xml: + elif file_ext in text_xml: loader=UnstructuredXMLLoader(file_path) - elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: + elif file_ext in octet_markdown: + loader = UnstructuredMarkdownLoader(file_path) + elif file_ext in known_source_ext or "text/" in (file.content_type or ""): loader = TextLoader(file_path) - elif file.content_type == "application/octet-stream": - if file_ext in octet_markdown: - loader = UnstructuredMarkdownLoader(file_path) - if file_ext in octet_plain: - loader = TextLoader(file_path) - elif file.content_type == "application/x-javascript": + else: loader = TextLoader(file_path) + known_type=False + data = loader.load() result = store_data_in_vector_db(data, collection_name) @@ -218,6 +197,7 @@ def store_doc( "status": True, "collection_name": collection_name, "filename": filename, + "known_type":known_type, } else: raise HTTPException( diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte index 4830b98c..ff82d606 100644 --- a/src/lib/components/chat/MessageInput.svelte +++ b/src/lib/components/chat/MessageInput.svelte @@ -173,7 +173,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } } else { toast.error(`File not found.`); @@ 
-308,8 +309,9 @@ uploadDoc(file); filesInputElement.value = ''; } else { - toast.error(`Unsupported File Type '${file['type']}'.`); - inputFiles = null; + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); + filesInputElement.value = ''; } } else { toast.error(`File not found.`); diff --git a/src/routes/(app)/documents/+page.svelte b/src/routes/(app)/documents/+page.svelte index 9a0aa130..597220b5 100644 --- a/src/routes/(app)/documents/+page.svelte +++ b/src/routes/(app)/documents/+page.svelte @@ -73,7 +73,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } } else { toast.error(`File not found.`); @@ -153,7 +154,8 @@ ) { uploadDoc(file); } else { - toast.error(`Unsupported File Type '${file['type']}'.`); + toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); + uploadDoc(file); } inputFiles = null;