Accept unknown upload types in the RAG document store and treat them as plain text.

Backend (backend/apps/rag/main.py, store_doc):
  * The content-type allow-list that rejected uploads with HTTP 400 is removed.
  * The loader is now chosen mainly from the lower-cased file extension:
    pdf -> PyPDFLoader, doc/docx (or the docx MIME type) -> Docx2txtLoader,
    csv -> CSVLoader, xml -> UnstructuredXMLLoader, known source-code
    extensions or any "text/" content type -> TextLoader,
    md -> UnstructuredMarkdownLoader, anything else -> TextLoader with
    known_type=False.
  * The success response gains a "known_type" flag.

Frontend (MessageInput.svelte, documents/+page.svelte, constants.ts):
  * Files whose MIME type is not in SUPPORTED_FILE_TYPE are still uploaded,
    after a toast telling the user they are treated as plain text.
  * SUPPORTED_FILE_TYPE is extended with xml, python, css, octet-stream and
    javascript MIME types.

Review fixes folded into the added lines:
  * known_source_ext: "pl" "pm" was missing a comma, so the two literals were
    concatenated into the single entry "plpm" and neither extension matched.
  * file.content_type may be None when the upload part has no Content-Type;
    the "text/" check now falls back to an empty string instead of raising.

NOTE(review): leading whitespace of the hunk lines was reconstructed by
convention (4 spaces for Python, tabs for Svelte/TS); confirm against the
target tree, or apply with --ignore-whitespace.

diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py
index a4776691..65dde89a 100644
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (
     Docx2txtLoader,
     UnstructuredWordDocumentLoader,
     UnstructuredMarkdownLoader,
+    UnstructuredXMLLoader,
 )
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import Chroma
@@ -143,26 +144,21 @@ def store_doc(
     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
 
     print(file.content_type)
-    if file.content_type not in [
-        "application/pdf",
-        "text/plain",
-        "text/csv",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/octet-stream",
-    ]:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-
-    if file.content_type == "application/octet-stream" and file.filename.split(".")[
-        -1
-    ] not in ["md"]:
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED,
-        )
-
+
+    text_xml=["xml"]
+    octet_markdown=["md"]
+    known_source_ext=[
+        "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",
+        "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
+        "pl", "pm", "r", "dart", "dockerfile", "env", "php", "hs",
+        "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
+        "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
+        ]
+    docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    known_doc_ext=["doc","docx"]
+    file_ext=file.filename.split(".")[-1].lower()
+    known_type=True
+
     try:
         filename = file.filename
         file_path = f"{UPLOAD_DIR}/{filename}"
@@ -176,20 +172,22 @@ def store_doc(
             collection_name = calculate_sha256(f)[:63]
         f.close()
 
-        if file.content_type == "application/pdf":
+        if file_ext=="pdf":
             loader = PyPDFLoader(file_path)
-        elif (
-            file.content_type
-            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-        ):
+        elif (file.content_type ==docx_type or file_ext in known_doc_ext):
             loader = Docx2txtLoader(file_path)
-        elif file.content_type == "text/plain":
-            loader = TextLoader(file_path)
-        elif file.content_type == "text/csv":
+        elif file_ext=="csv":
             loader = CSVLoader(file_path)
-        elif file.content_type == "application/octet-stream":
-            if file.filename.split(".")[-1] == "md":
-                loader = UnstructuredMarkdownLoader(file_path)
+        elif file_ext in text_xml:
+            loader=UnstructuredXMLLoader(file_path)
+        elif file_ext in known_source_ext or (file.content_type or "").find("text/")>=0:
+            loader = TextLoader(file_path)
+        elif file_ext in octet_markdown:
+            loader = UnstructuredMarkdownLoader(file_path)
+        else:
+            loader = TextLoader(file_path)
+            known_type=False
+
         data = loader.load()
         result = store_data_in_vector_db(data, collection_name)
 
@@ -199,6 +197,7 @@ def store_doc(
                 "status": True,
                 "collection_name": collection_name,
                 "filename": filename,
+                "known_type":known_type,
             }
         else:
             raise HTTPException(
diff --git a/src/lib/components/chat/MessageInput.svelte b/src/lib/components/chat/MessageInput.svelte
index 4830b98c..ff82d606 100644
--- a/src/lib/components/chat/MessageInput.svelte
+++ b/src/lib/components/chat/MessageInput.svelte
@@ -173,7 +173,8 @@
 					) {
 						uploadDoc(file);
 					} else {
-						toast.error(`Unsupported File Type '${file['type']}'.`);
+						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+						uploadDoc(file);
 					}
 				} else {
 					toast.error(`File not found.`);
@@ -308,8 +309,9 @@
 							uploadDoc(file);
 							filesInputElement.value = '';
 						} else {
-							toast.error(`Unsupported File Type '${file['type']}'.`);
-							inputFiles = null;
+							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+							uploadDoc(file);
+							filesInputElement.value = '';
 						}
 					} else {
 						toast.error(`File not found.`);
diff --git a/src/lib/constants.ts b/src/lib/constants.ts
index 5d77834b..1d54dae1 100644
--- a/src/lib/constants.ts
+++ b/src/lib/constants.ts
@@ -13,10 +13,15 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16';
 
 export const SUPPORTED_FILE_TYPE = [
 	'application/pdf',
-	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-	'text/markdown',
 	'text/plain',
-	'text/csv'
+	'text/csv',
+	'text/xml',
+	'text/x-python',
+	'text/css',
+	'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+	'application/octet-stream',
+	'application/x-javascript',
+	'text/markdown',
 ];
 
 // Source: https://kit.svelte.dev/docs/modules#$env-static-public
diff --git a/src/routes/(app)/documents/+page.svelte b/src/routes/(app)/documents/+page.svelte
index 9a0aa130..597220b5 100644
--- a/src/routes/(app)/documents/+page.svelte
+++ b/src/routes/(app)/documents/+page.svelte
@@ -73,7 +73,8 @@
 				) {
 					uploadDoc(file);
 				} else {
-					toast.error(`Unsupported File Type '${file['type']}'.`);
+					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+					uploadDoc(file);
 				}
 			} else {
 				toast.error(`File not found.`);
@@ -153,7 +154,8 @@
 					) {
 						uploadDoc(file);
 					} else {
-						toast.error(`Unsupported File Type '${file['type']}'.`);
+						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
+						uploadDoc(file);
 					}
 
 					inputFiles = null;