forked from open-webui/open-webui
		
	Allow any file to be used for RAG.
Changed RAG parser to prefer file extensions over MIME content types. If the type of file is not recognized assume it's a text file.
This commit is contained in:
		
							parent
							
								
									6070e6bcd1
								
							
						
					
					
						commit
						aa1d386042
					
				
					 3 changed files with 27 additions and 43 deletions
				
			
		|  | @ -144,36 +144,20 @@ def store_doc( | |||
|     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" | ||||
| 
 | ||||
|     print(file.content_type) | ||||
|     if file.content_type not in [ | ||||
|         "application/pdf", | ||||
|         "text/plain", | ||||
|         "text/csv", | ||||
|         "text/xml", | ||||
|         "text/x-python", | ||||
|         "text/css", | ||||
|         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         "application/octet-stream", | ||||
|         "application/x-javascript", | ||||
|     ]: | ||||
|         raise HTTPException( | ||||
|             status_code=status.HTTP_400_BAD_REQUEST, | ||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, | ||||
|         ) | ||||
|     text_xml=["text/xml"] | ||||
|      | ||||
|     text_xml=["xml"] | ||||
|     octet_markdown=["md"] | ||||
|     octet_plain=[ | ||||
|     known_source_ext=[ | ||||
|         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",  | ||||
|         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", | ||||
|         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", | ||||
|         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", | ||||
|         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" | ||||
|         ] | ||||
|     docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||
|     known_doc_ext=["doc","docx"] | ||||
|     file_ext=file.filename.split(".")[-1].lower() | ||||
|     if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): | ||||
|         raise HTTPException( | ||||
|             status_code=status.HTTP_400_BAD_REQUEST, | ||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, | ||||
|         ) | ||||
|     known_type=True | ||||
|      | ||||
|     try: | ||||
|         filename = file.filename | ||||
|  | @ -188,27 +172,22 @@ def store_doc( | |||
|             collection_name = calculate_sha256(f)[:63] | ||||
|         f.close() | ||||
| 
 | ||||
|         if file.content_type == "application/pdf": | ||||
|         if file_ext=="pdf": | ||||
|             loader = PyPDFLoader(file_path) | ||||
|         elif ( | ||||
|             file.content_type | ||||
|             == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||
|         ): | ||||
|         elif (file.content_type ==docx_type or file_ext in known_doc_ext): | ||||
|             loader = Docx2txtLoader(file_path) | ||||
|          | ||||
|         elif file.content_type == "text/csv": | ||||
|         elif file_ext=="csv": | ||||
|             loader = CSVLoader(file_path) | ||||
|         elif file.content_type in text_xml: | ||||
|         elif file_ext in text_xml: | ||||
|             loader=UnstructuredXMLLoader(file_path) | ||||
|         elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: | ||||
|         elif file_ext in known_source_ext or file.content_type.find("text/")>=0: | ||||
|             loader = TextLoader(file_path) | ||||
|         elif file.content_type == "application/octet-stream": | ||||
|             if file_ext in octet_markdown: | ||||
|                 loader = UnstructuredMarkdownLoader(file_path) | ||||
|             if file_ext in octet_plain: | ||||
|                 loader = TextLoader(file_path) | ||||
|         elif file.content_type == "application/x-javascript": | ||||
|         elif file_ext in octet_markdown: | ||||
|             loader = UnstructuredMarkdownLoader(file_path) | ||||
|         else: | ||||
|             loader = TextLoader(file_path) | ||||
|             known_type=False | ||||
| 
 | ||||
| 
 | ||||
|         data = loader.load() | ||||
|         result = store_data_in_vector_db(data, collection_name) | ||||
|  | @ -218,6 +197,7 @@ def store_doc( | |||
|                 "status": True, | ||||
|                 "collection_name": collection_name, | ||||
|                 "filename": filename, | ||||
|                 "known_type":known_type, | ||||
|             } | ||||
|         else: | ||||
|             raise HTTPException( | ||||
|  |  | |||
|  | @ -173,7 +173,8 @@ | |||
| 					) { | ||||
| 						uploadDoc(file); | ||||
| 					} else { | ||||
| 						toast.error(`Unsupported File Type '${file['type']}'.`); | ||||
| 						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||
| 						uploadDoc(file); | ||||
| 					} | ||||
| 				} else { | ||||
| 					toast.error(`File not found.`); | ||||
|  | @ -308,8 +309,9 @@ | |||
| 								uploadDoc(file); | ||||
| 								filesInputElement.value = ''; | ||||
| 							} else { | ||||
| 								toast.error(`Unsupported File Type '${file['type']}'.`); | ||||
| 								inputFiles = null; | ||||
| 								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||
| 								uploadDoc(file); | ||||
| 								filesInputElement.value = ''; | ||||
| 							} | ||||
| 						} else { | ||||
| 							toast.error(`File not found.`); | ||||
|  |  | |||
|  | @ -73,7 +73,8 @@ | |||
| 				) { | ||||
| 					uploadDoc(file); | ||||
| 				} else { | ||||
| 					toast.error(`Unsupported File Type '${file['type']}'.`); | ||||
| 					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||
| 					uploadDoc(file); | ||||
| 				} | ||||
| 			} else { | ||||
| 				toast.error(`File not found.`); | ||||
|  | @ -153,7 +154,8 @@ | |||
| 						) { | ||||
| 							uploadDoc(file); | ||||
| 						} else { | ||||
| 							toast.error(`Unsupported File Type '${file['type']}'.`); | ||||
| 							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||
| 							uploadDoc(file); | ||||
| 						} | ||||
| 
 | ||||
| 						inputFiles = null; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Marclass
						Marclass