forked from open-webui/open-webui
		
	Merge pull request #524 from Marclass/rag-arbitrary-files
feat: Allow RAG on XML and arbitrary text files including source code
This commit is contained in:
		
						commit
						f079cb6b56
					
				
					 4 changed files with 47 additions and 39 deletions
				
			
		|  | @ -21,6 +21,7 @@ from langchain_community.document_loaders import ( | ||||||
|     Docx2txtLoader, |     Docx2txtLoader, | ||||||
|     UnstructuredWordDocumentLoader, |     UnstructuredWordDocumentLoader, | ||||||
|     UnstructuredMarkdownLoader, |     UnstructuredMarkdownLoader, | ||||||
|  |     UnstructuredXMLLoader, | ||||||
| ) | ) | ||||||
| from langchain.text_splitter import RecursiveCharacterTextSplitter | from langchain.text_splitter import RecursiveCharacterTextSplitter | ||||||
| from langchain_community.vectorstores import Chroma | from langchain_community.vectorstores import Chroma | ||||||
|  | @ -143,26 +144,21 @@ def store_doc( | ||||||
|     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" |     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" | ||||||
| 
 | 
 | ||||||
|     print(file.content_type) |     print(file.content_type) | ||||||
|     if file.content_type not in [ |      | ||||||
|         "application/pdf", |     text_xml=["xml"] | ||||||
|         "text/plain", |     octet_markdown=["md"] | ||||||
|         "text/csv", |     known_source_ext=[ | ||||||
|         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",  | ||||||
|         "application/octet-stream", |         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", | ||||||
|     ]: |         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", | ||||||
|         raise HTTPException( |         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", | ||||||
|             status_code=status.HTTP_400_BAD_REQUEST, |         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" | ||||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, |         ] | ||||||
|         ) |     docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||||
| 
 |     known_doc_ext=["doc","docx"] | ||||||
|     if file.content_type == "application/octet-stream" and file.filename.split(".")[ |     file_ext=file.filename.split(".")[-1].lower() | ||||||
|         -1 |     known_type=True | ||||||
|     ] not in ["md"]: |      | ||||||
|         raise HTTPException( |  | ||||||
|             status_code=status.HTTP_400_BAD_REQUEST, |  | ||||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|     try: |     try: | ||||||
|         filename = file.filename |         filename = file.filename | ||||||
|         file_path = f"{UPLOAD_DIR}/{filename}" |         file_path = f"{UPLOAD_DIR}/{filename}" | ||||||
|  | @ -176,20 +172,22 @@ def store_doc( | ||||||
|             collection_name = calculate_sha256(f)[:63] |             collection_name = calculate_sha256(f)[:63] | ||||||
|         f.close() |         f.close() | ||||||
| 
 | 
 | ||||||
|         if file.content_type == "application/pdf": |         if file_ext=="pdf": | ||||||
|             loader = PyPDFLoader(file_path) |             loader = PyPDFLoader(file_path) | ||||||
|         elif ( |         elif (file.content_type ==docx_type or file_ext in known_doc_ext): | ||||||
|             file.content_type |  | ||||||
|             == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |  | ||||||
|         ): |  | ||||||
|             loader = Docx2txtLoader(file_path) |             loader = Docx2txtLoader(file_path) | ||||||
|         elif file.content_type == "text/plain": |         elif file_ext=="csv": | ||||||
|             loader = TextLoader(file_path) |  | ||||||
|         elif file.content_type == "text/csv": |  | ||||||
|             loader = CSVLoader(file_path) |             loader = CSVLoader(file_path) | ||||||
|         elif file.content_type == "application/octet-stream": |         elif file_ext in text_xml: | ||||||
|             if file.filename.split(".")[-1] == "md": |             loader=UnstructuredXMLLoader(file_path) | ||||||
|                 loader = UnstructuredMarkdownLoader(file_path) |         elif file_ext in known_source_ext or file.content_type.find("text/")>=0: | ||||||
|  |             loader = TextLoader(file_path) | ||||||
|  |         elif file_ext in octet_markdown: | ||||||
|  |             loader = UnstructuredMarkdownLoader(file_path) | ||||||
|  |         else: | ||||||
|  |             loader = TextLoader(file_path) | ||||||
|  |             known_type=False | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
|         data = loader.load() |         data = loader.load() | ||||||
|         result = store_data_in_vector_db(data, collection_name) |         result = store_data_in_vector_db(data, collection_name) | ||||||
|  | @ -199,6 +197,7 @@ def store_doc( | ||||||
|                 "status": True, |                 "status": True, | ||||||
|                 "collection_name": collection_name, |                 "collection_name": collection_name, | ||||||
|                 "filename": filename, |                 "filename": filename, | ||||||
|  |                 "known_type":known_type, | ||||||
|             } |             } | ||||||
|         else: |         else: | ||||||
|             raise HTTPException( |             raise HTTPException( | ||||||
|  |  | ||||||
|  | @ -173,7 +173,8 @@ | ||||||
| 					) { | 					) { | ||||||
| 						uploadDoc(file); | 						uploadDoc(file); | ||||||
| 					} else { | 					} else { | ||||||
| 						toast.error(`Unsupported File Type '${file['type']}'.`); | 						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||||
|  | 						uploadDoc(file); | ||||||
| 					} | 					} | ||||||
| 				} else { | 				} else { | ||||||
| 					toast.error(`File not found.`); | 					toast.error(`File not found.`); | ||||||
|  | @ -308,8 +309,9 @@ | ||||||
| 								uploadDoc(file); | 								uploadDoc(file); | ||||||
| 								filesInputElement.value = ''; | 								filesInputElement.value = ''; | ||||||
| 							} else { | 							} else { | ||||||
| 								toast.error(`Unsupported File Type '${file['type']}'.`); | 								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||||
| 								inputFiles = null; | 								uploadDoc(file); | ||||||
|  | 								filesInputElement.value = ''; | ||||||
| 							} | 							} | ||||||
| 						} else { | 						} else { | ||||||
| 							toast.error(`File not found.`); | 							toast.error(`File not found.`); | ||||||
|  |  | ||||||
|  | @ -13,10 +13,15 @@ export const REQUIRED_OLLAMA_VERSION = '0.1.16'; | ||||||
| 
 | 
 | ||||||
| export const SUPPORTED_FILE_TYPE = [ | export const SUPPORTED_FILE_TYPE = [ | ||||||
| 	'application/pdf', | 	'application/pdf', | ||||||
| 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document', |  | ||||||
| 	'text/markdown', |  | ||||||
| 	'text/plain', | 	'text/plain', | ||||||
| 	'text/csv' | 	'text/csv', | ||||||
|  | 	'text/xml', | ||||||
|  | 	'text/x-python', | ||||||
|  | 	'text/css', | ||||||
|  | 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | ||||||
|  | 	'application/octet-stream', | ||||||
|  | 	'application/x-javascript', | ||||||
|  | 	'text/markdown', | ||||||
| ]; | ]; | ||||||
| 
 | 
 | ||||||
| // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 | // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 | ||||||
|  |  | ||||||
|  | @ -73,7 +73,8 @@ | ||||||
| 				) { | 				) { | ||||||
| 					uploadDoc(file); | 					uploadDoc(file); | ||||||
| 				} else { | 				} else { | ||||||
| 					toast.error(`Unsupported File Type '${file['type']}'.`); | 					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||||
|  | 					uploadDoc(file); | ||||||
| 				} | 				} | ||||||
| 			} else { | 			} else { | ||||||
| 				toast.error(`File not found.`); | 				toast.error(`File not found.`); | ||||||
|  | @ -153,7 +154,8 @@ | ||||||
| 						) { | 						) { | ||||||
| 							uploadDoc(file); | 							uploadDoc(file); | ||||||
| 						} else { | 						} else { | ||||||
| 							toast.error(`Unsupported File Type '${file['type']}'.`); | 							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | ||||||
|  | 							uploadDoc(file); | ||||||
| 						} | 						} | ||||||
| 
 | 
 | ||||||
| 						inputFiles = null; | 						inputFiles = null; | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Jaeryang Baek