forked from open-webui/open-webui
		
	Merge pull request #554 from Marclass/main
feat: Add excel parser for RAG
This commit is contained in:
		
						commit
						d29321f1ec
					
				
					 3 changed files with 89 additions and 38 deletions
				
			
		|  | @ -24,6 +24,7 @@ from langchain_community.document_loaders import ( | |||
|     UnstructuredMarkdownLoader, | ||||
|     UnstructuredXMLLoader, | ||||
|     UnstructuredRSTLoader, | ||||
|     UnstructuredExcelLoader, | ||||
| ) | ||||
| from langchain.text_splitter import RecursiveCharacterTextSplitter | ||||
| from langchain_community.vectorstores import Chroma | ||||
|  | @ -137,6 +138,87 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
def get_loader(file, file_path):
    """Choose a document loader for an uploaded file.

    The decision is based on the lower-cased filename extension and, for some
    formats, on the upload's declared content type. Branches are evaluated in
    order, so an extension match (e.g. ``pdf``) wins over a later content-type
    match.

    Args:
        file: The uploaded file object; its ``filename`` and ``content_type``
            attributes are read. Either may be ``None``.
        file_path: Path of the stored copy of the upload, handed to the loader.

    Returns:
        A ``(loader, known_type)`` tuple. ``known_type`` is ``False`` only when
        nothing matched and the file is loaded as plain text as a last resort.
    """
    # Both attributes can be missing on an upload (no filename / no
    # Content-Type header). Normalise to "" so the string operations below
    # cannot raise AttributeError on None.
    file_ext = (file.filename or "").split(".")[-1].lower()
    content_type = file.content_type or ""
    known_type = True

    # Extensions that are treated as plain text source/config files.
    known_source_ext = {
        "go",
        "py",
        "java",
        "sh",
        "bat",
        "ps1",
        "cmd",
        "js",
        "ts",
        "css",
        "cpp",
        "hpp",
        "h",
        "c",
        "cs",
        "sql",
        "log",
        "ini",
        "pl",
        "pm",
        "r",
        "dart",
        "dockerfile",
        "env",
        "php",
        "hs",
        "hsc",
        "lua",
        "nginxconf",
        "conf",
        "m",
        "mm",
        "plsql",
        "perl",
        "rb",
        "rs",
        "db2",
        "scala",
        "bash",
        "swift",
        "vue",
        "svelte",
    }

    docx_content_type = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    excel_content_types = (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    if file_ext == "pdf":
        loader = PyPDFLoader(file_path)
    elif file_ext == "csv":
        loader = CSVLoader(file_path)
    elif file_ext == "rst":
        loader = UnstructuredRSTLoader(file_path, mode="elements")
    elif file_ext == "xml":
        loader = UnstructuredXMLLoader(file_path)
    elif file_ext == "md":
        loader = UnstructuredMarkdownLoader(file_path)
    elif content_type == "application/epub+zip":
        loader = UnstructuredEPubLoader(file_path)
    elif content_type == docx_content_type or file_ext in ("doc", "docx"):
        loader = Docx2txtLoader(file_path)
    elif content_type in excel_content_types or file_ext in ("xls", "xlsx"):
        loader = UnstructuredExcelLoader(file_path)
    elif file_ext in known_source_ext or "text/" in content_type:
        loader = TextLoader(file_path)
    else:
        # Unrecognised type: still attempt a plain-text load, but report it
        # to the caller through known_type.
        loader = TextLoader(file_path)
        known_type = False

    return loader, known_type
| 
 | ||||
| 
 | ||||
| @app.post("/doc") | ||||
| def store_doc( | ||||
|     collection_name: Optional[str] = Form(None), | ||||
|  | @ -146,21 +228,6 @@ def store_doc( | |||
|     # "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm" | ||||
| 
 | ||||
|     print(file.content_type) | ||||
|      | ||||
|     text_xml=["xml"] | ||||
|     octet_markdown=["md"] | ||||
|     known_source_ext=[ | ||||
|         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "ts", | ||||
|         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", | ||||
|         "pl", "pm", "r", "dart", "dockerfile", "env", "php", "hs", | ||||
|         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", | ||||
|         "rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte" | ||||
|         ] | ||||
|     docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||
|     known_doc_ext=["doc","docx"] | ||||
|     file_ext=file.filename.split(".")[-1].lower() | ||||
|     known_type=True | ||||
|      | ||||
|     try: | ||||
|         filename = file.filename | ||||
|         file_path = f"{UPLOAD_DIR}/{filename}" | ||||
|  | @ -174,27 +241,7 @@ def store_doc( | |||
|             collection_name = calculate_sha256(f)[:63] | ||||
|         f.close() | ||||
| 
 | ||||
|         if file_ext=="pdf": | ||||
|             loader = PyPDFLoader(file_path) | ||||
|         elif (file.content_type ==docx_type or file_ext in known_doc_ext): | ||||
|             loader = Docx2txtLoader(file_path) | ||||
|         elif file_ext=="csv": | ||||
|             loader = CSVLoader(file_path) | ||||
|         elif file_ext=="rst": | ||||
|             loader = UnstructuredRSTLoader(file_path, mode="elements") | ||||
|         elif file_ext in text_xml: | ||||
|             loader=UnstructuredXMLLoader(file_path) | ||||
|         elif file_ext in known_source_ext or file.content_type.find("text/")>=0: | ||||
|             loader = TextLoader(file_path) | ||||
|         elif file_ext in octet_markdown: | ||||
|             loader = UnstructuredMarkdownLoader(file_path) | ||||
|         elif file.content_type == "application/epub+zip": | ||||
|             loader = UnstructuredEPubLoader(file_path) | ||||
|         else: | ||||
|             loader = TextLoader(file_path) | ||||
|             known_type=False | ||||
| 
 | ||||
| 
 | ||||
|         loader, known_type = get_loader(file, file_path) | ||||
|         data = loader.load() | ||||
|         result = store_data_in_vector_db(data, collection_name) | ||||
| 
 | ||||
|  | @ -203,7 +250,7 @@ def store_doc( | |||
|                 "status": True, | ||||
|                 "collection_name": collection_name, | ||||
|                 "filename": filename, | ||||
|                 "known_type":known_type, | ||||
|                 "known_type": known_type, | ||||
|             } | ||||
|         else: | ||||
|             raise HTTPException( | ||||
|  |  | |||
|  | @ -25,6 +25,10 @@ docx2txt | |||
| unstructured | ||||
| markdown | ||||
| pypandoc | ||||
| pandas | ||||
| openpyxl | ||||
| pyxlsb | ||||
| xlrd | ||||
| 
 | ||||
| PyJWT | ||||
| pyjwt[crypto] | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ export const SUPPORTED_FILE_EXTENSIONS = [ | |||
| 	'pl', 'pm', 'r', 'dart', 'dockerfile', 'env', 'php', 'hs', | ||||
| 	'hsc', 'lua', 'nginxconf', 'conf', 'm', 'mm', 'plsql', 'perl', | ||||
| 	'rb', 'rs', 'db2', 'scala', 'bash', 'swift', 'vue', 'svelte', | ||||
| 	'doc','docx', 'pdf', 'csv', 'txt' | ||||
| 	'doc','docx', 'pdf', 'csv', 'txt', 'xls', 'xlsx' | ||||
| ]; | ||||
| 
 | ||||
| // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Jaeryang Baek
						Timothy Jaeryang Baek