forked from open-webui/open-webui
		
	feat: Add RAG support for various programming languages
Enables RAG for golang, python, java, sh, bat, powershell, cmd, js, css, c/c++/c#, sql, logs, ini, perl, r, dart, docker, env, php, haskell, lua, conf, plsql, ruby, db2, scalla, bash, swift, vue, html, xml, and other arbitrary text files.
This commit is contained in:
		
							parent
							
								
									5e32db1c57
								
							
						
					
					
						commit
						43d8466677
					
				
					 1 changed files with 23 additions and 7 deletions
				
			
		|  | @ -21,6 +21,7 @@ from langchain_community.document_loaders import ( | ||||||
|     Docx2txtLoader, |     Docx2txtLoader, | ||||||
|     UnstructuredWordDocumentLoader, |     UnstructuredWordDocumentLoader, | ||||||
|     UnstructuredMarkdownLoader, |     UnstructuredMarkdownLoader, | ||||||
|  |     UnstructuredXMLLoader, | ||||||
| ) | ) | ||||||
| from langchain.text_splitter import RecursiveCharacterTextSplitter | from langchain.text_splitter import RecursiveCharacterTextSplitter | ||||||
| from langchain_community.vectorstores import Chroma | from langchain_community.vectorstores import Chroma | ||||||
|  | @ -147,6 +148,9 @@ def store_doc( | ||||||
|         "application/pdf", |         "application/pdf", | ||||||
|         "text/plain", |         "text/plain", | ||||||
|         "text/csv", |         "text/csv", | ||||||
|  |         "text/xml", | ||||||
|  |         "text/html", | ||||||
|  |         "text/x-python", | ||||||
|         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |         "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||||
|         "application/octet-stream", |         "application/octet-stream", | ||||||
|     ]: |     ]: | ||||||
|  | @ -154,10 +158,17 @@ def store_doc( | ||||||
|             status_code=status.HTTP_400_BAD_REQUEST, |             status_code=status.HTTP_400_BAD_REQUEST, | ||||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, |             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, | ||||||
|         ) |         ) | ||||||
| 
 |     text_xml=["text/html", "text/xml"] | ||||||
|     if file.content_type == "application/octet-stream" and file.filename.split(".")[ |     octet_markdown=["md"] | ||||||
|         -1 |     octet_plain=[ | ||||||
|     ] not in ["md"]: |         "go", "py", "java", "sh", "bat", "ps1", "cmd", "js",  | ||||||
|  |         "css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini", | ||||||
|  |         "pl" "pm", "r", "dart", "dockerfile", "env", "php", "hs", | ||||||
|  |         "hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl", | ||||||
|  |         "rb", "rs", "db2", "scala", "bash", "swift", "vue" | ||||||
|  |         ] | ||||||
|  |     file_ext=file.filename.split(".")[-1].lower() | ||||||
|  |     if file.content_type == "application/octet-stream" and file_ext not in (octet_markdown + octet_plain): | ||||||
|         raise HTTPException( |         raise HTTPException( | ||||||
|             status_code=status.HTTP_400_BAD_REQUEST, |             status_code=status.HTTP_400_BAD_REQUEST, | ||||||
|             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, |             detail=ERROR_MESSAGES.FILE_NOT_SUPPORTED, | ||||||
|  | @ -183,13 +194,18 @@ def store_doc( | ||||||
|             == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" |             == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | ||||||
|         ): |         ): | ||||||
|             loader = Docx2txtLoader(file_path) |             loader = Docx2txtLoader(file_path) | ||||||
|         elif file.content_type == "text/plain": |          | ||||||
|             loader = TextLoader(file_path) |  | ||||||
|         elif file.content_type == "text/csv": |         elif file.content_type == "text/csv": | ||||||
|             loader = CSVLoader(file_path) |             loader = CSVLoader(file_path) | ||||||
|  |         elif file.content_type in text_xml: | ||||||
|  |             loader=UnstructuredXMLLoader(file_path) | ||||||
|  |         elif file.content_type == "text/plain" or file.content_type.find("text/")>=0: | ||||||
|  |             loader = TextLoader(file_path) | ||||||
|         elif file.content_type == "application/octet-stream": |         elif file.content_type == "application/octet-stream": | ||||||
|             if file.filename.split(".")[-1] == "md": |             if file_ext in octet_markdown: | ||||||
|                 loader = UnstructuredMarkdownLoader(file_path) |                 loader = UnstructuredMarkdownLoader(file_path) | ||||||
|  |             if file_ext in octet_plain: | ||||||
|  |                 loader = TextLoader(file_path) | ||||||
| 
 | 
 | ||||||
|         data = loader.load() |         data = loader.load() | ||||||
|         result = store_data_in_vector_db(data, collection_name) |         result = store_data_in_vector_db(data, collection_name) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Marclass
						Marclass