forked from open-webui/open-webui
		
	Merge pull request #547 from Marclass/main
feat: Add reStructuredText specific parser for RAG
This commit is contained in:
		
						commit
						d517a3ebb4
					
				
					 4 changed files with 26 additions and 13 deletions
				
			
		|  | @ -22,6 +22,7 @@ from langchain_community.document_loaders import ( | ||||||
|     UnstructuredWordDocumentLoader, |     UnstructuredWordDocumentLoader, | ||||||
|     UnstructuredMarkdownLoader, |     UnstructuredMarkdownLoader, | ||||||
|     UnstructuredXMLLoader, |     UnstructuredXMLLoader, | ||||||
|  |     UnstructuredRSTLoader, | ||||||
| ) | ) | ||||||
| from langchain.text_splitter import RecursiveCharacterTextSplitter | from langchain.text_splitter import RecursiveCharacterTextSplitter | ||||||
| from langchain_community.vectorstores import Chroma | from langchain_community.vectorstores import Chroma | ||||||
|  | @ -178,6 +179,8 @@ def store_doc( | ||||||
|             loader = Docx2txtLoader(file_path) |             loader = Docx2txtLoader(file_path) | ||||||
|         elif file_ext=="csv": |         elif file_ext=="csv": | ||||||
|             loader = CSVLoader(file_path) |             loader = CSVLoader(file_path) | ||||||
|  |         elif file_ext=="rst": | ||||||
|  |             loader = UnstructuredRSTLoader(file_path, mode="elements") | ||||||
|         elif file_ext in text_xml: |         elif file_ext in text_xml: | ||||||
|             loader=UnstructuredXMLLoader(file_path) |             loader=UnstructuredXMLLoader(file_path) | ||||||
|         elif file_ext in known_source_ext or file.content_type.find("text/")>=0: |         elif file_ext in known_source_ext or file.content_type.find("text/")>=0: | ||||||
|  |  | ||||||
|  | @ -8,7 +8,7 @@ | ||||||
| 	import Suggestions from './MessageInput/Suggestions.svelte'; | 	import Suggestions from './MessageInput/Suggestions.svelte'; | ||||||
| 	import { uploadDocToVectorDB } from '$lib/apis/rag'; | 	import { uploadDocToVectorDB } from '$lib/apis/rag'; | ||||||
| 	import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte'; | 	import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte'; | ||||||
| 	import { SUPPORTED_FILE_TYPE } from '$lib/constants'; | 	import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants'; | ||||||
| 	import Documents from './MessageInput/Documents.svelte'; | 	import Documents from './MessageInput/Documents.svelte'; | ||||||
| 	import Models from './MessageInput/Models.svelte'; | 	import Models from './MessageInput/Models.svelte'; | ||||||
| 
 | 
 | ||||||
|  | @ -169,11 +169,13 @@ | ||||||
| 						reader.readAsDataURL(file); | 						reader.readAsDataURL(file); | ||||||
| 					} else if ( | 					} else if ( | ||||||
| 						SUPPORTED_FILE_TYPE.includes(file['type']) || | 						SUPPORTED_FILE_TYPE.includes(file['type']) || | ||||||
| 						['md'].includes(file.name.split('.').at(-1)) | 						SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1)) | ||||||
| 					) { | 					) { | ||||||
| 						uploadDoc(file); | 						uploadDoc(file); | ||||||
| 					} else { | 					} else { | ||||||
| 						toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | 						toast.error( | ||||||
|  | 							`Unknown File Type '${file['type']}', but accepting and treating as plain text` | ||||||
|  | 						); | ||||||
| 						uploadDoc(file); | 						uploadDoc(file); | ||||||
| 					} | 					} | ||||||
| 				} else { | 				} else { | ||||||
|  | @ -304,12 +306,14 @@ | ||||||
| 								reader.readAsDataURL(file); | 								reader.readAsDataURL(file); | ||||||
| 							} else if ( | 							} else if ( | ||||||
| 								SUPPORTED_FILE_TYPE.includes(file['type']) || | 								SUPPORTED_FILE_TYPE.includes(file['type']) || | ||||||
| 								['md'].includes(file.name.split('.').at(-1)) | 								SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1)) | ||||||
| 							) { | 							) { | ||||||
| 								uploadDoc(file); | 								uploadDoc(file); | ||||||
| 								filesInputElement.value = ''; | 								filesInputElement.value = ''; | ||||||
| 							} else { | 							} else { | ||||||
| 								toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | 								toast.error( | ||||||
|  | 									`Unknown File Type '${file['type']}', but accepting and treating as plain text` | ||||||
|  | 								); | ||||||
| 								uploadDoc(file); | 								uploadDoc(file); | ||||||
| 								filesInputElement.value = ''; | 								filesInputElement.value = ''; | ||||||
| 							} | 							} | ||||||
|  | @ -466,8 +470,8 @@ | ||||||
| 							placeholder={chatInputPlaceholder !== '' | 							placeholder={chatInputPlaceholder !== '' | ||||||
| 								? chatInputPlaceholder | 								? chatInputPlaceholder | ||||||
| 								: speechRecognitionListening | 								: speechRecognitionListening | ||||||
| 									? 'Listening...' | 								? 'Listening...' | ||||||
| 									: 'Send a message'} | 								: 'Send a message'} | ||||||
| 							bind:value={prompt} | 							bind:value={prompt} | ||||||
| 							on:keypress={(e) => { | 							on:keypress={(e) => { | ||||||
| 								if (e.keyCode == 13 && !e.shiftKey) { | 								if (e.keyCode == 13 && !e.shiftKey) { | ||||||
|  |  | ||||||
|  | @ -21,9 +21,11 @@ export const SUPPORTED_FILE_TYPE = [ | ||||||
| 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | 	'application/vnd.openxmlformats-officedocument.wordprocessingml.document', | ||||||
| 	'application/octet-stream', | 	'application/octet-stream', | ||||||
| 	'application/x-javascript', | 	'application/x-javascript', | ||||||
| 	'text/markdown', | 	'text/markdown' | ||||||
| ]; | ]; | ||||||
| 
 | 
 | ||||||
|  | export const SUPPORTED_FILE_EXTENSIONS = ['md', 'rst']; | ||||||
|  | 
 | ||||||
| // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 | // Source: https://kit.svelte.dev/docs/modules#$env-static-public
 | ||||||
| // This feature, akin to $env/static/private, exclusively incorporates environment variables
 | // This feature, akin to $env/static/private, exclusively incorporates environment variables
 | ||||||
| // that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_).
 | // that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_).
 | ||||||
|  |  | ||||||
|  | @ -7,7 +7,7 @@ | ||||||
| 	import { documents } from '$lib/stores'; | 	import { documents } from '$lib/stores'; | ||||||
| 	import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents'; | 	import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents'; | ||||||
| 
 | 
 | ||||||
| 	import { SUPPORTED_FILE_TYPE } from '$lib/constants'; | 	import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants'; | ||||||
| 	import { uploadDocToVectorDB } from '$lib/apis/rag'; | 	import { uploadDocToVectorDB } from '$lib/apis/rag'; | ||||||
| 	import { transformFileName } from '$lib/utils'; | 	import { transformFileName } from '$lib/utils'; | ||||||
| 
 | 
 | ||||||
|  | @ -69,11 +69,13 @@ | ||||||
| 				const file = inputFiles[0]; | 				const file = inputFiles[0]; | ||||||
| 				if ( | 				if ( | ||||||
| 					SUPPORTED_FILE_TYPE.includes(file['type']) || | 					SUPPORTED_FILE_TYPE.includes(file['type']) || | ||||||
| 					['md'].includes(file.name.split('.').at(-1)) | 					SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1)) | ||||||
| 				) { | 				) { | ||||||
| 					uploadDoc(file); | 					uploadDoc(file); | ||||||
| 				} else { | 				} else { | ||||||
| 					toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | 					toast.error( | ||||||
|  | 						`Unknown File Type '${file['type']}', but accepting and treating as plain text` | ||||||
|  | 					); | ||||||
| 					uploadDoc(file); | 					uploadDoc(file); | ||||||
| 				} | 				} | ||||||
| 			} else { | 			} else { | ||||||
|  | @ -150,11 +152,13 @@ | ||||||
| 						const file = inputFiles[0]; | 						const file = inputFiles[0]; | ||||||
| 						if ( | 						if ( | ||||||
| 							SUPPORTED_FILE_TYPE.includes(file['type']) || | 							SUPPORTED_FILE_TYPE.includes(file['type']) || | ||||||
| 							['md'].includes(file.name.split('.').at(-1)) | 							SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1)) | ||||||
| 						) { | 						) { | ||||||
| 							uploadDoc(file); | 							uploadDoc(file); | ||||||
| 						} else { | 						} else { | ||||||
| 							toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); | 							toast.error( | ||||||
|  | 								`Unknown File Type '${file['type']}', but accepting and treating as plain text` | ||||||
|  | 							); | ||||||
| 							uploadDoc(file); | 							uploadDoc(file); | ||||||
| 						} | 						} | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Jaeryang Baek
						Timothy Jaeryang Baek