forked from open-webui/open-webui
		
	Merge pull request #1292 from ddanat-smm/dev
Add htm/html support for RAG documents
This commit is contained in:
		
						commit a1fc2f4df0
					
				
					 3 changed files with 61 additions and 40 deletions
				
			
		|  | @ -21,6 +21,7 @@ from langchain_community.document_loaders import ( | |||
|     TextLoader, | ||||
|     PyPDFLoader, | ||||
|     CSVLoader, | ||||
|     BSHTMLLoader, | ||||
|     Docx2txtLoader, | ||||
|     UnstructuredEPubLoader, | ||||
|     UnstructuredWordDocumentLoader, | ||||
|  | @ -114,6 +115,7 @@ class CollectionNameForm(BaseModel): | |||
| class StoreWebForm(CollectionNameForm): | ||||
|     url: str | ||||
| 
 | ||||
| 
 | ||||
| @app.get("/") | ||||
| async def get_status(): | ||||
|     return { | ||||
|  | @ -296,13 +298,18 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): | |||
| 
 | ||||
| 
 | ||||
| def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool: | ||||
| 
 | ||||
|     text_splitter = RecursiveCharacterTextSplitter( | ||||
|         chunk_size=app.state.CHUNK_SIZE, | ||||
|         chunk_overlap=app.state.CHUNK_OVERLAP, | ||||
|         add_start_index=True, | ||||
|     ) | ||||
|     docs = text_splitter.split_documents(data) | ||||
|     return store_docs_in_vector_db(docs, collection_name, overwrite) | ||||
| 
 | ||||
|     if len(docs) > 0: | ||||
|         return store_docs_in_vector_db(docs, collection_name, overwrite), None | ||||
|     else: | ||||
|         raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT) | ||||
| 
 | ||||
| 
 | ||||
| def store_text_in_vector_db( | ||||
|  | @ -318,6 +325,7 @@ def store_text_in_vector_db( | |||
| 
 | ||||
| 
 | ||||
| def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool: | ||||
| 
 | ||||
|     texts = [doc.page_content for doc in docs] | ||||
|     metadatas = [doc.metadata for doc in docs] | ||||
| 
 | ||||
|  | @ -402,6 +410,8 @@ def get_loader(filename: str, file_content_type: str, file_path: str): | |||
|         loader = UnstructuredRSTLoader(file_path, mode="elements") | ||||
|     elif file_ext == "xml": | ||||
|         loader = UnstructuredXMLLoader(file_path) | ||||
|     elif file_ext in ["htm", "html"]: | ||||
|         loader = BSHTMLLoader(file_path, open_encoding="unicode_escape") | ||||
|     elif file_ext == "md": | ||||
|         loader = UnstructuredMarkdownLoader(file_path) | ||||
|     elif file_content_type == "application/epub+zip": | ||||
|  | @ -452,19 +462,21 @@ def store_doc( | |||
| 
 | ||||
|         loader, known_type = get_loader(file.filename, file.content_type, file_path) | ||||
|         data = loader.load() | ||||
|         result = store_data_in_vector_db(data, collection_name) | ||||
| 
 | ||||
|         if result: | ||||
|             return { | ||||
|                 "status": True, | ||||
|                 "collection_name": collection_name, | ||||
|                 "filename": filename, | ||||
|                 "known_type": known_type, | ||||
|             } | ||||
|         else: | ||||
|         try: | ||||
|             result = store_data_in_vector_db(data, collection_name) | ||||
| 
 | ||||
|             if result: | ||||
|                 return { | ||||
|                     "status": True, | ||||
|                     "collection_name": collection_name, | ||||
|                     "filename": filename, | ||||
|                     "known_type": known_type, | ||||
|                 } | ||||
|         except Exception as e: | ||||
|             raise HTTPException( | ||||
|                 status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, | ||||
|                 detail=ERROR_MESSAGES.DEFAULT(), | ||||
|                 detail=e, | ||||
|             ) | ||||
|     except Exception as e: | ||||
|         log.exception(e) | ||||
|  | @ -529,38 +541,42 @@ def scan_docs_dir(user=Depends(get_admin_user)): | |||
|                 ) | ||||
|                 data = loader.load() | ||||
| 
 | ||||
|                 result = store_data_in_vector_db(data, collection_name) | ||||
|                 try: | ||||
|                     result = store_data_in_vector_db(data, collection_name) | ||||
| 
 | ||||
|                 if result: | ||||
|                     sanitized_filename = sanitize_filename(filename) | ||||
|                     doc = Documents.get_doc_by_name(sanitized_filename) | ||||
|                     if result: | ||||
|                         sanitized_filename = sanitize_filename(filename) | ||||
|                         doc = Documents.get_doc_by_name(sanitized_filename) | ||||
| 
 | ||||
|                     if doc == None: | ||||
|                         doc = Documents.insert_new_doc( | ||||
|                             user.id, | ||||
|                             DocumentForm( | ||||
|                                 **{ | ||||
|                                     "name": sanitized_filename, | ||||
|                                     "title": filename, | ||||
|                                     "collection_name": collection_name, | ||||
|                                     "filename": filename, | ||||
|                                     "content": ( | ||||
|                                         json.dumps( | ||||
|                                             { | ||||
|                                                 "tags": list( | ||||
|                                                     map( | ||||
|                                                         lambda name: {"name": name}, | ||||
|                                                         tags, | ||||
|                         if doc == None: | ||||
|                             doc = Documents.insert_new_doc( | ||||
|                                 user.id, | ||||
|                                 DocumentForm( | ||||
|                                     **{ | ||||
|                                         "name": sanitized_filename, | ||||
|                                         "title": filename, | ||||
|                                         "collection_name": collection_name, | ||||
|                                         "filename": filename, | ||||
|                                         "content": ( | ||||
|                                             json.dumps( | ||||
|                                                 { | ||||
|                                                     "tags": list( | ||||
|                                                         map( | ||||
|                                                             lambda name: {"name": name}, | ||||
|                                                             tags, | ||||
|                                                         ) | ||||
|                                                     ) | ||||
|                                                 ) | ||||
|                                             } | ||||
|                                         ) | ||||
|                                         if len(tags) | ||||
|                                         else "{}" | ||||
|                                     ), | ||||
|                                 } | ||||
|                             ), | ||||
|                         ) | ||||
|                                                 } | ||||
|                                             ) | ||||
|                                             if len(tags) | ||||
|                                             else "{}" | ||||
|                                         ), | ||||
|                                     } | ||||
|                                 ), | ||||
|                             ) | ||||
|                 except Exception as e: | ||||
|                     print(e) | ||||
|                     pass | ||||
| 
 | ||||
|         except Exception as e: | ||||
|             log.exception(e) | ||||
|  |  | |||
|  | @ -60,3 +60,5 @@ class ERROR_MESSAGES(str, Enum): | |||
|     MODEL_NOT_FOUND = lambda name="": f"Model '{name}' was not found" | ||||
|     OPENAI_NOT_FOUND = lambda name="": f"OpenAI API was not found" | ||||
|     OLLAMA_NOT_FOUND = "WebUI could not connect to Ollama" | ||||
| 
 | ||||
|     EMPTY_CONTENT = "The content provided is empty. Please ensure that there is text or data present before proceeding." | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy Jaeryang Baek