forked from open-webui/open-webui
		
	feat: RAG text ingestion(store) api
This commit is contained in:
		
							parent
							
								
									c2d6d3230b
								
							
						
					
					
						commit
						7e0ea8f77d
					
				
					 2 changed files with 79 additions and 33 deletions
				
			
		|  | @ -111,39 +111,6 @@ class StoreWebForm(CollectionNameForm): | |||
|     url: str | ||||
| 
 | ||||
| 
 | ||||
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
    """Split *data* into chunks and store them in a newly created Chroma collection.

    Args:
        data: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        collection_name: Name of the collection to create.
        overwrite: When true, an existing collection with the same name is
            deleted before the new one is created.

    Returns:
        True when the chunks were added, or when the raised exception's class
        is named ``UniqueConstraintError``; False for any other exception.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=app.state.CHUNK_SIZE, chunk_overlap=app.state.CHUNK_OVERLAP
    )
    chunks = splitter.split_documents(data)

    chunk_texts = [chunk.page_content for chunk in chunks]
    chunk_metadatas = [chunk.metadata for chunk in chunks]

    try:
        if overwrite:
            for existing in CHROMA_CLIENT.list_collections():
                if collection_name == existing.name:
                    print(f"deleting existing collection {collection_name}")
                    CHROMA_CLIENT.delete_collection(name=collection_name)

        store = CHROMA_CLIENT.create_collection(
            name=collection_name,
            embedding_function=app.state.sentence_transformer_ef,
        )

        # One freshly generated id per chunk.
        chunk_ids = [str(uuid.uuid1()) for _ in chunk_texts]
        store.add(documents=chunk_texts, metadatas=chunk_metadatas, ids=chunk_ids)
    except Exception as exc:
        print(exc)
        # The exception is matched by class name rather than by type.
        # NOTE(review): presumably this signals "collection already exists"
        # and is why it counts as success - confirm against Chroma.
        is_unique_violation = exc.__class__.__name__ == "UniqueConstraintError"
        return is_unique_violation
    else:
        return True
| 
 | ||||
| 
 | ||||
| @app.get("/") | ||||
| async def get_status(): | ||||
|     return { | ||||
|  | @ -325,6 +292,56 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)): | |||
|         ) | ||||
| 
 | ||||
| 
 | ||||
def store_data_in_vector_db(data, collection_name, overwrite: bool = False) -> bool:
    """Chunk already-loaded documents and hand the chunks to the vector store.

    Args:
        data: Documents accepted by ``RecursiveCharacterTextSplitter.split_documents``.
        collection_name: Name of the target collection.
        overwrite: Forwarded unchanged to ``store_docs_in_vector_db``.

    Returns:
        Whatever ``store_docs_in_vector_db`` returns for the resulting chunks.
    """
    splitter_options = {
        "chunk_size": app.state.CHUNK_SIZE,
        "chunk_overlap": app.state.CHUNK_OVERLAP,
        "add_start_index": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = chunker.split_documents(data)
    return store_docs_in_vector_db(chunks, collection_name, overwrite)
| 
 | ||||
| 
 | ||||
def store_text_in_vector_db(
    text, name, collection_name, overwrite: bool = False
) -> bool:
    """Chunk a raw text string and hand the chunks to the vector store.

    Args:
        text: The text to split.
        name: Stored under the ``"name"`` metadata key passed to the splitter.
        collection_name: Name of the target collection.
        overwrite: Forwarded unchanged to ``store_docs_in_vector_db``.

    Returns:
        Whatever ``store_docs_in_vector_db`` returns for the resulting chunks.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=app.state.CHUNK_SIZE,
        chunk_overlap=app.state.CHUNK_OVERLAP,
        add_start_index=True,
    )
    # A single input text, paired with a single metadata dict.
    source_texts = [text]
    source_metadatas = [{"name": name}]
    chunks = chunker.create_documents(source_texts, metadatas=source_metadatas)
    return store_docs_in_vector_db(chunks, collection_name, overwrite)
| 
 | ||||
| 
 | ||||
def store_docs_in_vector_db(docs, collection_name, overwrite: bool = False) -> bool:
    """Create a Chroma collection and add the given chunks to it.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.
        collection_name: Name of the collection to create.
        overwrite: When true, an existing collection with the same name is
            deleted before the new one is created.

    Returns:
        True when the chunks were added, or when the raised exception's class
        is named ``UniqueConstraintError``; False for any other exception.
    """
    contents = [item.page_content for item in docs]
    metas = [item.metadata for item in docs]

    try:
        if overwrite:
            for existing in CHROMA_CLIENT.list_collections():
                if collection_name == existing.name:
                    print(f"deleting existing collection {collection_name}")
                    CHROMA_CLIENT.delete_collection(name=collection_name)

        target = CHROMA_CLIENT.create_collection(
            name=collection_name,
            embedding_function=app.state.sentence_transformer_ef,
        )

        # One freshly generated id per chunk.
        generated_ids = [str(uuid.uuid1()) for _ in contents]
        target.add(documents=contents, metadatas=metas, ids=generated_ids)
    except Exception as err:
        print(err)
        # Matched by class name rather than by type; only this one error
        # is reported as success.
        # NOTE(review): presumably it means the collection already exists -
        # confirm against Chroma.
        return err.__class__.__name__ == "UniqueConstraintError"

    return True
| 
 | ||||
| 
 | ||||
| def get_loader(filename: str, file_content_type: str, file_path: str): | ||||
|     file_ext = filename.split(".")[-1].lower() | ||||
|     known_type = True | ||||
|  | @ -460,6 +477,33 @@ def store_doc( | |||
|             ) | ||||
| 
 | ||||
| 
 | ||||
class TextRAGForm(BaseModel):
    """Request body for the ``POST /text`` endpoint."""

    # Label for the text; passed as the ``name`` argument when storing.
    name: str
    # Raw text to be split into chunks and stored.
    content: str
    # Target collection; the endpoint substitutes a SHA-256 of ``content``
    # when this is left as None.
    collection_name: Optional[str] = None
| 
 | ||||
| 
 | ||||
@app.post("/text")
def store_text(
    form_data: TextRAGForm,
    user=Depends(get_current_user),
):
    """Store a raw text payload in the vector database.

    Args:
        form_data: Text, its name, and an optional collection name.
        user: Resolved by the ``get_current_user`` dependency; not otherwise
            used in the body.

    Returns:
        ``{"status": True, "collection_name": <name>}`` when storing succeeds.

    Raises:
        HTTPException: 500 with the default error message when
            ``store_text_in_vector_db`` reports failure.
    """
    collection_name = form_data.collection_name
    # None is a singleton, so compare by identity rather than equality.
    if collection_name is None:
        # No name supplied: derive one from the content itself.
        collection_name = calculate_sha256_string(form_data.content)

    result = store_text_in_vector_db(form_data.content, form_data.name, collection_name)

    if result:
        return {"status": True, "collection_name": collection_name}

    raise HTTPException(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        detail=ERROR_MESSAGES.DEFAULT(),
    )
| 
 | ||||
| 
 | ||||
| @app.get("/scan") | ||||
| def scan_docs_dir(user=Depends(get_admin_user)): | ||||
|     for path in Path(DOCS_DIR).rglob("./**/*"): | ||||
|  |  | |||
|  | @ -137,6 +137,8 @@ def rag_messages(docs, messages, template, k, embedding_function): | |||
|                     k=k, | ||||
|                     embedding_function=embedding_function, | ||||
|                 ) | ||||
|             elif doc["type"] == "text": | ||||
|                 context = doc["content"] | ||||
|             else: | ||||
|                 context = query_doc( | ||||
|                     collection_name=doc["collection_name"], | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Timothy J. Baek
						Timothy J. Baek