forked from open-webui/open-webui
feat: toggle pdf ocr
This commit is contained in:
parent
96ada23272
commit
98948814fd
3 changed files with 137 additions and 90 deletions
|
@ -77,6 +77,7 @@ from constants import ERROR_MESSAGES
|
|||
|
||||
app = FastAPI()
|
||||
|
||||
app.state.PDF_EXTRACT_IMAGES = False
|
||||
app.state.CHUNK_SIZE = CHUNK_SIZE
|
||||
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
|
||||
app.state.RAG_TEMPLATE = RAG_TEMPLATE
|
||||
|
@ -184,12 +185,15 @@ async def update_embedding_model(
|
|||
}
|
||||
|
||||
|
||||
@app.get("/chunk")
|
||||
async def get_chunk_params(user=Depends(get_admin_user)):
|
||||
@app.get("/config")
|
||||
async def get_rag_config(user=Depends(get_admin_user)):
|
||||
return {
|
||||
"status": True,
|
||||
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
|
||||
"chunk": {
|
||||
"chunk_size": app.state.CHUNK_SIZE,
|
||||
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
@ -198,17 +202,24 @@ class ChunkParamUpdateForm(BaseModel):
|
|||
chunk_overlap: int
|
||||
|
||||
|
||||
@app.post("/chunk/update")
|
||||
async def update_chunk_params(
|
||||
form_data: ChunkParamUpdateForm, user=Depends(get_admin_user)
|
||||
):
|
||||
app.state.CHUNK_SIZE = form_data.chunk_size
|
||||
app.state.CHUNK_OVERLAP = form_data.chunk_overlap
|
||||
class ConfigUpdateForm(BaseModel):
|
||||
pdf_extract_images: bool
|
||||
chunk: ChunkParamUpdateForm
|
||||
|
||||
|
||||
@app.post("/config/update")
|
||||
async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
|
||||
app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
|
||||
app.state.CHUNK_SIZE = form_data.chunk.chunk_size
|
||||
app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
|
||||
|
||||
return {
|
||||
"status": True,
|
||||
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
|
||||
"chunk": {
|
||||
"chunk_size": app.state.CHUNK_SIZE,
|
||||
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
@ -364,7 +375,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
|
|||
]
|
||||
|
||||
if file_ext == "pdf":
|
||||
loader = PyPDFLoader(file_path, extract_images=True)
|
||||
loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
|
||||
elif file_ext == "csv":
|
||||
loader = CSVLoader(file_path)
|
||||
elif file_ext == "rst":
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
import { RAG_API_BASE_URL } from '$lib/constants';
|
||||
|
||||
export const getChunkParams = async (token: string) => {
|
||||
export const getRAGConfig = async (token: string) => {
|
||||
let error = null;
|
||||
|
||||
const res = await fetch(`${RAG_API_BASE_URL}/chunk`, {
|
||||
const res = await fetch(`${RAG_API_BASE_URL}/config`, {
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
|
@ -27,18 +27,27 @@ export const getChunkParams = async (token: string) => {
|
|||
return res;
|
||||
};
|
||||
|
||||
export const updateChunkParams = async (token: string, size: number, overlap: number) => {
|
||||
type ChunkConfigForm = {
|
||||
chunk_size: number;
|
||||
chunk_overlap: number;
|
||||
};
|
||||
|
||||
type RAGConfigForm = {
|
||||
pdf_extract_images: boolean;
|
||||
chunk: ChunkConfigForm;
|
||||
};
|
||||
|
||||
export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {
|
||||
let error = null;
|
||||
|
||||
const res = await fetch(`${RAG_API_BASE_URL}/chunk/update`, {
|
||||
const res = await fetch(`${RAG_API_BASE_URL}/config/update`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
Authorization: `Bearer ${token}`
|
||||
},
|
||||
body: JSON.stringify({
|
||||
chunk_size: size,
|
||||
chunk_overlap: overlap
|
||||
...payload
|
||||
})
|
||||
})
|
||||
.then(async (res) => {
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
<script lang="ts">
|
||||
import { getDocs } from '$lib/apis/documents';
|
||||
import {
|
||||
getChunkParams,
|
||||
getRAGConfig,
|
||||
updateRAGConfig,
|
||||
getQuerySettings,
|
||||
scanDocs,
|
||||
updateChunkParams,
|
||||
updateQuerySettings
|
||||
} from '$lib/apis/rag';
|
||||
import { documents } from '$lib/stores';
|
||||
|
@ -17,6 +17,7 @@
|
|||
|
||||
let chunkSize = 0;
|
||||
let chunkOverlap = 0;
|
||||
let pdfExtractImages = true;
|
||||
|
||||
let querySettings = {
|
||||
template: '',
|
||||
|
@ -35,16 +36,24 @@
|
|||
};
|
||||
|
||||
const submitHandler = async () => {
|
||||
const res = await updateChunkParams(localStorage.token, chunkSize, chunkOverlap);
|
||||
const res = await updateRAGConfig(localStorage.token, {
|
||||
pdf_extract_images: pdfExtractImages,
|
||||
chunk: {
|
||||
chunk_overlap: chunkOverlap,
|
||||
chunk_size: chunkSize
|
||||
}
|
||||
});
|
||||
querySettings = await updateQuerySettings(localStorage.token, querySettings);
|
||||
};
|
||||
|
||||
onMount(async () => {
|
||||
const res = await getChunkParams(localStorage.token);
|
||||
const res = await getRAGConfig(localStorage.token);
|
||||
|
||||
if (res) {
|
||||
chunkSize = res.chunk_size;
|
||||
chunkOverlap = res.chunk_overlap;
|
||||
pdfExtractImages = res.pdf_extract_images;
|
||||
|
||||
chunkSize = res.chunk.chunk_size;
|
||||
chunkOverlap = res.chunk.chunk_overlap;
|
||||
}
|
||||
|
||||
querySettings = await getQuerySettings(localStorage.token);
|
||||
|
@ -124,14 +133,15 @@
|
|||
|
||||
<hr class=" dark:border-gray-700" />
|
||||
|
||||
<div class=" ">
|
||||
<div class=" space-y-3">
|
||||
<div class=" space-y-3">
|
||||
<div class=" text-sm font-medium">Chunk Params</div>
|
||||
|
||||
<div class=" flex">
|
||||
<div class=" flex w-full justify-between">
|
||||
<div class=" flex gap-2">
|
||||
<div class=" flex w-full justify-between gap-2">
|
||||
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
|
||||
|
||||
<div class="self-center p-3">
|
||||
<div class="self-center">
|
||||
<input
|
||||
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||
type="number"
|
||||
|
@ -143,10 +153,10 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex w-full">
|
||||
<div class="flex w-full gap-2">
|
||||
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
|
||||
|
||||
<div class="self-center p-3">
|
||||
<div class="self-center">
|
||||
<input
|
||||
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||
type="number"
|
||||
|
@ -159,13 +169,29 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<div class="flex justify-between items-center text-xs">
|
||||
<div class=" text-xs font-medium">PDF Extract Images (OCR)</div>
|
||||
|
||||
<button
|
||||
class=" text-xs font-medium text-gray-500"
|
||||
type="button"
|
||||
on:click={() => {
|
||||
pdfExtractImages = !pdfExtractImages;
|
||||
}}>{pdfExtractImages ? 'On' : 'Off'}</button
|
||||
>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<div class=" text-sm font-medium">Query Params</div>
|
||||
|
||||
<div class=" flex">
|
||||
<div class=" flex w-full justify-between">
|
||||
<div class=" flex py-2">
|
||||
<div class=" flex w-full justify-between gap-2">
|
||||
<div class="self-center text-xs font-medium flex-1">Top K</div>
|
||||
|
||||
<div class="self-center p-3">
|
||||
<div class="self-center">
|
||||
<input
|
||||
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||
type="number"
|
||||
|
@ -203,6 +229,7 @@
|
|||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="flex justify-end pt-3 text-sm font-medium">
|
||||
<button
|
||||
|
|
Loading…
Reference in a new issue