forked from open-webui/open-webui
feat: toggle pdf ocr
This commit is contained in:
parent
96ada23272
commit
98948814fd
3 changed files with 137 additions and 90 deletions
|
@ -77,6 +77,7 @@ from constants import ERROR_MESSAGES
|
||||||
|
|
||||||
app = FastAPI()
|
app = FastAPI()
|
||||||
|
|
||||||
|
app.state.PDF_EXTRACT_IMAGES = False
|
||||||
app.state.CHUNK_SIZE = CHUNK_SIZE
|
app.state.CHUNK_SIZE = CHUNK_SIZE
|
||||||
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
|
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
|
||||||
app.state.RAG_TEMPLATE = RAG_TEMPLATE
|
app.state.RAG_TEMPLATE = RAG_TEMPLATE
|
||||||
|
@ -184,12 +185,15 @@ async def update_embedding_model(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/chunk")
|
@app.get("/config")
|
||||||
async def get_chunk_params(user=Depends(get_admin_user)):
|
async def get_rag_config(user=Depends(get_admin_user)):
|
||||||
return {
|
return {
|
||||||
"status": True,
|
"status": True,
|
||||||
"chunk_size": app.state.CHUNK_SIZE,
|
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
|
||||||
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
"chunk": {
|
||||||
|
"chunk_size": app.state.CHUNK_SIZE,
|
||||||
|
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -198,17 +202,24 @@ class ChunkParamUpdateForm(BaseModel):
|
||||||
chunk_overlap: int
|
chunk_overlap: int
|
||||||
|
|
||||||
|
|
||||||
@app.post("/chunk/update")
|
class ConfigUpdateForm(BaseModel):
|
||||||
async def update_chunk_params(
|
pdf_extract_images: bool
|
||||||
form_data: ChunkParamUpdateForm, user=Depends(get_admin_user)
|
chunk: ChunkParamUpdateForm
|
||||||
):
|
|
||||||
app.state.CHUNK_SIZE = form_data.chunk_size
|
|
||||||
app.state.CHUNK_OVERLAP = form_data.chunk_overlap
|
@app.post("/config/update")
|
||||||
|
async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
|
||||||
|
app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
|
||||||
|
app.state.CHUNK_SIZE = form_data.chunk.chunk_size
|
||||||
|
app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"status": True,
|
"status": True,
|
||||||
"chunk_size": app.state.CHUNK_SIZE,
|
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
|
||||||
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
"chunk": {
|
||||||
|
"chunk_size": app.state.CHUNK_SIZE,
|
||||||
|
"chunk_overlap": app.state.CHUNK_OVERLAP,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -364,7 +375,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
|
||||||
]
|
]
|
||||||
|
|
||||||
if file_ext == "pdf":
|
if file_ext == "pdf":
|
||||||
loader = PyPDFLoader(file_path, extract_images=True)
|
loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
|
||||||
elif file_ext == "csv":
|
elif file_ext == "csv":
|
||||||
loader = CSVLoader(file_path)
|
loader = CSVLoader(file_path)
|
||||||
elif file_ext == "rst":
|
elif file_ext == "rst":
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
import { RAG_API_BASE_URL } from '$lib/constants';
|
import { RAG_API_BASE_URL } from '$lib/constants';
|
||||||
|
|
||||||
export const getChunkParams = async (token: string) => {
|
export const getRAGConfig = async (token: string) => {
|
||||||
let error = null;
|
let error = null;
|
||||||
|
|
||||||
const res = await fetch(`${RAG_API_BASE_URL}/chunk`, {
|
const res = await fetch(`${RAG_API_BASE_URL}/config`, {
|
||||||
method: 'GET',
|
method: 'GET',
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
|
@ -27,18 +27,27 @@ export const getChunkParams = async (token: string) => {
|
||||||
return res;
|
return res;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const updateChunkParams = async (token: string, size: number, overlap: number) => {
|
type ChunkConfigForm = {
|
||||||
|
chunk_size: number;
|
||||||
|
chunk_overlap: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
type RAGConfigForm = {
|
||||||
|
pdf_extract_images: boolean;
|
||||||
|
chunk: ChunkConfigForm;
|
||||||
|
};
|
||||||
|
|
||||||
|
export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {
|
||||||
let error = null;
|
let error = null;
|
||||||
|
|
||||||
const res = await fetch(`${RAG_API_BASE_URL}/chunk/update`, {
|
const res = await fetch(`${RAG_API_BASE_URL}/config/update`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: {
|
headers: {
|
||||||
'Content-Type': 'application/json',
|
'Content-Type': 'application/json',
|
||||||
Authorization: `Bearer ${token}`
|
Authorization: `Bearer ${token}`
|
||||||
},
|
},
|
||||||
body: JSON.stringify({
|
body: JSON.stringify({
|
||||||
chunk_size: size,
|
...payload
|
||||||
chunk_overlap: overlap
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.then(async (res) => {
|
.then(async (res) => {
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
<script lang="ts">
|
<script lang="ts">
|
||||||
import { getDocs } from '$lib/apis/documents';
|
import { getDocs } from '$lib/apis/documents';
|
||||||
import {
|
import {
|
||||||
getChunkParams,
|
getRAGConfig,
|
||||||
|
updateRAGConfig,
|
||||||
getQuerySettings,
|
getQuerySettings,
|
||||||
scanDocs,
|
scanDocs,
|
||||||
updateChunkParams,
|
|
||||||
updateQuerySettings
|
updateQuerySettings
|
||||||
} from '$lib/apis/rag';
|
} from '$lib/apis/rag';
|
||||||
import { documents } from '$lib/stores';
|
import { documents } from '$lib/stores';
|
||||||
|
@ -17,6 +17,7 @@
|
||||||
|
|
||||||
let chunkSize = 0;
|
let chunkSize = 0;
|
||||||
let chunkOverlap = 0;
|
let chunkOverlap = 0;
|
||||||
|
let pdfExtractImages = true;
|
||||||
|
|
||||||
let querySettings = {
|
let querySettings = {
|
||||||
template: '',
|
template: '',
|
||||||
|
@ -35,16 +36,24 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
const submitHandler = async () => {
|
const submitHandler = async () => {
|
||||||
const res = await updateChunkParams(localStorage.token, chunkSize, chunkOverlap);
|
const res = await updateRAGConfig(localStorage.token, {
|
||||||
|
pdf_extract_images: pdfExtractImages,
|
||||||
|
chunk: {
|
||||||
|
chunk_overlap: chunkOverlap,
|
||||||
|
chunk_size: chunkSize
|
||||||
|
}
|
||||||
|
});
|
||||||
querySettings = await updateQuerySettings(localStorage.token, querySettings);
|
querySettings = await updateQuerySettings(localStorage.token, querySettings);
|
||||||
};
|
};
|
||||||
|
|
||||||
onMount(async () => {
|
onMount(async () => {
|
||||||
const res = await getChunkParams(localStorage.token);
|
const res = await getRAGConfig(localStorage.token);
|
||||||
|
|
||||||
if (res) {
|
if (res) {
|
||||||
chunkSize = res.chunk_size;
|
pdfExtractImages = res.pdf_extract_images;
|
||||||
chunkOverlap = res.chunk_overlap;
|
|
||||||
|
chunkSize = res.chunk.chunk_size;
|
||||||
|
chunkOverlap = res.chunk.chunk_overlap;
|
||||||
}
|
}
|
||||||
|
|
||||||
querySettings = await getQuerySettings(localStorage.token);
|
querySettings = await getQuerySettings(localStorage.token);
|
||||||
|
@ -124,82 +133,100 @@
|
||||||
|
|
||||||
<hr class=" dark:border-gray-700" />
|
<hr class=" dark:border-gray-700" />
|
||||||
|
|
||||||
<div class=" ">
|
<div class=" space-y-3">
|
||||||
<div class=" text-sm font-medium">Chunk Params</div>
|
<div class=" space-y-3">
|
||||||
|
<div class=" text-sm font-medium">Chunk Params</div>
|
||||||
|
|
||||||
<div class=" flex">
|
<div class=" flex gap-2">
|
||||||
<div class=" flex w-full justify-between">
|
<div class=" flex w-full justify-between gap-2">
|
||||||
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
|
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
|
||||||
|
|
||||||
<div class="self-center p-3">
|
<div class="self-center">
|
||||||
<input
|
<input
|
||||||
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||||
type="number"
|
type="number"
|
||||||
placeholder="Enter Chunk Size"
|
placeholder="Enter Chunk Size"
|
||||||
bind:value={chunkSize}
|
bind:value={chunkSize}
|
||||||
autocomplete="off"
|
autocomplete="off"
|
||||||
min="0"
|
min="0"
|
||||||
/>
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="flex w-full gap-2">
|
||||||
|
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
|
||||||
|
|
||||||
|
<div class="self-center">
|
||||||
|
<input
|
||||||
|
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||||
|
type="number"
|
||||||
|
placeholder="Enter Chunk Overlap"
|
||||||
|
bind:value={chunkOverlap}
|
||||||
|
autocomplete="off"
|
||||||
|
min="0"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="flex w-full">
|
<div>
|
||||||
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
|
<div class="flex justify-between items-center text-xs">
|
||||||
|
<div class=" text-xs font-medium">PDF Extract Images (OCR)</div>
|
||||||
|
|
||||||
<div class="self-center p-3">
|
<button
|
||||||
<input
|
class=" text-xs font-medium text-gray-500"
|
||||||
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
type="button"
|
||||||
type="number"
|
on:click={() => {
|
||||||
placeholder="Enter Chunk Overlap"
|
pdfExtractImages = !pdfExtractImages;
|
||||||
bind:value={chunkOverlap}
|
}}>{pdfExtractImages ? 'On' : 'Off'}</button
|
||||||
autocomplete="off"
|
>
|
||||||
min="0"
|
|
||||||
/>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class=" text-sm font-medium">Query Params</div>
|
|
||||||
|
|
||||||
<div class=" flex">
|
|
||||||
<div class=" flex w-full justify-between">
|
|
||||||
<div class="self-center text-xs font-medium flex-1">Top K</div>
|
|
||||||
|
|
||||||
<div class="self-center p-3">
|
|
||||||
<input
|
|
||||||
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
|
||||||
type="number"
|
|
||||||
placeholder="Enter Top K"
|
|
||||||
bind:value={querySettings.k}
|
|
||||||
autocomplete="off"
|
|
||||||
min="0"
|
|
||||||
/>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- <div class="flex w-full">
|
|
||||||
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
|
|
||||||
|
|
||||||
<div class="self-center p-3">
|
|
||||||
<input
|
|
||||||
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
|
||||||
type="number"
|
|
||||||
placeholder="Enter Chunk Overlap"
|
|
||||||
bind:value={chunkOverlap}
|
|
||||||
autocomplete="off"
|
|
||||||
min="0"
|
|
||||||
/>
|
|
||||||
</div>
|
|
||||||
</div> -->
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div>
|
<div>
|
||||||
<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
|
<div class=" text-sm font-medium">Query Params</div>
|
||||||
<textarea
|
|
||||||
bind:value={querySettings.template}
|
<div class=" flex py-2">
|
||||||
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
|
<div class=" flex w-full justify-between gap-2">
|
||||||
rows="4"
|
<div class="self-center text-xs font-medium flex-1">Top K</div>
|
||||||
/>
|
|
||||||
|
<div class="self-center">
|
||||||
|
<input
|
||||||
|
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||||
|
type="number"
|
||||||
|
placeholder="Enter Top K"
|
||||||
|
bind:value={querySettings.k}
|
||||||
|
autocomplete="off"
|
||||||
|
min="0"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- <div class="flex w-full">
|
||||||
|
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
|
||||||
|
|
||||||
|
<div class="self-center p-3">
|
||||||
|
<input
|
||||||
|
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
|
||||||
|
type="number"
|
||||||
|
placeholder="Enter Chunk Overlap"
|
||||||
|
bind:value={chunkOverlap}
|
||||||
|
autocomplete="off"
|
||||||
|
min="0"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div> -->
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
|
||||||
|
<textarea
|
||||||
|
bind:value={querySettings.template}
|
||||||
|
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
|
||||||
|
rows="4"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
Loading…
Reference in a new issue