feat: toggle pdf ocr

This commit is contained in:
Timothy J. Baek 2024-03-10 13:32:34 -07:00
parent 96ada23272
commit 98948814fd
3 changed files with 137 additions and 90 deletions

View file

@ -77,6 +77,7 @@ from constants import ERROR_MESSAGES
app = FastAPI()
app.state.PDF_EXTRACT_IMAGES = False
app.state.CHUNK_SIZE = CHUNK_SIZE
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
app.state.RAG_TEMPLATE = RAG_TEMPLATE
@ -184,12 +185,15 @@ async def update_embedding_model(
}
@app.get("/chunk")
async def get_chunk_params(user=Depends(get_admin_user)):
@app.get("/config")
async def get_rag_config(user=Depends(get_admin_user)):
return {
"status": True,
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
"chunk": {
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
},
}
@ -198,17 +202,24 @@ class ChunkParamUpdateForm(BaseModel):
chunk_overlap: int
@app.post("/chunk/update")
async def update_chunk_params(
form_data: ChunkParamUpdateForm, user=Depends(get_admin_user)
):
app.state.CHUNK_SIZE = form_data.chunk_size
app.state.CHUNK_OVERLAP = form_data.chunk_overlap
class ConfigUpdateForm(BaseModel):
pdf_extract_images: bool
chunk: ChunkParamUpdateForm
@app.post("/config/update")
async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
app.state.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
return {
"status": True,
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
"pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
"chunk": {
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
},
}
@ -364,7 +375,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
]
if file_ext == "pdf":
loader = PyPDFLoader(file_path, extract_images=True)
loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
elif file_ext == "csv":
loader = CSVLoader(file_path)
elif file_ext == "rst":

View file

@ -1,9 +1,9 @@
import { RAG_API_BASE_URL } from '$lib/constants';
export const getChunkParams = async (token: string) => {
export const getRAGConfig = async (token: string) => {
let error = null;
const res = await fetch(`${RAG_API_BASE_URL}/chunk`, {
const res = await fetch(`${RAG_API_BASE_URL}/config`, {
method: 'GET',
headers: {
'Content-Type': 'application/json',
@ -27,18 +27,27 @@ export const getChunkParams = async (token: string) => {
return res;
};
export const updateChunkParams = async (token: string, size: number, overlap: number) => {
type ChunkConfigForm = {
chunk_size: number;
chunk_overlap: number;
};
type RAGConfigForm = {
pdf_extract_images: boolean;
chunk: ChunkConfigForm;
};
export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {
let error = null;
const res = await fetch(`${RAG_API_BASE_URL}/chunk/update`, {
const res = await fetch(`${RAG_API_BASE_URL}/config/update`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${token}`
},
body: JSON.stringify({
chunk_size: size,
chunk_overlap: overlap
...payload
})
})
.then(async (res) => {

View file

@ -1,10 +1,10 @@
<script lang="ts">
import { getDocs } from '$lib/apis/documents';
import {
getChunkParams,
getRAGConfig,
updateRAGConfig,
getQuerySettings,
scanDocs,
updateChunkParams,
updateQuerySettings
} from '$lib/apis/rag';
import { documents } from '$lib/stores';
@ -17,6 +17,7 @@
let chunkSize = 0;
let chunkOverlap = 0;
let pdfExtractImages = true;
let querySettings = {
template: '',
@ -35,16 +36,24 @@
};
const submitHandler = async () => {
const res = await updateChunkParams(localStorage.token, chunkSize, chunkOverlap);
const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
chunk: {
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
}
});
querySettings = await updateQuerySettings(localStorage.token, querySettings);
};
onMount(async () => {
const res = await getChunkParams(localStorage.token);
const res = await getRAGConfig(localStorage.token);
if (res) {
chunkSize = res.chunk_size;
chunkOverlap = res.chunk_overlap;
pdfExtractImages = res.pdf_extract_images;
chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;
}
querySettings = await getQuerySettings(localStorage.token);
@ -124,82 +133,100 @@
<hr class=" dark:border-gray-700" />
<div class=" ">
<div class=" text-sm font-medium">Chunk Params</div>
<div class=" space-y-3">
<div class=" space-y-3">
<div class=" text-sm font-medium">Chunk Params</div>
<div class=" flex">
<div class=" flex w-full justify-between">
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
<div class=" flex gap-2">
<div class=" flex w-full justify-between gap-2">
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
<div class="self-center p-3">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Size"
bind:value={chunkSize}
autocomplete="off"
min="0"
/>
<div class="self-center">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Size"
bind:value={chunkSize}
autocomplete="off"
min="0"
/>
</div>
</div>
<div class="flex w-full gap-2">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div>
</div>
<div class="flex w-full">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div>
<div class="flex justify-between items-center text-xs">
<div class=" text-xs font-medium">PDF Extract Images (OCR)</div>
<div class="self-center p-3">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
<button
class=" text-xs font-medium text-gray-500"
type="button"
on:click={() => {
pdfExtractImages = !pdfExtractImages;
}}>{pdfExtractImages ? 'On' : 'Off'}</button
>
</div>
</div>
</div>
<div class=" text-sm font-medium">Query Params</div>
<div class=" flex">
<div class=" flex w-full justify-between">
<div class="self-center text-xs font-medium flex-1">Top K</div>
<div class="self-center p-3">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Top K"
bind:value={querySettings.k}
autocomplete="off"
min="0"
/>
</div>
</div>
<!-- <div class="flex w-full">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center p-3">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div> -->
</div>
<div>
<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
<textarea
bind:value={querySettings.template}
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
rows="4"
/>
<div class=" text-sm font-medium">Query Params</div>
<div class=" flex py-2">
<div class=" flex w-full justify-between gap-2">
<div class="self-center text-xs font-medium flex-1">Top K</div>
<div class="self-center">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Top K"
bind:value={querySettings.k}
autocomplete="off"
min="0"
/>
</div>
</div>
<!-- <div class="flex w-full">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center p-3">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div> -->
</div>
<div>
<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
<textarea
bind:value={querySettings.template}
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
rows="4"
/>
</div>
</div>
</div>
</div>