feat: toggle pdf ocr

This commit is contained in:
Timothy J. Baek 2024-03-10 13:32:34 -07:00
parent 96ada23272
commit 98948814fd
3 changed files with 137 additions and 90 deletions

View file

@@ -77,6 +77,7 @@ from constants import ERROR_MESSAGES
app = FastAPI() app = FastAPI()
app.state.PDF_EXTRACT_IMAGES = False
app.state.CHUNK_SIZE = CHUNK_SIZE app.state.CHUNK_SIZE = CHUNK_SIZE
app.state.CHUNK_OVERLAP = CHUNK_OVERLAP app.state.CHUNK_OVERLAP = CHUNK_OVERLAP
app.state.RAG_TEMPLATE = RAG_TEMPLATE app.state.RAG_TEMPLATE = RAG_TEMPLATE
@@ -184,12 +185,15 @@ async def update_embedding_model(
} }
@app.get("/chunk") @app.get("/config")
async def get_chunk_params(user=Depends(get_admin_user)): async def get_rag_config(user=Depends(get_admin_user)):
return { return {
"status": True, "status": True,
"chunk_size": app.state.CHUNK_SIZE, "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
"chunk_overlap": app.state.CHUNK_OVERLAP, "chunk": {
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
},
} }
@@ -198,17 +202,24 @@ class ChunkParamUpdateForm(BaseModel):
chunk_overlap: int chunk_overlap: int
@app.post("/chunk/update") class ConfigUpdateForm(BaseModel):
async def update_chunk_params( pdf_extract_images: bool
form_data: ChunkParamUpdateForm, user=Depends(get_admin_user) chunk: ChunkParamUpdateForm
):
app.state.CHUNK_SIZE = form_data.chunk_size
app.state.CHUNK_OVERLAP = form_data.chunk_overlap @app.post("/config/update")
async def update_rag_config(form_data: ConfigUpdateForm, user=Depends(get_admin_user)):
app.state.PDF_EXTRACT_IMAGES = form_data.pdf_extract_images
app.state.CHUNK_SIZE = form_data.chunk.chunk_size
app.state.CHUNK_OVERLAP = form_data.chunk.chunk_overlap
return { return {
"status": True, "status": True,
"chunk_size": app.state.CHUNK_SIZE, "pdf_extract_images": app.state.PDF_EXTRACT_IMAGES,
"chunk_overlap": app.state.CHUNK_OVERLAP, "chunk": {
"chunk_size": app.state.CHUNK_SIZE,
"chunk_overlap": app.state.CHUNK_OVERLAP,
},
} }
@@ -364,7 +375,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
] ]
if file_ext == "pdf": if file_ext == "pdf":
loader = PyPDFLoader(file_path, extract_images=True) loader = PyPDFLoader(file_path, extract_images=app.state.PDF_EXTRACT_IMAGES)
elif file_ext == "csv": elif file_ext == "csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file_ext == "rst": elif file_ext == "rst":

View file

@@ -1,9 +1,9 @@
import { RAG_API_BASE_URL } from '$lib/constants'; import { RAG_API_BASE_URL } from '$lib/constants';
export const getChunkParams = async (token: string) => { export const getRAGConfig = async (token: string) => {
let error = null; let error = null;
const res = await fetch(`${RAG_API_BASE_URL}/chunk`, { const res = await fetch(`${RAG_API_BASE_URL}/config`, {
method: 'GET', method: 'GET',
headers: { headers: {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
@@ -27,18 +27,27 @@ export const getChunkParams = async (token: string) => {
return res; return res;
}; };
export const updateChunkParams = async (token: string, size: number, overlap: number) => { type ChunkConfigForm = {
chunk_size: number;
chunk_overlap: number;
};
type RAGConfigForm = {
pdf_extract_images: boolean;
chunk: ChunkConfigForm;
};
export const updateRAGConfig = async (token: string, payload: RAGConfigForm) => {
let error = null; let error = null;
const res = await fetch(`${RAG_API_BASE_URL}/chunk/update`, { const res = await fetch(`${RAG_API_BASE_URL}/config/update`, {
method: 'POST', method: 'POST',
headers: { headers: {
'Content-Type': 'application/json', 'Content-Type': 'application/json',
Authorization: `Bearer ${token}` Authorization: `Bearer ${token}`
}, },
body: JSON.stringify({ body: JSON.stringify({
chunk_size: size, ...payload
chunk_overlap: overlap
}) })
}) })
.then(async (res) => { .then(async (res) => {

View file

@@ -1,10 +1,10 @@
<script lang="ts"> <script lang="ts">
import { getDocs } from '$lib/apis/documents'; import { getDocs } from '$lib/apis/documents';
import { import {
getChunkParams, getRAGConfig,
updateRAGConfig,
getQuerySettings, getQuerySettings,
scanDocs, scanDocs,
updateChunkParams,
updateQuerySettings updateQuerySettings
} from '$lib/apis/rag'; } from '$lib/apis/rag';
import { documents } from '$lib/stores'; import { documents } from '$lib/stores';
@@ -17,6 +17,7 @@
let chunkSize = 0; let chunkSize = 0;
let chunkOverlap = 0; let chunkOverlap = 0;
let pdfExtractImages = true;
let querySettings = { let querySettings = {
template: '', template: '',
@@ -35,16 +36,24 @@
}; };
const submitHandler = async () => { const submitHandler = async () => {
const res = await updateChunkParams(localStorage.token, chunkSize, chunkOverlap); const res = await updateRAGConfig(localStorage.token, {
pdf_extract_images: pdfExtractImages,
chunk: {
chunk_overlap: chunkOverlap,
chunk_size: chunkSize
}
});
querySettings = await updateQuerySettings(localStorage.token, querySettings); querySettings = await updateQuerySettings(localStorage.token, querySettings);
}; };
onMount(async () => { onMount(async () => {
const res = await getChunkParams(localStorage.token); const res = await getRAGConfig(localStorage.token);
if (res) { if (res) {
chunkSize = res.chunk_size; pdfExtractImages = res.pdf_extract_images;
chunkOverlap = res.chunk_overlap;
chunkSize = res.chunk.chunk_size;
chunkOverlap = res.chunk.chunk_overlap;
} }
querySettings = await getQuerySettings(localStorage.token); querySettings = await getQuerySettings(localStorage.token);
@@ -124,82 +133,100 @@
<hr class=" dark:border-gray-700" /> <hr class=" dark:border-gray-700" />
<div class=" "> <div class=" space-y-3">
<div class=" text-sm font-medium">Chunk Params</div> <div class=" space-y-3">
<div class=" text-sm font-medium">Chunk Params</div>
<div class=" flex"> <div class=" flex gap-2">
<div class=" flex w-full justify-between"> <div class=" flex w-full justify-between gap-2">
<div class="self-center text-xs font-medium min-w-fit">Chunk Size</div> <div class="self-center text-xs font-medium min-w-fit">Chunk Size</div>
<div class="self-center p-3"> <div class="self-center">
<input <input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600" class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number" type="number"
placeholder="Enter Chunk Size" placeholder="Enter Chunk Size"
bind:value={chunkSize} bind:value={chunkSize}
autocomplete="off" autocomplete="off"
min="0" min="0"
/> />
</div>
</div>
<div class="flex w-full gap-2">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div> </div>
</div> </div>
<div class="flex w-full"> <div>
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div> <div class="flex justify-between items-center text-xs">
<div class=" text-xs font-medium">PDF Extract Images (OCR)</div>
<div class="self-center p-3"> <button
<input class=" text-xs font-medium text-gray-500"
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600" type="button"
type="number" on:click={() => {
placeholder="Enter Chunk Overlap" pdfExtractImages = !pdfExtractImages;
bind:value={chunkOverlap} }}>{pdfExtractImages ? 'On' : 'Off'}</button
autocomplete="off" >
min="0"
/>
</div> </div>
</div> </div>
</div> </div>
<div class=" text-sm font-medium">Query Params</div>
<div class=" flex">
<div class=" flex w-full justify-between">
<div class="self-center text-xs font-medium flex-1">Top K</div>
<div class="self-center p-3">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Top K"
bind:value={querySettings.k}
autocomplete="off"
min="0"
/>
</div>
</div>
<!-- <div class="flex w-full">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center p-3">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div> -->
</div>
<div> <div>
<div class=" mb-2.5 text-sm font-medium">RAG Template</div> <div class=" text-sm font-medium">Query Params</div>
<textarea
bind:value={querySettings.template} <div class=" flex py-2">
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none" <div class=" flex w-full justify-between gap-2">
rows="4" <div class="self-center text-xs font-medium flex-1">Top K</div>
/>
<div class="self-center">
<input
class=" w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Top K"
bind:value={querySettings.k}
autocomplete="off"
min="0"
/>
</div>
</div>
<!-- <div class="flex w-full">
<div class=" self-center text-xs font-medium min-w-fit">Chunk Overlap</div>
<div class="self-center p-3">
<input
class="w-full rounded py-1.5 px-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none border border-gray-100 dark:border-gray-600"
type="number"
placeholder="Enter Chunk Overlap"
bind:value={chunkOverlap}
autocomplete="off"
min="0"
/>
</div>
</div> -->
</div>
<div>
<div class=" mb-2.5 text-sm font-medium">RAG Template</div>
<textarea
bind:value={querySettings.template}
class="w-full rounded p-4 text-sm dark:text-gray-300 dark:bg-gray-800 outline-none resize-none"
rows="4"
/>
</div>
</div> </div>
</div> </div>
</div> </div>