From 089a63e0c68bb1c2693a15a8469a278dc358b111 Mon Sep 17 00:00:00 2001 From: Jannik Streidl Date: Tue, 5 Mar 2024 22:25:25 +0100 Subject: [PATCH] feat: added ocr functionality to the pdf loader --- backend/apps/rag/main.py | 2 +- backend/requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 2a8b2a49..ee07d51a 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -419,7 +419,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str): ] if file_ext == "pdf": - loader = PyPDFLoader(file_path) + loader = PyPDFLoader(file_path, extract_images=True) elif file_ext == "csv": loader = CSVLoader(file_path) elif file_ext == "rst": diff --git a/backend/requirements.txt b/backend/requirements.txt index 0cacacd8..9de3df96 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -33,6 +33,7 @@ pandas openpyxl pyxlsb xlrd +rapidocr-onnxruntime faster-whisper