Merge pull request #1050 from jannikstdl/rag-pdf-ocr

feat: added OCR functionality to the PDF loader
2024-03-06 00:45:33 -05:00 · 2024-03-06 00:45:33 -05:00 · 8fb5f54751
commit 8fb5f54751
parent 2111398d13 089a63e0c6
2 changed files with 2 additions and 1 deletion
--- a/backend/apps/rag/main.py
+++ b/backend/apps/rag/main.py
@@ -425,7 +425,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
    ]

    if file_ext == "pdf":
-        loader = PyPDFLoader(file_path)
+        loader = PyPDFLoader(file_path, extract_images=True)
    elif file_ext == "csv":
        loader = CSVLoader(file_path)
    elif file_ext == "rst":
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -34,6 +34,7 @@ pandas
 openpyxl
 pyxlsb
 xlrd
+rapidocr-onnxruntime

 faster-whisper