Merge pull request #1050 from jannikstdl/rag-pdf-ocr

feat: added OCR functionality to the PDF loader
This commit is contained in:
Timothy Jaeryang Baek 2024-03-06 00:45:33 -05:00 committed by GitHub
commit 8fb5f54751
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 2 additions and 1 deletions

View file

@@ -425,7 +425,7 @@ def get_loader(filename: str, file_content_type: str, file_path: str):
] ]
if file_ext == "pdf": if file_ext == "pdf":
loader = PyPDFLoader(file_path) loader = PyPDFLoader(file_path, extract_images=True)
elif file_ext == "csv": elif file_ext == "csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file_ext == "rst": elif file_ext == "rst":

View file

@@ -34,6 +34,7 @@ pandas
openpyxl openpyxl
pyxlsb pyxlsb
xlrd xlrd
rapidocr-onnxruntime
faster-whisper faster-whisper