feat: Add epub support

This commit is contained in:
Dave Bauman 2024-01-13 08:46:56 -05:00
parent f079cb6b56
commit f559068186
No known key found for this signature in database
GPG key ID: 3AF96C27EDA2C9D4
6 changed files with 35 additions and 11 deletions

View file

@ -28,6 +28,11 @@ ENV WEBUI_JWT_SECRET_KEY "SECRET_KEY"
WORKDIR /app
# Install pandoc, the system binary that the pypandoc Python package shells out to.
# --no-install-recommends keeps optional packages out of the image, and the apt
# package lists are deleted in the same layer so they are never stored in it.
RUN apt-get update \
&& apt-get install -y --no-install-recommends pandoc \
&& rm -rf /var/lib/apt/lists/*
# Copy the embedding model weights from the build stage
RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2
COPY --from=build /app/onnx.tar.gz /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2

View file

@ -19,6 +19,7 @@ from langchain_community.document_loaders import (
PyPDFLoader,
CSVLoader,
Docx2txtLoader,
UnstructuredEPubLoader,
UnstructuredWordDocumentLoader,
UnstructuredMarkdownLoader,
UnstructuredXMLLoader,
@ -184,6 +185,8 @@ def store_doc(
loader = TextLoader(file_path)
elif file_ext in octet_markdown:
loader = UnstructuredMarkdownLoader(file_path)
elif file.content_type == "application/epub+zip":
loader = UnstructuredEPubLoader(file_path)
else:
loader = TextLoader(file_path)
known_type=False
@ -206,10 +209,16 @@ def store_doc(
)
except Exception as e:
print(e)
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.DEFAULT(e),
)
if "No pandoc was found" in str(e):
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.PANDOC_NOT_INSTALLED,
)
else:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=ERROR_MESSAGES.DEFAULT(e),
)
@app.get("/reset/db")

View file

@ -42,3 +42,5 @@ class ERROR_MESSAGES(str, Enum):
USER_NOT_FOUND = "We could not find what you're looking for :/"
API_KEY_NOT_FOUND = "Oops! It looks like there's a hiccup. The API key is missing. Please make sure to provide a valid API key to access this feature."
MALICIOUS = "Unusual activities detected, please try again in a few minutes."
PANDOC_NOT_INSTALLED = "Pandoc is not installed on the server. Please contact your administrator for assistance."

View file

@ -24,6 +24,7 @@ pypdf
docx2txt
unstructured
markdown
pypandoc
PyJWT
pyjwt[crypto]

View file

@ -121,13 +121,19 @@
error: ''
};
files = [...files, doc];
const res = await uploadDocToVectorDB(localStorage.token, '', file);
try {
files = [...files, doc];
const res = await uploadDocToVectorDB(localStorage.token, '', file);
if (res) {
doc.upload_status = true;
doc.collection_name = res.collection_name;
files = files;
if (res) {
doc.upload_status = true;
doc.collection_name = res.collection_name;
files = files;
}
} catch (e) {
// Remove the failed doc from the files array
files = files.filter((f) => f.name !== file.name);
toast.error(e);
}
};

View file

@ -12,6 +12,7 @@ export const WEB_UI_VERSION = 'v1.0.0-alpha-static';
export const REQUIRED_OLLAMA_VERSION = '0.1.16';
export const SUPPORTED_FILE_TYPE = [
'application/epub+zip',
'application/pdf',
'text/plain',
'text/csv',