diff --git a/.github/workflows/docker-build.yaml b/.github/workflows/docker-build.yaml index ebc4cc42..44c9c654 100644 --- a/.github/workflows/docker-build.yaml +++ b/.github/workflows/docker-build.yaml @@ -1,5 +1,4 @@ -# -name: Create and publish a Docker image +name: Create and publish Docker images with specific build args # Configures this workflow to run every time a change is pushed to the branch called `release`. on: @@ -24,7 +23,7 @@ jobs: permissions: contents: read packages: write - # + steps: - name: Checkout repository uses: actions/checkout@v4 @@ -42,8 +41,8 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Extract metadata for Docker images - id: meta + - name: Extract metadata for Docker images (default latest tag) + id: meta-latest uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} @@ -54,14 +53,31 @@ jobs: type=sha,prefix=git- type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - flavor: | - latest=${{ github.ref == 'refs/heads/main' }} + type=raw,value=latest - - name: Build and push Docker image + - name: Build and push Docker image (latest) uses: docker/build-push-action@v5 with: context: . push: true platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + tags: ${{ steps.meta-latest.outputs.tags }} + labels: ${{ steps.meta-latest.outputs.labels }} + + - name: Build and push Docker image with CUDA + uses: docker/build-push-action@v5 + with: + context: . + push: true + platforms: linux/amd64,linux/arm64 + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cuda + build-args: USE_CUDA=true + + - name: Build and push Docker image with Ollama + uses: docker/build-push-action@v5 + with: + context: .
+ push: true + platforms: linux/amd64,linux/arm64 + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:ollama + build-args: USE_OLLAMA=true diff --git a/Dockerfile b/Dockerfile index f76f8c32..e1891522 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,82 +1,111 @@ # syntax=docker/dockerfile:1 +# Initialize device type args +# use build args in the docker build command with --build-arg="BUILDARG=true" +ARG USE_CUDA=false +ARG USE_OLLAMA=false +# Tested with cu117 for CUDA 11 and cu121 for CUDA 12 (default) +ARG USE_CUDA_VER=cu121 +# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers +# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard +# for better performance and multilingual support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB) +# IMPORTANT: If you switch away from the default model (all-MiniLM-L6-v2) or back to it, you cannot use RAG Chat with documents previously loaded in the WebUI until you re-embed them. +ARG USE_EMBEDDING_MODEL=all-MiniLM-L6-v2 -FROM node:alpine as build +######## WebUI frontend ######## +FROM node:21-alpine3.19 as build WORKDIR /app -# wget embedding model weight from alpine (does not exist from slim-buster) -RUN wget "https://chroma-onnx-models.s3.amazonaws.com/all-MiniLM-L6-v2/onnx.tar.gz" -O - | \ - tar -xzf - -C /app - COPY package.json package-lock.json ./ RUN npm ci COPY . .
RUN npm run build - +######## WebUI backend ######## FROM python:3.11-slim-bookworm as base -ENV ENV=prod -ENV PORT "" +# Use args +ARG USE_CUDA +ARG USE_OLLAMA +ARG USE_CUDA_VER +ARG USE_EMBEDDING_MODEL -ENV OLLAMA_BASE_URL "/ollama" +## Basis ## +ENV ENV=prod \ + PORT=8080 \ + # pass build args to the build + USE_OLLAMA_DOCKER=${USE_OLLAMA} \ + USE_CUDA_DOCKER=${USE_CUDA} \ + USE_CUDA_DOCKER_VER=${USE_CUDA_VER} \ + USE_EMBEDDING_MODEL_DOCKER=${USE_EMBEDDING_MODEL} -ENV OPENAI_API_BASE_URL "" -ENV OPENAI_API_KEY "" +## Basis URL Config ## +ENV OLLAMA_BASE_URL="/ollama" \ + OPENAI_API_BASE_URL="" -ENV WEBUI_SECRET_KEY "" -ENV WEBUI_AUTH_TRUSTED_EMAIL_HEADER "" +## API Key and Security Config ## +ENV OPENAI_API_KEY="" \ + WEBUI_SECRET_KEY="" \ + SCARF_NO_ANALYTICS=true \ + DO_NOT_TRACK=true -ENV SCARF_NO_ANALYTICS true -ENV DO_NOT_TRACK true +#### Other models ######################################################### +## whisper TTS model settings ## +ENV WHISPER_MODEL="base" \ + WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models" -# Use locally bundled version of the LiteLLM cost map json -# to avoid repetitive startup connections -ENV LITELLM_LOCAL_MODEL_COST_MAP="True" - -######## Preloaded models ######## -# whisper TTS Settings -ENV WHISPER_MODEL="base" -ENV WHISPER_MODEL_DIR="/app/backend/data/cache/whisper/models" - -# RAG Embedding Model Settings -# any sentence transformer model; models to use can be found at https://huggingface.co/models?library=sentence-transformers -# Leaderboard: https://huggingface.co/spaces/mteb/leaderboard -# for better persormance and multilangauge support use "intfloat/multilingual-e5-large" (~2.5GB) or "intfloat/multilingual-e5-base" (~1.5GB) -# IMPORTANT: If you change the default model (all-MiniLM-L6-v2) and vice versa, you aren't able to use RAG Chat with your previous documents loaded in the WebUI! You need to re-embed them. 
-ENV RAG_EMBEDDING_MODEL="all-MiniLM-L6-v2" -# device type for whisper tts and embbeding models - "cpu" (default), "cuda" (nvidia gpu and CUDA required) or "mps" (apple silicon) - choosing this right can lead to better performance -ENV RAG_EMBEDDING_MODEL_DEVICE_TYPE="cpu" -ENV RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models" -ENV SENTENCE_TRANSFORMERS_HOME $RAG_EMBEDDING_MODEL_DIR - -######## Preloaded models ######## +## RAG Embedding model settings ## +ENV RAG_EMBEDDING_MODEL="$USE_EMBEDDING_MODEL_DOCKER" \ + RAG_EMBEDDING_MODEL_DIR="/app/backend/data/cache/embedding/models" \ + SENTENCE_TRANSFORMERS_HOME="/app/backend/data/cache/embedding/models" +#### Other models ########################################################## WORKDIR /app/backend - # install python dependencies COPY ./backend/requirements.txt ./requirements.txt -RUN apt-get update && apt-get install ffmpeg libsm6 libxext6 -y +RUN if [ "$USE_CUDA" = "true" ]; then \ + # CUDA build: install the CUDA torch wheels; the whisper and embedding models are still preloaded below with device='cpu' + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/$USE_CUDA_DOCKER_VER --no-cache-dir && \ + pip3 install -r requirements.txt --no-cache-dir && \ + python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])" && \ + python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device='cpu')"; \ + else \ + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir && \ + pip3 install -r requirements.txt --no-cache-dir && \ + python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='cpu', compute_type='int8',
download_root=os.environ['WHISPER_MODEL_DIR'])" && \ + python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device='cpu')"; \ + fi -RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir -RUN pip3 install -r requirements.txt --no-cache-dir -# Install pandoc and netcat -# RUN python -c "import pypandoc; pypandoc.download_pandoc()" -RUN apt-get update \ - && apt-get install -y pandoc netcat-openbsd \ - && rm -rf /var/lib/apt/lists/* +RUN if [ "$USE_OLLAMA" = "true" ]; then \ + apt-get update && \ + # Install pandoc and netcat + apt-get install -y --no-install-recommends pandoc netcat-openbsd && \ + # for RAG OCR + apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \ + # install helper tools + apt-get install -y --no-install-recommends curl && \ + # install ollama + curl -fsSL https://ollama.com/install.sh | sh && \ + # cleanup + rm -rf /var/lib/apt/lists/*; \ + else \ + apt-get update && \ + # Install pandoc and netcat + apt-get install -y --no-install-recommends pandoc netcat-openbsd && \ + # for RAG OCR + apt-get install -y --no-install-recommends ffmpeg libsm6 libxext6 && \ + # cleanup + rm -rf /var/lib/apt/lists/*; \ + fi + -# preload embedding model -RUN python -c "import os; from chromadb.utils import embedding_functions; sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=os.environ['RAG_EMBEDDING_MODEL'], device=os.environ['RAG_EMBEDDING_MODEL_DEVICE_TYPE'])" -# preload tts model -RUN python -c "import os; from faster_whisper import WhisperModel; WhisperModel(os.environ['WHISPER_MODEL'], device='auto', compute_type='int8', download_root=os.environ['WHISPER_MODEL_DIR'])" # copy embedding weight from build -RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 -COPY --from=build /app/onnx 
/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx +# RUN mkdir -p /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2 +# COPY --from=build /app/onnx /root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx # copy built frontend files COPY --from=build /app/build /app/build @@ -86,4 +115,6 @@ COPY --from=build /app/package.json /app/package.json # copy backend files COPY ./backend . -CMD [ "bash", "start.sh"] +EXPOSE 8080 + +CMD [ "bash", "start.sh"] \ No newline at end of file diff --git a/README.md b/README.md index e2ee284e..3c0093e7 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,65 @@ Don't forget to explore our sibling project, [Open WebUI Community](https://open - After installation, you can access Open WebUI at [http://localhost:3000](http://localhost:3000). Enjoy! 😄 +- **If you want to customize your build with additional args**, use these commands: + + > [!NOTE] + > If you only want to use Open WebUI with Ollama included or with CUDA acceleration, it's recommended to use our official images with the tags :cuda or :ollama. + > If you want a combination of both or more customisation options like a different embedding model and/or CUDA version you need to build the image yourself following the instructions below. + + **For the build:** + + ```bash + docker build -t open-webui . + ``` + + Optional build ARGS (append them to the docker build command above if needed): + + e.g. + + ```bash + --build-arg="USE_EMBEDDING_MODEL=intfloat/multilingual-e5-large" + ``` + + For "intfloat/multilingual-e5-large" custom embedding model (default is all-MiniLM-L6-v2), only works with [sentence transformer models](https://huggingface.co/models?library=sentence-transformers). Current [Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) of embedding models. + + ```bash + --build-arg="USE_OLLAMA=true" + ``` + + For including ollama in the image. + + ```bash + --build-arg="USE_CUDA=true" + ``` + + To use CUDA acceleration for the embedding and whisper models.
+ + > [!NOTE] + > You need to install the [Nvidia CUDA container toolkit](https://docs.nvidia.com/dgx/nvidia-container-runtime-upgrade/) on your machine so that Docker containers can use the GPU. Only works with Linux - use WSL for Windows! + + ```bash + --build-arg="USE_CUDA_VER=cu117" + ``` + + For CUDA 11 (default is CUDA 12) + + **To run the image:** + + - **If you DID NOT use the USE_CUDA=true build ARG**, use this command: + + ```bash + docker run -d -p 3000:8080 -v open-webui:/app/backend/data --name open-webui --restart always open-webui + ``` + + - **If you DID use the USE_CUDA=true build ARG**, use this command: + + ```bash + docker run --gpus all -d -p 3000:8080 -v open-webui:/app/backend/data --name open-webui --restart always open-webui + ``` + + - After installation, you can access Open WebUI at [http://localhost:3000](http://localhost:3000). Enjoy! 😄 + #### Open WebUI: Server Connection Error If you're experiencing connection issues, it’s often due to the WebUI docker container not being able to reach the Ollama server at 127.0.0.1:11434 (host.docker.internal:11434) inside the container . Use the `--network=host` flag in your docker command to resolve this. Note that the port changes from 3000 to 8080, resulting in the link: `http://localhost:8080`.
diff --git a/backend/apps/audio/main.py b/backend/apps/audio/main.py index bb3cd053..02d1f5e8 100644 --- a/backend/apps/audio/main.py +++ b/backend/apps/audio/main.py @@ -28,6 +28,7 @@ from config import ( UPLOAD_DIR, WHISPER_MODEL, WHISPER_MODEL_DIR, + DEVICE_TYPE, ) log = logging.getLogger(__name__) @@ -42,6 +43,10 @@ app.add_middleware( allow_headers=["*"], ) +# setting device type for whisper model +whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu" +log.info(f"whisper_device_type: {whisper_device_type}") + @app.post("/transcribe") def transcribe( @@ -66,7 +71,7 @@ def transcribe( model = WhisperModel( WHISPER_MODEL, - device="auto", + device=whisper_device_type, compute_type="int8", download_root=WHISPER_MODEL_DIR, ) diff --git a/backend/apps/rag/main.py b/backend/apps/rag/main.py index 671429bb..08639866 100644 --- a/backend/apps/rag/main.py +++ b/backend/apps/rag/main.py @@ -59,7 +59,7 @@ from config import ( UPLOAD_DIR, DOCS_DIR, RAG_EMBEDDING_MODEL, - RAG_EMBEDDING_MODEL_DEVICE_TYPE, + DEVICE_TYPE, CHROMA_CLIENT, CHUNK_SIZE, CHUNK_OVERLAP, @@ -71,15 +71,6 @@ from constants import ERROR_MESSAGES log = logging.getLogger(__name__) log.setLevel(SRC_LOG_LEVELS["RAG"]) -# -# if RAG_EMBEDDING_MODEL: -# sentence_transformer_ef = SentenceTransformer( -# model_name_or_path=RAG_EMBEDDING_MODEL, -# cache_folder=RAG_EMBEDDING_MODEL_DIR, -# device=RAG_EMBEDDING_MODEL_DEVICE_TYPE, -# ) - - app = FastAPI() app.state.PDF_EXTRACT_IMAGES = False @@ -92,7 +83,7 @@ app.state.TOP_K = 4 app.state.sentence_transformer_ef = ( embedding_functions.SentenceTransformerEmbeddingFunction( model_name=app.state.RAG_EMBEDDING_MODEL, - device=RAG_EMBEDDING_MODEL_DEVICE_TYPE, + device=DEVICE_TYPE, ) ) @@ -147,10 +138,9 @@ async def update_embedding_model( app.state.sentence_transformer_ef = ( embedding_functions.SentenceTransformerEmbeddingFunction( model_name=app.state.RAG_EMBEDDING_MODEL, - device=RAG_EMBEDDING_MODEL_DEVICE_TYPE, + device=DEVICE_TYPE, ) ) 
- return { "status": True, "embedding_model": app.state.RAG_EMBEDDING_MODEL, diff --git a/backend/config.py b/backend/config.py index 39411d25..402a4183 100644 --- a/backend/config.py +++ b/backend/config.py @@ -257,6 +257,7 @@ OLLAMA_API_BASE_URL = os.environ.get( OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "") K8S_FLAG = os.environ.get("K8S_FLAG", "") +USE_OLLAMA_DOCKER = os.environ.get("USE_OLLAMA_DOCKER", "false") if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "": OLLAMA_BASE_URL = ( @@ -266,9 +267,13 @@ if OLLAMA_BASE_URL == "" and OLLAMA_API_BASE_URL != "": ) if ENV == "prod": - if OLLAMA_BASE_URL == "/ollama": - OLLAMA_BASE_URL = "http://host.docker.internal:11434" - + if OLLAMA_BASE_URL == "/ollama" and not K8S_FLAG: + if USE_OLLAMA_DOCKER.lower() == "true": + # if you use all-in-one docker container (Open WebUI + Ollama) + # with the docker build arg USE_OLLAMA=true (--build-arg="USE_OLLAMA=true") this only works with http://localhost:11434 + OLLAMA_BASE_URL = "http://localhost:11434" + else: + OLLAMA_BASE_URL = "http://host.docker.internal:11434" elif K8S_FLAG: OLLAMA_BASE_URL = "http://ollama-service.open-webui.svc.cluster.local:11434" @@ -391,10 +396,16 @@ if WEBUI_AUTH and WEBUI_SECRET_KEY == "": CHROMA_DATA_PATH = f"{DATA_DIR}/vector_db" # this uses the model defined in the Dockerfile ENV variable. 
If you dont use docker or docker based deployments such as k8s, the default embedding model will be used (all-MiniLM-L6-v2) RAG_EMBEDDING_MODEL = os.environ.get("RAG_EMBEDDING_MODEL", "all-MiniLM-L6-v2") +log.info(f"Embedding model set: {RAG_EMBEDDING_MODEL}") # device type ebbeding models - "cpu" (default), "cuda" (nvidia gpu required) or "mps" (apple silicon) - choosing this right can lead to better performance -RAG_EMBEDDING_MODEL_DEVICE_TYPE = os.environ.get( - "RAG_EMBEDDING_MODEL_DEVICE_TYPE", "cpu" -) +USE_CUDA = os.environ.get("USE_CUDA_DOCKER", "false") + +if USE_CUDA.lower() == "true": + DEVICE_TYPE = "cuda" +else: + DEVICE_TYPE = "cpu" + + CHROMA_CLIENT = chromadb.PersistentClient( path=CHROMA_DATA_PATH, settings=Settings(allow_reset=True, anonymized_telemetry=False), diff --git a/backend/start.sh b/backend/start.sh index f9ed5948..06adf1ff 100755 --- a/backend/start.sh +++ b/backend/start.sh @@ -7,16 +7,26 @@ KEY_FILE=.webui_secret_key PORT="${PORT:-8080}" if test "$WEBUI_SECRET_KEY $WEBUI_JWT_SECRET_KEY" = " "; then - echo No WEBUI_SECRET_KEY provided + echo "No WEBUI_SECRET_KEY provided" if ! [ -e "$KEY_FILE" ]; then - echo Generating WEBUI_SECRET_KEY + echo "Generating WEBUI_SECRET_KEY" # Generate a random value to use as a WEBUI_SECRET_KEY in case the user didn't provide one. - echo $(head -c 12 /dev/random | base64) > $KEY_FILE + echo $(head -c 12 /dev/random | base64) > "$KEY_FILE" fi - echo Loading WEBUI_SECRET_KEY from $KEY_FILE - WEBUI_SECRET_KEY=`cat $KEY_FILE` + echo "Loading WEBUI_SECRET_KEY from $KEY_FILE" + WEBUI_SECRET_KEY=$(cat "$KEY_FILE") fi -WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*' \ No newline at end of file +if [ "$USE_OLLAMA_DOCKER" = "true" ]; then + echo "USE_OLLAMA is set to true, starting ollama serve."
+ ollama serve & +fi + +if [ "$USE_CUDA_DOCKER" = "true" ]; then + echo "CUDA is enabled, appending LD_LIBRARY_PATH to include torch/cudnn & cublas libraries." + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}/usr/local/lib/python3.11/site-packages/torch/lib:/usr/local/lib/python3.11/site-packages/nvidia/cudnn/lib" +fi + +WEBUI_SECRET_KEY="$WEBUI_SECRET_KEY" exec uvicorn main:app --host 0.0.0.0 --port "$PORT" --forwarded-allow-ips '*'