Add testing configs
parent 5de8181959
commit ff11c1deb3
9 changed files with 250 additions and 2 deletions
config/download_datasets.sh (Normal file, +95)
@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Download all URLs (one per line) from a txt file into a destination directory.
# This script was written by Copilot.

set -uo pipefail

usage() {
    echo "Usage: $0 <urls.txt> <destination_dir>"
    echo "Example: $0 urls.txt ~/Downloads/files"
    exit 1
}

# ---- Args & prerequisites ----
[[ $# -ne 2 ]] && usage

URLS_FILE="$1"
DEST_DIR="$2"

if [[ ! -f "$URLS_FILE" ]]; then
    echo "Error: URL list file not found: $URLS_FILE" >&2
    exit 2
fi

mkdir -p "$DEST_DIR" || {
    echo "Error: Cannot create/access destination directory: $DEST_DIR" >&2
    exit 3
}

# Prefer curl if available; otherwise fall back to wget
DOWNLOADER=""
if command -v curl >/dev/null 2>&1; then
    DOWNLOADER="curl"
elif command -v wget >/dev/null 2>&1; then
    DOWNLOADER="wget"
else
    echo "Error: Neither 'curl' nor 'wget' found. Please install one." >&2
    exit 4
fi

echo "Using downloader: $DOWNLOADER"
echo "Reading URLs from: $URLS_FILE"
echo "Saving to: $DEST_DIR"
echo

# ---- Download loop ----
# Reads lines including the last one even if it lacks a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
    # Skip empty lines and comments
    [[ -z "$url" ]] && continue
    [[ "$url" =~ ^[[:space:]]*# ]] && continue

    # Strip leading/trailing whitespace
    url="$(printf '%s' "$url" | awk '{$1=$1;print}')"

    # Basic scheme check
    if ! [[ "$url" =~ ^https?:// ]]; then
        echo "Skipping (invalid URL scheme): $url" >&2
        continue
    fi

    echo "→ Downloading: $url"

    if [[ "$DOWNLOADER" == "curl" ]]; then
        # -f        fail on HTTP errors
        # -L        follow redirects
        # -C -      resume if possible
        # --retry 3 retry transient failures
        # -OJ       save using the server-provided filename (Content-Disposition) if present
        # (cd to dest so curl -O/-OJ writes there)
        (
            cd "$DEST_DIR" && \
            curl -fL -C - --retry 3 --remote-header-name -OJ "$url"
        ) || {
            echo "  ⚠️ Failed: $url" >&2
        }
    else
        # wget:
        # --content-disposition   respect the server-provided filename
        # --tries=3, --timeout=10 retry/transient-failure handling
        # --directory-prefix      write to dest
        # --no-clobber            skip a file if it already exists
        wget -q --content-disposition --tries=3 --timeout=10 \
            --directory-prefix="$DEST_DIR" --no-clobber "$url" || {
            echo "  ⚠️ Failed: $url" >&2
        }
    fi

    # Extract .gz files
    if [[ "$url" =~ \.gz$ ]]; then
        filename="${url##*/}"
        echo "Extracting: $filename"
        gunzip "$DEST_DIR/${filename}"
    fi
done < "$URLS_FILE"

echo
echo "✅ Done. Files saved in: $DEST_DIR"
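A plausible invocation of this script, using the URL list added in this commit and the ml-inputs directory that generate_csv.sh scans (the destination path is an assumption, not stated in the script itself):

    ./config/download_datasets.sh config/urls.txt /home/tdpeuter/data/ml-inputs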
config/generate_csv.sh (Normal file, +106)
@@ -0,0 +1,106 @@
#!/usr/bin/env bash
# Generate a CSV that enumerates a test grid for the Python benchmarking script.
# Columns: id,input,model,dataset,context_size
#
# Example:
#   ./generate_csv.sh > grid.csv
#   ./generate_csv.sh -o grid.csv
#
# You can customize the axes below (DATASETS, CONTEXTS) or override them via
# env vars. Double quotes inside fields are CSV-escaped.

set -euo pipefail

OUT_FILE=""
SHOW_HELP=false

usage() {
    cat <<'EOF'
Usage:
  generate_csv.sh [-o output.csv]

Options:
  -o <file>   Write CSV to this file instead of stdout
  -h          Show this help

Customize the axes by editing arrays in the script:
  DATASETS, CONTEXTS

Examples:
  ./generate_csv.sh > grid.csv
  ./generate_csv.sh -o grid.csv

Tip:
  You can also override arrays via env vars (space-separated), e.g.:
  DATASETS="enwik9 human_reference" CONTEXTS="64 128" ./generate_csv.sh > grid.csv
EOF
}

# --- Parse flags ---
while getopts ":o:h" opt; do
    case "$opt" in
        o) OUT_FILE="$OPTARG" ;;
        h) SHOW_HELP=true ;;
        \?) echo "Invalid option: -$OPTARG" >&2; usage; exit 2 ;;
        :) echo "Option -$OPTARG requires an argument." >&2; exit 2 ;;
    esac
done
shift $((OPTIND - 1))

$SHOW_HELP && { usage; exit 0; }

# --- Axes (edit or override via env) ---
# You can override these by exporting env vars before running, e.g.:
#   export DATASETS="enwik9 human_reference"
DATASETS=${DATASETS:-"enwik9 human_reference"}
CONTEXTS=${CONTEXTS:-"64"}

# Convert space-separated env vars to bash arrays
# shellcheck disable=SC2206
DATASETS_ARR=($DATASETS)
# shellcheck disable=SC2206
CONTEXTS_ARR=($CONTEXTS)

# --- CSV helpers ---
csv_escape() {
    # Escape embedded double quotes by doubling them. Fields are emitted
    # unquoted so the simple comma-split reader in config/local.sh keeps working.
    local s="$1"
    s=${s//\"/\"\"}
    printf '%s' "$s"
}

emit() {
    # Write to file or stdout
    if [[ -n "$OUT_FILE" ]]; then
        printf "%s\n" "$1" >> "$OUT_FILE"
    else
        printf "%s\n" "$1"
    fi
}

# Prepare output
if [[ -n "$OUT_FILE" ]]; then
    : > "$OUT_FILE"  # truncate/initialize
fi

# Header
emit "id,input,model,dataset,context_size"

# --- Generate rows (Cartesian product) ---
id=0
model="cnn"
for file in /home/tdpeuter/data/ml-inputs/*; do
    for dataset in "${DATASETS_ARR[@]}"; do
        for ctx in "${CONTEXTS_ARR[@]}"; do
            # CSV-escape each field
            row="${id},$(csv_escape "${file}"),$(csv_escape "${model}"),$(csv_escape "${dataset}"),${ctx}"
            emit "$row"
            id=$((id+1))
        done
    done
done

# Done
if [[ -n "$OUT_FILE" ]]; then
    echo "CSV written to: $OUT_FILE"
fi
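The checked-in config/sub.csv later in this diff matches what this generator emits. Assuming the two downloaded inputs already sit in /home/tdpeuter/data/ml-inputs, a run like this would reproduce it (the env values shown are also the script defaults):

    DATASETS="enwik9 human_reference" CONTEXTS="64" ./config/generate_csv.sh -o config/sub.csv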
config/local.sh (Normal file, +27)
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

INPUT_FILE="config/sub.csv"

JOBID="$(date +%s | tail -c 9)"
GIT_HASH="$(git rev-parse --short HEAD)"
DATE="$(date "+%Y%m%d")"
ID="${JOBID}-${GIT_HASH}-${DATE}"
STAT_FILE="results/${ID}/results.csv"
MODELS=/home/tdpeuter/data/ml-models

# Make sure the per-run results directory exists before writing into it.
mkdir -p "results/${ID}"

while read -r line; do
    IFS=',' read -r id input model dataset context <<< "$line"

    # Skip the header row
    if [[ "${id}" == "id" ]]; then
        continue
    fi

    # Run synchronously so the exit code below refers to this job;
    # backgrounding with '&' would always report 0 here. The output file
    # uses the input's basename so it lands inside the results directory.
    python main.py compress \
        --model-load-path "${MODELS}/${dataset}/${context}/${model}-1024.pt" \
        --input-file "${input}" \
        --output-file "results/${ID}/$(basename "${input}").pt"
    exit_code="${?}"
    if [ "${exit_code}" -eq 0 ]; then
        echo "DONE"
    fi
done < "${INPUT_FILE}"
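For the first row of config/sub.csv, the loop above expands to roughly this command (model and input paths taken from this commit; ${ID} is the run identifier computed at the top of the script):

    python main.py compress \
        --model-load-path /home/tdpeuter/data/ml-models/enwik9/64/cnn-1024.pt \
        --input-file "/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe" \
        --output-file "results/${ID}/Firefox Setup 146.0.exe.pt"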
config/flake.lock → config/nix/flake.lock (generated, 0 changes)
config/sub.csv (Normal file, +5)
@@ -0,0 +1,5 @@
id,input,model,dataset,context_size
0,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,enwik9,64
1,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,human_reference,64
2,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,enwik9,64
3,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,human_reference,64
config/urls.txt (Normal file, +2)
@@ -0,0 +1,2 @@
https://download.mozilla.org/?product=firefox-latest&os=win&lang=en-US
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz