diff --git a/README.md b/README.md
index 2b0b5f7..28058f6 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,22 @@
 Example usage:
 
 ```shell
-python main.py --debug train --dataset enwik9 --data-root ~/data/datasets/ml --method optuna --model transformer --model-save-path ~/data/ml-models/test-transformer.pt
+# Fetching
+python main.py --debug train --method fetch \
+    --dataset enwik9 --data-root /path/to/datasets
 
-python benchmark.py --debug train --dataset enwik9 --data-root ~/data/datasets/ml --method optuna --model cnn --model-save-path ~/data/ml-models/test-cnn.pt
+# Training
+python main.py --debug train --method optuna \
+    --dataset enwik9 --data-root /path/to/datasets \
+    --model cnn --model-save-path /path/to/optuna-model
+python main.py --debug --results /path/to/results train --method full \
+    --dataset enwik9 --data-root /path/to/datasets \
+    --model-load-path /path/to/optuna-model --model-save-path /path/to/full-model
+
+# Compressing
+python benchmark.py --debug compress \
+    --model-load-path /path/to/full-model \
+    --input-file inputfile --output-file outputfile
 ```
 
 ## Running locally
diff --git a/config/download_datasets.sh b/config/download_datasets.sh
new file mode 100644
index 0000000..d76147d
--- /dev/null
+++ b/config/download_datasets.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+# Download all URLs (one per line) from a txt file into a destination directory.
+# This script is written by Copilot
+#
+# Usage: download_datasets.sh <urls_file> <dest_dir>
+#   <urls_file>  text file with one http(s) URL per line; blank lines and
+#                lines starting with '#' are ignored
+#   <dest_dir>   directory to download into (created if missing)
+#
+# Exit codes: 1 usage error, 2 URL list not found, 3 destination unusable,
+# 4 no downloader available. Failed downloads or extractions are reported on
+# stderr but do not change the exit status.
+
+# No 'set -e': one failed download must not abort the remaining URLs.
+set -uo pipefail
+
+usage() {
+  {
+    echo "Usage: $0 <urls_file> <dest_dir>"
+    echo "Example: $0 urls.txt ~/Downloads/files"
+  } >&2
+  exit 1
+}
+
+# ---- Args & prerequisites ----
+[[ $# -ne 2 ]] && usage
+
+URLS_FILE="$1"
+DEST_DIR="$2"
+
+if [[ ! -f "$URLS_FILE" ]]; then
+  echo "Error: URL list file not found: $URLS_FILE" >&2
+  exit 2
+fi
+
+mkdir -p "$DEST_DIR" || {
+  echo "Error: Cannot create/access destination directory: $DEST_DIR" >&2
+  exit 3
+}
+
+# Prefer wget (the only downloader this script used so far); fall back to curl
+# so the "neither found" error below is accurate.
+DOWNLOADER=""
+if command -v wget >/dev/null 2>&1; then
+  DOWNLOADER="wget"
+elif command -v curl >/dev/null 2>&1; then
+  DOWNLOADER="curl"
+else
+  echo "Error: Neither 'curl' nor 'wget' found. Please install one." >&2
+  exit 4
+fi
+
+echo "Using downloader: $DOWNLOADER"
+echo "Reading URLs from: $URLS_FILE"
+echo "Saving to: $DEST_DIR"
+echo
+
+failures=0
+
+# ---- Download loop ----
+# Reads lines including the last one even if it lacks a trailing newline.
+while IFS= read -r url || [[ -n "$url" ]]; do
+  # Strip leading/trailing whitespace first, so whitespace-only lines count as
+  # empty instead of being reported as invalid URLs.
+  url="${url#"${url%%[![:space:]]*}"}"
+  url="${url%"${url##*[![:space:]]}"}"
+
+  # Skip empty lines and comments
+  [[ -z "$url" ]] && continue
+  [[ "$url" == \#* ]] && continue
+
+  # Basic scheme check
+  if ! [[ "$url" =~ ^https?:// ]]; then
+    echo "Skipping (invalid URL scheme): $url" >&2
+    continue
+  fi
+
+  echo "→ Downloading: $url"
+
+  if [[ "$DOWNLOADER" == "curl" ]]; then
+    # -f          fail on HTTP errors
+    # -L          follow redirects
+    # --retry 3   retry transient failures
+    # -OJ         save using server-provided filename (Content-Disposition) if present
+    # Resume (-C -) is deliberately not requested.
+    # NOTE(review): some curl versions reject -C - together with -J; verify
+    # against the curl versions you target before re-adding it.
+    # The subshell cd makes curl -O/-OJ write into the destination directory.
+    (
+      cd "$DEST_DIR" &&
+        curl -fL --retry 3 -OJ "$url"
+    ) || {
+      echo "  ⚠️ Failed: $url" >&2
+      failures=$((failures + 1))
+    }
+  else
+    # wget:
+    # --content-disposition: respect server-provided filename
+    # --tries=3, --timeout=10: retry/transient handling
+    # --directory-prefix: write to dest
+    # --no-clobber: skip file if it already exists
+    wget -q --content-disposition --tries=3 --timeout=10 \
+      --directory-prefix="$DEST_DIR" --no-clobber "$url" || {
+      echo "  ⚠️ Failed: $url" >&2
+      failures=$((failures + 1))
+    }
+  fi
+
+  # Extract .gz files. Only attempted when the archive is actually on disk, so
+  # a failed download does not trigger a second, misleading gunzip error.
+  if [[ "$url" =~ \.gz$ ]]; then
+    filename="${url##*/}"
+    if [[ -f "$DEST_DIR/$filename" ]]; then
+      echo "Extracting: $filename"
+      gunzip -- "$DEST_DIR/${filename}" || {
+        echo "  ⚠️ Failed to extract: $filename" >&2
+        failures=$((failures + 1))
+      }
+    else
+      echo "  ⚠️ Nothing to extract (archive not found): $filename" >&2
+    fi
+  fi
+done < "$URLS_FILE"
+
+echo
+if ((failures > 0)); then
+  echo "⚠️ Done with $failures failure(s). Files saved in: $DEST_DIR" >&2
+else
+  echo "✅ Done. Files saved in: $DEST_DIR"
+fi
diff --git a/config/generate_csv.sh b/config/generate_csv.sh
new file mode 100644
index 0000000..1d4fae1
--- /dev/null
+++ b/config/generate_csv.sh
@@ -0,0 +1,138 @@
+#!/usr/bin/env bash
+# Generate a CSV that enumerates a test grid for your Python benchmarking script.
+# Columns: id,input,model,dataset,context_size
+#
+# Example:
+#   ./generate_csv.sh > grid.csv
+#   ./generate_csv.sh -o grid.csv
+#
+# The axes (MODELS, DATASETS, CONTEXTS) and the input directory (INPUT_DIR) can
+# be overridden through environment variables. Fields are CSV-quoted only when
+# they contain a comma, a double quote or a line break.
+
+set -euo pipefail
+
+OUT_FILE=""
+SHOW_HELP=false
+
+usage() {
+  cat <<'EOF'
+Usage:
+  generate_csv.sh [-o output.csv]
+
+Options:
+  -o <file>  Write CSV to this file instead of stdout
+  -h         Show this help
+
+Axes are space-separated lists read from the environment:
+  MODELS     default: cnn
+  DATASETS   default: enwik9 human_reference
+  CONTEXTS   default: 64
+  INPUT_DIR  directory whose entries fill the "input" column
+             default: /home/tdpeuter/data/ml-inputs
+
+Examples:
+  ./generate_csv.sh > grid.csv
+  ./generate_csv.sh -o grid.csv
+  DATASETS="enwik9" CONTEXTS="64 128" ./generate_csv.sh > grid.csv
+EOF
+}
+
+# --- Parse flags ---
+while getopts ":o:h" opt; do
+  case "$opt" in
+    o) OUT_FILE="$OPTARG" ;;
+    h) SHOW_HELP=true ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      usage >&2
+      exit 2
+      ;;
+    :)
+      echo "Option -$OPTARG requires an argument." >&2
+      exit 2
+      ;;
+  esac
+done
+shift $((OPTIND - 1))
+
+if [[ "$SHOW_HELP" == true ]]; then
+  usage
+  exit 0
+fi
+
+# --- Axes (override via env) ---
+# e.g.: export DATASETS="enwik9 human_reference"
+MODELS=${MODELS:-"cnn"}
+DATASETS=${DATASETS:-"enwik9 human_reference"}
+CONTEXTS=${CONTEXTS:-"64"}
+INPUT_DIR=${INPUT_DIR:-"/home/tdpeuter/data/ml-inputs"}
+
+# Split the space-separated lists into bash arrays (no glob expansion).
+read -r -a MODELS_ARR <<< "$MODELS"
+read -r -a DATASETS_ARR <<< "$DATASETS"
+read -r -a CONTEXTS_ARR <<< "$CONTEXTS"
+
+# --- CSV helpers ---
+#######################################
+# Print one value as a CSV field.
+# A value containing a comma, double quote, CR or LF is wrapped in double
+# quotes with embedded double quotes doubled; any other value is printed as is.
+# Arguments: $1 - raw field value
+# Outputs:   the encoded field on stdout, without a trailing newline
+#######################################
+csv_escape() {
+  local s="$1"
+  case "$s" in
+    *\"* | *,* | *$'\n'* | *$'\r'*)
+      s=${s//\"/\"\"}
+      printf '"%s"' "$s"
+      ;;
+    *)
+      printf '%s' "$s"
+      ;;
+  esac
+}
+
+#######################################
+# Write one CSV line to OUT_FILE (appending) or, if it is empty, to stdout.
+# Globals:   OUT_FILE (read)
+# Arguments: $1 - the complete line, without trailing newline
+#######################################
+emit() {
+  if [[ -n "$OUT_FILE" ]]; then
+    printf '%s\n' "$1" >> "$OUT_FILE"
+  else
+    printf '%s\n' "$1"
+  fi
+}
+
+# Prepare output
+if [[ -n "$OUT_FILE" ]]; then
+  : > "$OUT_FILE" # truncate/initialize
+fi
+
+# Header
+emit "id,input,model,dataset,context_size"
+
+# --- Generate rows (Cartesian product) ---
+id=0
+for file in "${INPUT_DIR}"/*; do
+  # An unmatched glob stays literal; skip it rather than emit a bogus row.
+  [[ -e "$file" ]] || continue
+  for model in "${MODELS_ARR[@]}"; do
+    for dataset in "${DATASETS_ARR[@]}"; do
+      for ctx in "${CONTEXTS_ARR[@]}"; do
+        row="${id},$(csv_escape "$file"),$(csv_escape "$model")"
+        row+=",$(csv_escape "$dataset"),$(csv_escape "$ctx")"
+        emit "$row"
+        id=$((id + 1))
+      done
+    done
+  done
+done
+
+# Done
+if [[ -n "$OUT_FILE" ]]; then
+  echo "CSV written to: $OUT_FILE"
+fi
diff --git a/config/local.sh b/config/local.sh
new file mode 100644
index 0000000..91f79d5
--- /dev/null
+++ b/config/local.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Run "main.py compress" once per row of the grid CSV in INPUT_FILE
+# (columns: id,input,model,dataset,context_size).
+# All jobs are started in the background and awaited afterwards, so the status
+# printed for each job is its real exit code.
+# Rows are split on plain commas; quoted CSV fields are not supported.
+
+# No 'set -e': one failing job must not prevent the others from being awaited.
+set -uo pipefail
+
+INPUT_FILE="config/sub.csv"
+
+JOBID="$(date +%s | tail -c 9)"
+GIT_HASH="$(git rev-parse --short HEAD)"
+DATE="$(date "+%Y%m%d")"
+ID="${JOBID}-${GIT_HASH}-${DATE}"
+RESULTS_DIR="results/${ID}"
+# NOTE(review): STAT_FILE is not used anywhere in this script yet; confirm
+# whether it should be passed to main.py or removed.
+# shellcheck disable=SC2034
+STAT_FILE="${RESULTS_DIR}/results.csv"
+MODELS=/home/tdpeuter/data/ml-models
+
+if [[ ! -f "${INPUT_FILE}" ]]; then
+  echo "Error: grid file not found: ${INPUT_FILE}" >&2
+  exit 1
+fi
+
+mkdir -p "${RESULTS_DIR}" || {
+  echo "Error: cannot create ${RESULTS_DIR}" >&2
+  exit 1
+}
+
+pids=()
+row_ids=()
+
+# '|| [[ -n ... ]]' keeps the last row even without a trailing newline.
+while IFS=',' read -r id input model dataset context || [[ -n "${id}" ]]; do
+  # Skip the header row and blank lines.
+  if [[ -z "${id}" || "${id}" == "id" ]]; then
+    continue
+  fi
+
+  # Name the output after the row id and the input's base name: the same input
+  # occurs in several rows, and inputs are absolute paths.
+  output_file="${RESULTS_DIR}/${id}-${input##*/}.pt"
+
+  # NOTE(review): the "-1024" suffix is hard-coded; confirm it matches how the
+  # model files are named.
+  python main.py compress \
+    --model-load-path "${MODELS}/${dataset}/${context}/${model}-1024.pt" \
+    --input-file "${input}" \
+    --output-file "${output_file}" < /dev/null &
+  pids+=("$!")
+  row_ids+=("${id}")
+done < "${INPUT_FILE}"
+
+# '$?' right after '&' is always 0, so collect the real status with wait.
+failed=0
+for ((i = 0; i < ${#pids[@]}; i++)); do
+  wait "${pids[i]}"
+  exit_code="${?}"
+  if ((exit_code == 0)); then
+    echo "DONE"
+  else
+    echo "FAILED: row ${row_ids[i]} (exit code ${exit_code})" >&2
+    failed=$((failed + 1))
+  fi
+done
+
+if ((failed > 0)); then
+  exit 1
+fi
diff --git a/config/configuration.nix b/config/nix/configuration.nix
similarity index 100%
rename from config/configuration.nix
rename to config/nix/configuration.nix
diff --git a/config/flake.lock b/config/nix/flake.lock
similarity index 100%
rename from config/flake.lock
rename to config/nix/flake.lock
diff --git a/config/flake.nix b/config/nix/flake.nix
similarity index 100%
rename from config/flake.nix
rename to config/nix/flake.nix
diff --git a/config/sub.csv b/config/sub.csv
new file mode 100644
index 0000000..98fdf7a
--- /dev/null
+++ b/config/sub.csv
@@ -0,0 +1,5 @@
+id,input,model,dataset,context_size
+0,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,enwik9,64
+1,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,human_reference,64
+2,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,enwik9,64
+3,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,human_reference,64
diff --git a/config/urls.txt b/config/urls.txt
new file mode 100644
index 0000000..417b877
--- /dev/null
+++ b/config/urls.txt
@@ -0,0 +1,2 @@
+https://download.mozilla.org/?product=firefox-latest&os=win&lang=en-US
+https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz
\ No newline at end of file