Add testing configs

Tibo De Peuter 2025-12-11 22:21:47 +01:00
parent 5de8181959
commit ff11c1deb3
Signed by: tdpeuter
GPG key ID: 38297DE43F75FFE2
9 changed files with 250 additions and 2 deletions


@@ -0,0 +1,95 @@
#!/usr/bin/env bash
# Download all URLs (one per line) from a txt file into a destination directory.
# This script was written by Copilot
set -uo pipefail
usage() {
    echo "Usage: $0 <urls.txt> <destination_dir>"
    echo "Example: $0 urls.txt ~/Downloads/files"
    exit 1
}
# ---- Args & prerequisites ----
[[ $# -ne 2 ]] && usage
URLS_FILE="$1"
DEST_DIR="$2"
if [[ ! -f "$URLS_FILE" ]]; then
    echo "Error: URL list file not found: $URLS_FILE" >&2
    exit 2
fi
mkdir -p "$DEST_DIR" || {
    echo "Error: Cannot create/access destination directory: $DEST_DIR" >&2
    exit 3
}
# Prefer curl if available; otherwise try wget
DOWNLOADER=""
if command -v wget >/dev/null 2>&1; then
DOWNLOADER="wget"
else
echo "Error: Neither 'curl' nor 'wget' found. Please install one." >&2
exit 4
fi
echo "Using downloader: $DOWNLOADER"
echo "Reading URLs from: $URLS_FILE"
echo "Saving to: $DEST_DIR"
echo
# ---- Download loop ----
# Reads lines including the last one even if it lacks a trailing newline.
while IFS= read -r url || [[ -n "$url" ]]; do
    # Skip empty lines and comments
    [[ -z "$url" ]] && continue
    [[ "$url" =~ ^[[:space:]]*# ]] && continue
    # Strip leading/trailing whitespace
    url="$(printf '%s' "$url" | awk '{$1=$1;print}')"
    # Basic scheme check
    if ! [[ "$url" =~ ^https?:// ]]; then
        echo "Skipping (invalid URL scheme): $url" >&2
        continue
    fi
    echo "→ Downloading: $url"
if [[ "$DOWNLOADER" == "curl" ]]; then
# -f fail on HTTP errors
# -L follow redirects
# -C - resume if possible
# --retry 3 retry transient failures
# -OJ save using server-provided filename (Content-Disposition) if present
# (cd to dest so curl -O/-OJ writes there)
(
cd "$DEST_DIR" && \
curl -fL -C - --retry 3 --remote-header-name -OJ "$url"
) || {
echo " ⚠️ Failed: $url" >&2
}
else
# wget:
# --content-disposition: respect server-provided filename
# --tries=3, --timeout=10: retry/transient handling
# --directory-prefix: write to dest
# --no-clobber: skip file if it already exists
wget -q --content-disposition --tries=3 --timeout=10 \
--directory-prefix="$DEST_DIR" --no-clobber "$url" || {
echo " ⚠️ Failed: $url" >&2
}
fi
    # Extract .gz files (filename is derived from the URL; a server-provided
    # Content-Disposition name may differ, in which case this step is skipped)
    if [[ "$url" =~ \.gz$ ]]; then
        filename="${url##*/}"
        if [[ -f "$DEST_DIR/${filename}" ]]; then
            echo "Extracting: $filename"
            gunzip "$DEST_DIR/${filename}"
        fi
    fi
done < "$URLS_FILE"
echo
echo "✅ Done. Files saved in: $DEST_DIR"

106
config/generate_csv.sh Normal file

@@ -0,0 +1,106 @@
#!/usr/bin/env bash
# Generate a CSV that enumerates a test grid for the Python benchmarking script.
# Columns: id,input,model,dataset,context_size
#
# Example:
#   ./generate_csv.sh > grid.csv
#   ./generate_csv.sh -o grid.csv
#
# Customize the axes below (DATASETS, CONTEXTS). Embedded double quotes in
# fields are escaped; fields are not wrapped in quotes, so they must not
# contain commas.
set -euo pipefail
OUT_FILE=""
SHOW_HELP=false
usage() {
    cat <<'EOF'
Usage:
  generate_csv.sh [-o output.csv]
Options:
  -o <file>   Write CSV to this file instead of stdout
  -h          Show this help
Customize the axes by editing arrays in the script:
  DATASETS, CONTEXTS
Examples:
  ./generate_csv.sh > grid.csv
  ./generate_csv.sh -o grid.csv
Tip:
  You can also override the axes via env vars (space-separated), e.g.:
  DATASETS="enwik9 human_reference" CONTEXTS="64" ./generate_csv.sh > grid.csv
EOF
}
# --- Parse flags ---
while getopts ":o:h" opt; do
    case "$opt" in
        o) OUT_FILE="$OPTARG" ;;
        h) SHOW_HELP=true ;;
        \?) echo "Invalid option: -$OPTARG" >&2; usage; exit 2 ;;
        :) echo "Option -$OPTARG requires an argument." >&2; exit 2 ;;
    esac
done
shift $((OPTIND - 1))
$SHOW_HELP && { usage; exit 0; }
# --- Axes (edit or override via env) ---
# You can override these by exporting env vars before running, e.g.:
#   export DATASETS="enwik9 human_reference"
DATASETS=${DATASETS:-"enwik9 human_reference"}
CONTEXTS=${CONTEXTS:-"64"}
# Convert space-separated env vars to bash arrays
# shellcheck disable=SC2206
DATASETS_ARR=($DATASETS)
CONTEXTS_ARR=($CONTEXTS)
# --- CSV helpers ---
csv_escape() {
    # Escape embedded double quotes by doubling them.
    # Fields are not wrapped in quotes, so they must not contain commas.
    local s="$1"
    s=${s//\"/\"\"}
    printf '%s' "$s"
}
emit() {
    # Write to file or stdout
    if [[ -n "$OUT_FILE" ]]; then
        printf "%s\n" "$1" >> "$OUT_FILE"
    else
        printf "%s\n" "$1"
    fi
}
# Prepare output
if [[ -n "$OUT_FILE" ]]; then
: > "$OUT_FILE" # truncate/initialize
fi
# Header
emit "id,input,model,dataset,context_size"
# --- Generate rows (Cartesian product) ---
id=0
model="cnn"
for file in /home/tdpeuter/data/ml-inputs/*; do
    for dataset in "${DATASETS_ARR[@]}"; do
        for ctx in "${CONTEXTS_ARR[@]}"; do
            # Escape the text fields (context_size is numeric)
            row="${id},$(csv_escape "${file}"),$(csv_escape "${model}"),$(csv_escape "${dataset}"),${ctx}"
            emit "$row"
            id=$((id+1))
        done
    done
done
# Done
if [[ -n "$OUT_FILE" ]]; then
    echo "CSV written to: $OUT_FILE"
fi
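Run from the repository root, the committed config/sub.csv can be reproduced with an invocation along these lines (a sketch, assuming /home/tdpeuter/data/ml-inputs contains exactly the two input files listed in sub.csv):

    # Sketch: regenerate the grid that is committed as config/sub.csv.
    DATASETS="enwik9 human_reference" CONTEXTS="64" \
        bash config/generate_csv.sh -o config/sub.csv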

27
config/local.sh Normal file

@@ -0,0 +1,27 @@
#!/usr/bin/env bash
INPUT_FILE="config/sub.csv"
JOBID="$(date +%s | tail -c 9)"
GIT_HASH="$(git rev-parse --short HEAD)"
DATE="$(date "+%Y%m%d")"
ID="${JOBID}-${GIT_HASH}-${DATE}"
STAT_FILE="results/${ID}/results.csv"
MODELS=/home/tdpeuter/data/ml-models
mkdir -p "results/${ID}"
while read -r line; do
    IFS=',' read -r id input model dataset context <<< "$line"
    # Skip the header row
    if [[ "${id}" == "id" ]]; then
        continue
    fi
    python main.py compress \
        --model-load-path "${MODELS}/${dataset}/${context}/${model}-1024.pt" \
        --input-file "${input}" \
        --output-file "results/${ID}/$(basename "${input}").pt"
    exit_code="${?}"
    if [ "${exit_code}" -eq 0 ]; then
        echo "DONE"
    fi
done < "${INPUT_FILE}"
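For concreteness, the first data row of config/sub.csv below expands to roughly the following call (illustration only; <ID> stands for the generated ${JOBID}-${GIT_HASH}-${DATE} value):

    # Expansion of row 0 of config/sub.csv by local.sh (illustrative).
    python main.py compress \
        --model-load-path /home/tdpeuter/data/ml-models/enwik9/64/cnn-1024.pt \
        --input-file "/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe" \
        --output-file "results/<ID>/Firefox Setup 146.0.exe.pt"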

5
config/sub.csv Normal file

@@ -0,0 +1,5 @@
id,input,model,dataset,context_size
0,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,enwik9,64
1,/home/tdpeuter/data/ml-inputs/Firefox Setup 146.0.exe,cnn,human_reference,64
2,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,enwik9,64
3,/home/tdpeuter/data/ml-inputs/GCF_000005845.2_ASM584v2_genomic.fna,cnn,human_reference,64

2
config/urls.txt Normal file

@@ -0,0 +1,2 @@
https://download.mozilla.org/?product=firefox-latest&os=win&lang=en-US
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz