106 lines
2.5 KiB
Bash
106 lines
2.5 KiB
Bash
#!/usr/bin/env bash
|
|
# Generate a CSV that enumerates a test grid for your Python benchmarking script.
|
|
# Columns: id,input,model,dataset,context_size
|
|
#
|
|
# Example:
|
|
# ./generate_grid_csv.sh > grid.csv
|
|
# ./generate_grid_csv.sh -o grid.csv
|
|
#
|
|
# You can customize the axes below (DATASETS, CONTEXTS), either by editing
# their defaults or by overriding them via environment variables.
# String fields are passed through csv_escape before being written.
|
|
|
|
# Strict mode: abort on command failure, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Command-line state, filled in by the flag parser.
OUT_FILE=''     # destination path; empty means "write to stdout"
SHOW_HELP=false # becomes "true" when -h is given
|
|
|
|
#######################################
# Print the help text.
# The text names only the axes this script actually reads (DATASETS and
# CONTEXTS), so following the examples has an effect on the output.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Quoted delimiter: the text is printed literally, nothing is expanded.
  cat <<'EOF'
Usage:
  generate_grid_csv.sh [-o output.csv]

Options:
  -o <file>   Write CSV to this file instead of stdout
  -h          Show this help

Customize the axes by editing their defaults in the script:
  DATASETS, CONTEXTS

Examples:
  ./generate_grid_csv.sh > grid.csv
  ./generate_grid_csv.sh -o grid.csv

Tip:
  You can also override the axes via env vars (space-separated), e.g.:
  DATASETS="enwik9 human_reference" CONTEXTS="4096 8192" ./generate_grid_csv.sh > grid.csv
EOF
}
|
|
|
|
# --- Parse flags ---
# The leading ':' in the optstring selects silent error reporting, so unknown
# options and missing arguments are reported by the case arms below.
while getopts ":o:h" flag; do
  case "$flag" in
    o)
      OUT_FILE="$OPTARG"
      ;;
    h)
      SHOW_HELP=true
      ;;
    \?)
      printf '%s\n' "Invalid option: -$OPTARG" >&2
      usage
      exit 2
      ;;
    :)
      printf '%s\n' "Option -$OPTARG requires an argument." >&2
      exit 2
      ;;
  esac
done
# Drop the options that were consumed above.
shift $((OPTIND - 1))

# Help is shown only after every flag parsed cleanly.
if [[ "$SHOW_HELP" == true ]]; then
  usage
  exit 0
fi
|
|
|
|
# --- Axes (edit or override via env) ---
# Each axis is a whitespace-separated list. Override one by exporting the
# variable before running, e.g.:
#   export DATASETS="enwik9 human_reference"
#   export CONTEXTS="64 128"
DATASETS=${DATASETS:-"enwik9 human_reference"}
CONTEXTS=${CONTEXTS:-"64"}

# Convert the space-separated lists to bash arrays. Word splitting is the
# intent here; pathname expansion is not, so globbing is switched off while
# splitting. Otherwise a value such as "*" would be replaced by filenames.
set -f
# shellcheck disable=SC2206
DATASETS_ARR=($DATASETS)
# shellcheck disable=SC2206
CONTEXTS_ARR=($CONTEXTS)
set +f
|
|
|
|
# --- CSV helpers ---
|
|
#######################################
# Quote a single value as a CSV field.
# Embedded double quotes are doubled and the whole value is wrapped in
# double quotes, so commas, quotes and spaces inside the value cannot be
# mistaken for field structure.
# Arguments: $1 - raw field value (a missing argument is treated as empty)
# Outputs:   the quoted field on stdout, without a trailing newline
#######################################
csv_escape() {
  local s=${1-}
  s=${s//\"/\"\"}
  # The surrounding quotes live in the format string; the value itself is
  # only ever passed as data.
  printf '"%s"' "$s"
}
|
|
|
|
#######################################
# Write one line of output.
# Globals:   OUT_FILE (read) - target path; empty selects stdout
# Arguments: $1 - the line to write (a newline is appended)
# Outputs:   the line on stdout, or appended to OUT_FILE
#######################################
emit() {
  local line=$1
  if [[ -z "$OUT_FILE" ]]; then
    printf '%s\n' "$line"
    return
  fi
  printf '%s\n' "$line" >>"$OUT_FILE"
}
|
|
|
|
# Directory whose entries populate the `input` column. Defaults to the
# original location; set INPUT_DIR in the environment to point elsewhere.
INPUT_DIR=${INPUT_DIR:-/home/tdpeuter/data/ml-inputs}

# Prepare output
if [[ -n "$OUT_FILE" ]]; then
  : >"$OUT_FILE" # truncate/initialize
fi

# Header
emit "id,input,model,dataset,context_size"

# --- Generate rows (Cartesian product: inputs x datasets x contexts) ---
id=0
model="cnn"
# The model field is the same for every row; escape it once.
model_field=$(csv_escape "${model}")
for file in "${INPUT_DIR%/}"/*; do
  # When nothing matches, bash leaves the glob pattern as a literal string.
  # Skip it so no row is emitted for a path that does not exist. Symlinks
  # (even dangling ones) are real directory entries and are kept.
  [[ -e "$file" || -L "$file" ]] || continue
  file_field=$(csv_escape "${file}")
  for dataset in "${DATASETS_ARR[@]}"; do
    dataset_field=$(csv_escape "${dataset}")
    for ctx in "${CONTEXTS_ARR[@]}"; do
      emit "${id},${file_field},${model_field},${dataset_field},${ctx}"
      id=$((id + 1))
    done
  done
done

# A header-only CSV is almost certainly a misconfiguration; say so on stderr
# so the CSV stream itself stays clean.
if ((id == 0)); then
  printf 'warning: no input files found in %s\n' "$INPUT_DIR" >&2
fi

# Done
if [[ -n "$OUT_FILE" ]]; then
  echo "CSV written to: $OUT_FILE"
fi
|