Add testing configs
This commit is contained in:
parent
5de8181959
commit
ff11c1deb3
9 changed files with 250 additions and 2 deletions
106 config/generate_csv.sh Normal file
@@ -0,0 +1,106 @@
#!/usr/bin/env bash
# Generate a CSV that enumerates a test grid for the Python benchmarking script.
# Columns: id,input,model,dataset,context_size
#
# Example:
#   ./config/generate_csv.sh > grid.csv
#   ./config/generate_csv.sh -o grid.csv
#
# You can customize the axes below (DATASETS, CONTEXTS) by editing their
# defaults or overriding them via env vars. All fields are safely CSV-quoted.

set -euo pipefail

OUT_FILE=""
SHOW_HELP=false

usage() {
  cat <<'EOF'
Usage:
  generate_csv.sh [-o output.csv]

Options:
  -o <file>   Write CSV to this file instead of stdout
  -h          Show this help

Customize the axes by editing the defaults in the script:
  DATASETS, CONTEXTS

Examples:
  ./config/generate_csv.sh > grid.csv
  ./config/generate_csv.sh -o grid.csv

Tip:
  You can also override the axes via env vars (space-separated), e.g.:
  DATASETS="enwik9 human_reference" CONTEXTS="4096 8192" ./config/generate_csv.sh > grid.csv
EOF
}

# --- Parse flags ---
while getopts ":o:h" opt; do
  case "$opt" in
    o) OUT_FILE="$OPTARG" ;;
    h) SHOW_HELP=true ;;
    \?) echo "Invalid option: -$OPTARG" >&2; usage; exit 2 ;;
    :) echo "Option -$OPTARG requires an argument." >&2; exit 2 ;;
  esac
done
shift $((OPTIND - 1))

$SHOW_HELP && { usage; exit 0; }

# --- Axes (edit or override via env) ---
# You can override these by exporting env vars before running, e.g.:
#   export DATASETS="enwik9 human_reference"
DATASETS=${DATASETS:-"enwik9 human_reference"}
CONTEXTS=${CONTEXTS:-"64"}

# Convert space-separated env vars to bash arrays (word splitting is intentional)
# shellcheck disable=SC2206
DATASETS_ARR=($DATASETS)
# shellcheck disable=SC2206
CONTEXTS_ARR=($CONTEXTS)

# --- CSV helpers ---
csv_escape() {
  # Escape double quotes by doubling them, and wrap the whole field in quotes.
  local s="$1"
  s=${s//\"/\"\"}
  printf '"%s"' "$s"
}

emit() {
  # Write to file or stdout
  if [[ -n "$OUT_FILE" ]]; then
    printf "%s\n" "$1" >> "$OUT_FILE"
  else
    printf "%s\n" "$1"
  fi
}

# Prepare output
if [[ -n "$OUT_FILE" ]]; then
  : > "$OUT_FILE"  # truncate/initialize
fi

# Header
emit "id,input,model,dataset,context_size"

# --- Generate rows (Cartesian product) ---
id=0
model="cnn"
for file in /home/tdpeuter/data/ml-inputs/*; do
  for dataset in "${DATASETS_ARR[@]}"; do
    for ctx in "${CONTEXTS_ARR[@]}"; do
      # CSV-quote each field
      row="${id},$(csv_escape "${file}"),$(csv_escape "${model}"),$(csv_escape "${dataset}"),$ctx"
      emit "$row"
      id=$((id+1))
    done
  done
done

# Done
if [[ -n "$OUT_FILE" ]]; then
  echo "CSV written to: $OUT_FILE"
fi
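For context, a minimal sketch of the grid this script would emit, assuming hypothetical input files under /home/tdpeuter/data/ml-inputs (the actual rows depend on whatever files are present there and on any DATASETS/CONTEXTS overrides):

  $ CONTEXTS="64 128" ./config/generate_csv.sh -o grid.csv
  CSV written to: grid.csv
  $ head -n 3 grid.csv
  id,input,model,dataset,context_size
  0,"/home/tdpeuter/data/ml-inputs/sample_a.txt","cnn","enwik9",64
  1,"/home/tdpeuter/data/ml-inputs/sample_a.txt","cnn","enwik9",128

Rows are generated file-by-file, then dataset-by-dataset, then context-by-context, so the id column increments across the full Cartesian product.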