This repository has been archived on 2025-12-23. You can view files and clone it, but you cannot make any changes to its state, such as pushing and creating new issues, pull requests or comments.
2025ML-project-neural_compr.../config/generate_csv.sh

106 lines
2.5 KiB
Bash

#!/usr/bin/env bash
# Emit a CSV describing the test grid for the Python benchmarking script.
# Columns: id,input,model,dataset,context_size
# One row is produced per (input file, dataset, context size) combination.
#
# Usage:
#   ./generate_csv.sh > grid.csv
#   ./generate_csv.sh -o grid.csv
#
# The DATASETS and CONTEXTS axes are space-separated lists; edit their
# defaults further down or override them through the environment.

# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Option state, filled in by the getopts loop.
OUT_FILE=""     # target of -o; empty means "write the CSV to stdout"
SHOW_HELP=false # set to "true" when -h is given
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # Take the name from $0 so the help stays correct if the file is renamed.
  local prog=${0##*/}
  # Unquoted delimiter on purpose: ${prog} must expand inside the text.
  cat <<EOF
Usage:
  ${prog} [-o output.csv]

Options:
  -o <file>  Write CSV to this file instead of stdout
  -h         Show this help

Output columns:
  id,input,model,dataset,context_size

Axes:
  DATASETS and CONTEXTS are space-separated lists. Edit their defaults in the
  script or override them through the environment.

Examples:
  ./${prog} > grid.csv
  ./${prog} -o grid.csv
  DATASETS="enwik9" CONTEXTS="64 128" ./${prog} > grid.csv
EOF
}
# --- Command-line options ---
# The leading ':' in the optstring selects getopts' silent mode, so the ':'
# and '?' arms below are responsible for printing the error messages.
while getopts ":o:h" opt; do
  case "$opt" in
    h)
      SHOW_HELP=true
      ;;
    o)
      OUT_FILE="$OPTARG"
      ;;
    :)
      echo "Option -$OPTARG requires an argument." >&2
      exit 2
      ;;
    \?)
      echo "Invalid option: -$OPTARG" >&2
      usage
      exit 2
      ;;
  esac
done
# Discard the options that getopts consumed.
shift $((OPTIND - 1))

# Help is acted on only after every option has been parsed.
if [[ "$SHOW_HELP" == true ]]; then
  usage
  exit 0
fi
# --- Axes (edit or override via env) ---
# Each axis is a space-separated list. Override one by exporting it before
# running, e.g.:
#   export DATASETS="enwik9" CONTEXTS="64 128"
DATASETS=${DATASETS:-"enwik9 human_reference"}
CONTEXTS=${CONTEXTS:-"64"}

# Split the lists into bash arrays.
# read -a word-splits on IFS but performs no pathname expansion, so a value
# such as "*" stays literal instead of turning into file names (which an
# unquoted ARR=($VAR) assignment would do).
# -d '' makes read consume the whole value, embedded newlines included; it then
# stops at end of input with a non-zero status, hence "|| true" under set -e.
read -r -d '' -a DATASETS_ARR <<<"$DATASETS" || true
read -r -d '' -a CONTEXTS_ARR <<<"$CONTEXTS" || true
# --- CSV helpers ---
#######################################
# Encode one value as a CSV field (RFC 4180 style).
# A value containing a comma, a double quote, a line feed or a carriage
# return is wrapped in double quotes with embedded quotes doubled. Any other
# value is printed unchanged, so plain fields keep their bare form.
# Arguments: $1 - raw field value
# Outputs:   the encoded field on stdout, without a trailing newline
#######################################
csv_escape() {
  local field="$1"
  case "$field" in
    *,* | *\"* | *$'\n'* | *$'\r'*)
      # Double the embedded quotes, then wrap the whole field in quotes.
      field=${field//\"/\"\"}
      printf '"%s"' "$field"
      ;;
    *)
      printf '%s' "$field"
      ;;
  esac
}
#######################################
# Write one line of output.
# Globals:   OUT_FILE (read) - when non-empty the line is appended to it
# Arguments: $1 - the line to write, without trailing newline
# Outputs:   the line on stdout when OUT_FILE is empty
#######################################
emit() {
  local line="$1"
  # No output file configured: the CSV goes to stdout.
  if [[ -z "$OUT_FILE" ]]; then
    printf "%s\n" "$line"
    return
  fi
  printf "%s\n" "$line" >> "$OUT_FILE"
}
# --- Input location ---
# Defaults to the original hard-coded directory; set INPUT_DIR in the
# environment to run the script against another location.
INPUT_DIR=${INPUT_DIR:-/home/tdpeuter/data/ml-inputs}

# Prepare output
if [[ -n "$OUT_FILE" ]]; then
  : > "$OUT_FILE" # truncate/initialize, because emit only ever appends
fi

# Header
emit "id,input,model,dataset,context_size"

# --- Generate rows (Cartesian product: input file x dataset x context) ---
id=0
model="cnn"
for file in "${INPUT_DIR%/}"/*; do
  # Without nullglob an unmatched pattern is left as the literal ".../*";
  # skip it so an empty or missing directory does not yield a bogus row.
  # Dangling symlinks (-L) are still listed.
  [[ -e "$file" || -L "$file" ]] || continue
  for dataset in "${DATASETS_ARR[@]}"; do
    for ctx in "${CONTEXTS_ARR[@]}"; do
      # Every non-generated field goes through csv_escape; id is a counter.
      row="${id},$(csv_escape "${file}"),$(csv_escape "${model}"),$(csv_escape "${dataset}"),$(csv_escape "${ctx}")"
      emit "$row"
      id=$((id + 1))
    done
  done
done

# A header-only CSV is almost certainly a mistake; say so on stderr so the
# CSV stream itself is not polluted.
if ((id == 0)); then
  printf 'Warning: no rows generated (no input files in %s?)\n' "$INPUT_DIR" >&2
fi

# Done
if [[ -n "$OUT_FILE" ]]; then
  echo "CSV written to: $OUT_FILE"
fi