run.sh - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

run.sh (4061B)
      1 #!/usr/bin/env bash
      2 set -uo pipefail
      3 # Note: no set -e. The main loop handles errors per-run so one failure
      4 # doesn't kill the entire harness. Critical setup errors still exit explicitly.
      5 
      6 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
      7 PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
      8 
      9 # shellcheck source=lib/workspace.sh
     10 source "$SCRIPT_DIR/lib/workspace.sh"
     11 # shellcheck source=lib/invoke.sh
     12 source "$SCRIPT_DIR/lib/invoke.sh"
     13 # shellcheck source=lib/evaluate.sh
     14 source "$SCRIPT_DIR/lib/evaluate.sh"
     15 # shellcheck source=lib/resume.sh
     16 source "$SCRIPT_DIR/lib/resume.sh"
     17 
     18 GRID_FILE="${1:-$PROJECT_DIR/grid.yaml}"
     19 PROFILE="${2:-smoke}"
     20 RESULTS_DIR="$PROJECT_DIR/results"
     21 
     22 # Preflight: verify claude is available and authenticated
     23 if ! command -v claude > /dev/null 2>&1; then
     24   echo "ERROR: claude CLI not found in PATH."
     25   exit 1
     26 fi
     27 
     28 echo "========================================"
     29 echo "Loop Benchmarking Harness"
     30 echo "========================================"
     31 echo "Grid file:  $GRID_FILE"
     32 echo "Profile:    $PROFILE"
     33 echo "Results:    $RESULTS_DIR"
     34 echo "========================================"
     35 
     36 # Compute the experiment grid
     37 cells=$(python3 "$SCRIPT_DIR/lib/compute_grid.py" "$GRID_FILE" "$PROFILE")
     38 total_cells=$(echo "$cells" | wc -l)
     39 echo "Grid cells: $total_cells"
     40 echo ""
     41 
     42 # Track progress
     43 completed=0
     44 skipped=0
     45 failed=0
     46 
     47 while IFS= read -r cell_json; do
     48   task=$(echo "$cell_json" | jq -r '.task')
     49   cell_id=$(echo "$cell_json" | jq -r '.cell_id')
     50   runs_per_cell=$(echo "$cell_json" | jq -r '.runs_per_cell')
     51   model=$(echo "$cell_json" | jq -r '.model')
     52   prompt_style=$(echo "$cell_json" | jq -r '.prompt_style')
     53 
     54   for run_num in $(seq 1 "$runs_per_cell"); do
     55     run_id="${cell_id}_run${run_num}"
     56 
     57     # Check for existing results (resume support)
     58     if should_skip "$RESULTS_DIR" "$run_id"; then
     59       echo "SKIP: $run_id"
     60       skipped=$((skipped + 1))
     61       continue
     62     fi
     63 
     64     echo "----------------------------------------"
     65     echo "RUN:  $run_id"
     66     echo "Task: $task | Model: $model | Prompt: $prompt_style"
     67     echo "----------------------------------------"
     68 
     69     # Run everything in a subshell so cd's don't affect the main loop
     70     (
     71       # Create run results directory
     72       run_dir="$RESULTS_DIR/runs/$run_id"
     73       mkdir -p "$run_dir"
     74 
     75       # Save cell config as meta.json
     76       echo "$cell_json" | jq --arg run_id "$run_id" --argjson run_num "$run_num" \
     77         '. + {run_id: $run_id, run_number: $run_num, started_at: (now | todate)}' \
     78         > "$run_dir/meta.json"
     79 
     80       # Create isolated workspace
     81       echo "  Creating workspace..."
     82       workspace=$(create_workspace "$PROJECT_DIR" "$task" "$cell_json")
     83       echo "  Workspace: $workspace"
     84 
     85       # Invoke claude
     86       echo "  Invoking claude (model=$model)..."
     87       if invoke_claude "$cell_json" "$workspace" "$run_dir" "$PROJECT_DIR"; then
     88         echo "  Claude completed successfully"
     89       else
     90         echo "  Claude exited with error (exit code: $?)"
     91       fi
     92 
     93       # Run evaluation
     94       echo "  Running evaluation..."
     95       task_dir="$PROJECT_DIR/tasks/$task"
     96       evaluate "$task_dir" "$workspace" "$cell_json" "$run_dir"
     97       echo "  Evaluation complete"
     98 
     99       # Append to run index
    100       jq -c '{
    101         run_id: .run_id,
    102         task: .task,
    103         model: .model,
    104         cell_id: .cell_id,
    105         completed_at: .completed_at
    106       }' "$run_dir/meta.json" >> "$RESULTS_DIR/index.jsonl"
    107 
    108       # Archive and cleanup workspace
    109       echo "  Archiving workspace..."
    110       cleanup_workspace "$workspace" "$run_dir"
    111     ) || true
    112 
    113     # Count results (outside subshell)
    114     run_dir="$RESULTS_DIR/runs/$run_id"
    115     if [[ -f "$run_dir/eval_results.json" ]]; then
    116       completed=$((completed + 1))
    117     else
    118       failed=$((failed + 1))
    119     fi
    120     echo "  Done. ($completed completed, $skipped skipped, $failed failed)"
    121     echo ""
    122   done
    123 done <<< "$cells"
    124 
    125 echo "========================================"
    126 echo "All runs complete."
    127 echo "Completed: $completed | Skipped: $skipped | Failed: $failed"
    128 echo "========================================"
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README