run.sh (4061B)
#!/usr/bin/env bash
#
# Loop benchmarking harness driver.
#
# Usage: run.sh [GRID_FILE] [PROFILE]
#   GRID_FILE  grid definition handed to lib/compute_grid.py
#              (default: <project>/grid.yaml)
#   PROFILE    profile name handed to lib/compute_grid.py (default: smoke)
#
# lib/compute_grid.py is consumed as one JSON object ("cell") per output
# line. Each cell is executed `runs_per_cell` times. A run creates a
# workspace, invokes claude, evaluates the result, appends a line to
# results/index.jsonl and archives the workspace. Runs for which
# should_skip reports success are skipped (resume support).
#
# Requires: claude, jq, python3 on PATH.
set -uo pipefail
# Note: no set -e. The main loop handles errors per-run so one failure
# doesn't kill the entire harness. Critical setup errors still exit explicitly.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Without set -e a missing library would only surface later as
# "command not found" on every single run, so fail fast here instead.
for lib_name in workspace invoke evaluate resume; do
  if [[ ! -r "$SCRIPT_DIR/lib/$lib_name.sh" ]]; then
    echo "ERROR: required library not found: $SCRIPT_DIR/lib/$lib_name.sh" >&2
    exit 1
  fi
done

# shellcheck source=lib/workspace.sh
source "$SCRIPT_DIR/lib/workspace.sh"
# shellcheck source=lib/invoke.sh
source "$SCRIPT_DIR/lib/invoke.sh"
# shellcheck source=lib/evaluate.sh
source "$SCRIPT_DIR/lib/evaluate.sh"
# shellcheck source=lib/resume.sh
source "$SCRIPT_DIR/lib/resume.sh"

GRID_FILE="${1:-$PROJECT_DIR/grid.yaml}"
PROFILE="${2:-smoke}"
RESULTS_DIR="$PROJECT_DIR/results"

# Preflight: verify the external tools this script relies on are on PATH.
# (Availability only; authentication of the claude CLI is not checked here.)
if ! command -v claude > /dev/null 2>&1; then
  echo "ERROR: claude CLI not found in PATH." >&2
  exit 1
fi
for tool in jq python3; do
  if ! command -v "$tool" > /dev/null 2>&1; then
    echo "ERROR: $tool not found in PATH." >&2
    exit 1
  fi
done

echo "========================================"
echo "Loop Benchmarking Harness"
echo "========================================"
echo "Grid file: $GRID_FILE"
echo "Profile: $PROFILE"
echo "Results: $RESULTS_DIR"
echo "========================================"

# Compute the experiment grid. A failure here is a critical setup error:
# without the check the loop below would run on an empty/partial grid.
if ! cells=$(python3 "$SCRIPT_DIR/lib/compute_grid.py" "$GRID_FILE" "$PROFILE"); then
  echo "ERROR: failed to compute grid from $GRID_FILE (profile: $PROFILE)." >&2
  exit 1
fi
# Count non-empty lines only, matching what the loop actually processes
# (grep -c still prints 0 when nothing matches).
total_cells=$(grep -c . <<< "$cells")
echo "Grid cells: $total_cells"
echo ""

# Track progress
completed=0
skipped=0
failed=0

# The grid is read on fd 3 rather than stdin: any command in the loop body
# that reads stdin would otherwise consume the remaining cells and end the
# benchmark early. NOTE(review): invoke_claude is defined elsewhere; whether
# it (or the claude CLI it runs) reads stdin is not visible from here.
while IFS= read -r cell_json <&3; do
  # Ignore blank lines in the grid output.
  [[ -n "$cell_json" ]] || continue

  task=$(jq -r '.task' <<< "$cell_json")
  cell_id=$(jq -r '.cell_id' <<< "$cell_json")
  runs_per_cell=$(jq -r '.runs_per_cell' <<< "$cell_json")
  model=$(jq -r '.model' <<< "$cell_json")
  prompt_style=$(jq -r '.prompt_style' <<< "$cell_json")

  # jq prints "null" for a missing key and nothing for invalid JSON; either
  # would break the run loop, so reject the cell up front.
  if [[ ! "$runs_per_cell" =~ ^[0-9]+$ ]]; then
    echo "ERROR: skipping malformed grid cell (runs_per_cell='$runs_per_cell'): $cell_json" >&2
    continue
  fi
  # Force base 10 so a value with leading zeros is not parsed as octal.
  runs_per_cell=$((10#$runs_per_cell))

  for ((run_num = 1; run_num <= runs_per_cell; run_num++)); do
    run_id="${cell_id}_run${run_num}"

    # Check for existing results (resume support)
    if should_skip "$RESULTS_DIR" "$run_id"; then
      echo "SKIP: $run_id"
      skipped=$((skipped + 1))
      continue
    fi

    echo "----------------------------------------"
    echo "RUN: $run_id"
    echo "Task: $task | Model: $model | Prompt: $prompt_style"
    echo "----------------------------------------"

    # Run everything in a subshell so cd's don't affect the main loop.
    # `exit` inside the parentheses aborts only this run, not the harness.
    (
      # Create run results directory
      run_dir="$RESULTS_DIR/runs/$run_id"
      if ! mkdir -p "$run_dir"; then
        echo " ERROR: cannot create $run_dir" >&2
        exit 1
      fi

      # Save cell config as meta.json
      jq --arg run_id "$run_id" --argjson run_num "$run_num" \
        '. + {run_id: $run_id, run_number: $run_num, started_at: (now | todate)}' \
        <<< "$cell_json" > "$run_dir/meta.json"

      # Create isolated workspace. Abort the run if this fails: continuing
      # would hand an empty path to the invoke/evaluate/cleanup helpers.
      echo " Creating workspace..."
      if ! workspace=$(create_workspace "$PROJECT_DIR" "$task" "$cell_json") \
        || [[ -z "$workspace" ]]; then
        echo " ERROR: workspace creation failed for $run_id" >&2
        exit 1
      fi
      echo " Workspace: $workspace"

      # Invoke claude. A non-zero exit is reported but does not abort the
      # run; evaluation still happens on whatever was produced.
      echo " Invoking claude (model=$model)..."
      claude_rc=0
      invoke_claude "$cell_json" "$workspace" "$run_dir" "$PROJECT_DIR" || claude_rc=$?
      if ((claude_rc == 0)); then
        echo " Claude completed successfully"
      else
        echo " Claude exited with error (exit code: $claude_rc)"
      fi

      # Run evaluation
      echo " Running evaluation..."
      task_dir="$PROJECT_DIR/tasks/$task"
      evaluate "$task_dir" "$workspace" "$cell_json" "$run_dir"
      echo " Evaluation complete"

      # Append to run index. meta.json as written above carries only
      # started_at, so fall back to the current time when no completed_at
      # is present. NOTE(review): a helper may add completed_at to
      # meta.json; if so, that value is used unchanged.
      jq -c '{
        run_id: .run_id,
        task: .task,
        model: .model,
        cell_id: .cell_id,
        completed_at: (.completed_at // (now | todate))
      }' "$run_dir/meta.json" >> "$RESULTS_DIR/index.jsonl"

      # Archive and cleanup workspace
      echo " Archiving workspace..."
      cleanup_workspace "$workspace" "$run_dir"
    ) || true # per-run failures are tallied below, never fatal

    # Count results (outside subshell, whose variables are not visible here).
    # A run counts as completed only if eval_results.json exists in its
    # results directory.
    run_dir="$RESULTS_DIR/runs/$run_id"
    if [[ -f "$run_dir/eval_results.json" ]]; then
      completed=$((completed + 1))
    else
      failed=$((failed + 1))
    fi
    echo " Done. ($completed completed, $skipped skipped, $failed failed)"
    echo ""
  done
done 3<<< "$cells"

echo "========================================"
echo "All runs complete."
echo "Completed: $completed | Skipped: $skipped | Failed: $failed"
echo "========================================"