evaluate.sh - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

evaluate.sh (6131B)
      1 #!/usr/bin/env bash
      2 # Evaluation dispatch for benchmark runs.
      3 # Runs structural, functional, and quality checks against the workspace.
      4 
      5 evaluate() {
      6   local task_dir="$1"
      7   local workspace="$2"
      8   local cell_json="$3"
      9   local run_dir="$4"
     10 
     11   local language
     12   language=$(echo "$cell_json" | jq -r '.language')
     13 
     14   local eval_results='{"structural": null, "functional": null, "quality": null, "score": null}'
     15 
     16   # Helper: safely merge JSON into eval_results
     17   merge_result() {
     18     local key="$1"
     19     local output="$2"
     20 
     21     if [[ -z "$output" ]]; then
     22       eval_results=$(echo "$eval_results" | jq --arg k "$key" '.[$k] = {"pass": false, "error": "no output"}')
     23       return
     24     fi
     25 
     26     if echo "$output" | jq . > /dev/null 2>&1; then
     27       eval_results=$(echo "$eval_results" | jq --arg k "$key" --argjson v "$output" '.[$k] = $v')
     28     else
     29       # Truncate long non-JSON output to avoid jq issues
     30       local truncated="${output:0:500}"
     31       eval_results=$(echo "$eval_results" | jq --arg k "$key" --arg e "$truncated" '.[$k] = {"pass": false, "error": $e}')
     32     fi
     33   }
     34 
     35   # --- Structural checks ---
     36   if [[ -f "$task_dir/eval/structural.sh" ]]; then
     37     local structural_output
     38     structural_output=$(bash "$task_dir/eval/structural.sh" "$workspace" "$language" 2>&1) || true
     39     merge_result "structural" "$structural_output"
     40   fi
     41 
     42   # --- Functional tests ---
     43   if [[ -d "$task_dir/eval/tests" ]]; then
     44     local functional_output
     45     functional_output=$(run_functional_tests "$task_dir" "$workspace" "$language" "$run_dir" 2>&1) || true
     46     merge_result "functional" "$functional_output"
     47   fi
     48 
     49   # --- Quality checks ---
     50   if [[ -f "$task_dir/eval/quality.sh" ]]; then
     51     local quality_output
     52     quality_output=$(bash "$task_dir/eval/quality.sh" "$workspace" "$language" 2>&1) || true
     53     merge_result "quality" "$quality_output"
     54   fi
     55 
     56   # --- Compute aggregate score ---
     57   local scoring_file="$task_dir/scoring.yaml"
     58   if [[ -f "$scoring_file" ]]; then
     59     local score
     60     score=$(compute_score "$eval_results" "$scoring_file")
     61     eval_results=$(echo "$eval_results" | jq --argjson score "$score" '.score = $score')
     62   fi
     63 
     64   echo "$eval_results" | jq '.' > "$run_dir/eval_results.json"
     65 }
     66 
     67 run_functional_tests() {
     68   local task_dir="$1"
     69   local workspace="$2"
     70   local language="$3"
     71   local run_dir="$4"
     72 
     73   # Check for Playwright tests
     74   if [[ -f "$task_dir/eval/tests/functional.spec.ts" ]]; then
     75     run_playwright_tests "$task_dir" "$workspace" "$run_dir"
     76     return
     77   fi
     78 
     79   # Check for vitest tests
     80   if [[ -f "$task_dir/eval/tests/functional.test.ts" ]]; then
     81     run_vitest_tests "$task_dir" "$workspace" "$run_dir"
     82     return
     83   fi
     84 
     85   # Check for shell-based tests
     86   if [[ -f "$task_dir/eval/tests/functional.sh" ]]; then
     87     bash "$task_dir/eval/tests/functional.sh" "$workspace" "$language"
     88     return
     89   fi
     90 
     91   echo '{"pass": false, "error": "no test files found"}'
     92 }
     93 
     94 run_playwright_tests() {
     95   local task_dir="$1"
     96   local workspace="$2"
     97   local run_dir="$3"
     98 
     99   # Install Playwright in workspace if not already present
    100   cd "$workspace" || return 1
    101   npm install --save-dev @playwright/test > /dev/null 2>&1
    102   npx playwright install chromium > /dev/null 2>&1
    103 
    104   # Copy test files into a temporary test directory
    105   local test_dir="$workspace/__eval_tests__"
    106   mkdir -p "$test_dir"
    107   cp "$task_dir/eval/tests/"*.spec.ts "$test_dir/" 2>/dev/null || true
    108 
    109   # Run Playwright tests with JSON reporter
    110   local result
    111   result=$(npx playwright test --config="$task_dir/eval/playwright.config.ts" \
    112     --reporter=json \
    113     "$test_dir" 2>/dev/null) || true
    114 
    115   # Clean up eval test files
    116   rm -rf "$test_dir"
    117 
    118   # Parse Playwright JSON output into our format
    119   if echo "$result" | jq . > /dev/null 2>&1; then
    120     echo "$result" | jq '{
    121       framework: "playwright",
    122       total: (.stats.expected + .stats.unexpected + .stats.skipped),
    123       passed: .stats.expected,
    124       failed: .stats.unexpected,
    125       skipped: .stats.skipped,
    126       score: (if (.stats.expected + .stats.unexpected) > 0
    127         then (.stats.expected / (.stats.expected + .stats.unexpected))
    128         else 0 end),
    129       pass: (.stats.unexpected == 0)
    130     }'
    131   else
    132     echo '{"framework": "playwright", "pass": false, "error": "playwright output not parseable"}'
    133   fi
    134 }
    135 
    136 run_vitest_tests() {
    137   local task_dir="$1"
    138   local workspace="$2"
    139   local run_dir="$3"
    140 
    141   cd "$workspace" || return 1
    142   npm install --save-dev vitest > /dev/null 2>&1
    143 
    144   # Copy test files
    145   local test_dir="$workspace/__eval_tests__"
    146   mkdir -p "$test_dir"
    147   cp "$task_dir/eval/tests/"*.test.ts "$test_dir/" 2>/dev/null || true
    148 
    149   # Run vitest with JSON reporter
    150   local result
    151   result=$(npx vitest run --reporter=json "$test_dir" 2>/dev/null) || true
    152 
    153   rm -rf "$test_dir"
    154 
    155   if echo "$result" | jq . > /dev/null 2>&1; then
    156     echo "$result" | jq '{
    157       framework: "vitest",
    158       total: .numTotalTests,
    159       passed: .numPassedTests,
    160       failed: .numFailedTests,
    161       skipped: (.numPendingTests + .numTodoTests),
    162       score: (if .numTotalTests > 0
    163         then (.numPassedTests / .numTotalTests)
    164         else 0 end),
    165       pass: (.numFailedTests == 0)
    166     }'
    167   else
    168     echo '{"framework": "vitest", "pass": false, "error": "vitest output not parseable"}'
    169   fi
    170 }
    171 
    172 compute_score() {
    173   local eval_results="$1"
    174   local scoring_file="$2"
    175 
    176   # Read weights from scoring.yaml
    177   local w_functional w_structural w_quality
    178   w_functional=$(python3 -c "import yaml; d=yaml.safe_load(open('$scoring_file')); print(d['weights']['functional'])")
    179   w_structural=$(python3 -c "import yaml; d=yaml.safe_load(open('$scoring_file')); print(d['weights']['structural'])")
    180   w_quality=$(python3 -c "import yaml; d=yaml.safe_load(open('$scoring_file')); print(d['weights']['quality'])")
    181 
    182   # Extract individual scores, defaulting to 0
    183   echo "$eval_results" | jq --argjson wf "$w_functional" --argjson ws "$w_structural" --argjson wq "$w_quality" '
    184     (if .functional.score then .functional.score else 0 end) as $fs |
    185     (if .structural.score then .structural.score else 0 end) as $ss |
    186     (if .quality.score then .quality.score else 0 end) as $qs |
    187     ($fs * $wf + $ss * $ws + $qs * $wq)
    188   '
    189 }
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README