evaluate.sh (6131B)
#!/usr/bin/env bash
# Evaluation dispatch for benchmark runs.
# Runs structural, functional, and quality checks against the workspace.
#
# Requires: jq; python3 with PyYAML (only when a task ships scoring.yaml);
# npm/npx (only for Playwright or vitest functional tests).

#######################################
# Run every check a task provides and write the combined result.
# Arguments:
#   $1 - task directory; may contain eval/structural.sh, eval/tests/,
#        eval/quality.sh and scoring.yaml
#   $2 - workspace directory handed to the checks
#   $3 - cell description as JSON; only .language is read
#   $4 - run directory; eval_results.json is written into it
# Outputs:
#   Writes $4/eval_results.json with the keys structural, functional,
#   quality and score. A key stays null when its check did not run or,
#   for score, when the score could not be computed.
#######################################
evaluate() {
    local task_dir="$1"
    local workspace="$2"
    local cell_json="$3"
    local run_dir="$4"

    local language
    language=$(printf '%s' "$cell_json" | jq -r '.language')

    local eval_results='{"structural": null, "functional": null, "quality": null, "score": null}'

    # Helper: store one check's output under eval_results[$1].
    # Relies on bash dynamic scoping to update evaluate's eval_results.
    # eval_results is only replaced when jq succeeds, so one bad merge can
    # never wipe the results gathered so far.
    merge_result() {
        local key="$1"
        local output="$2"
        local patch

        if [[ -z "$output" ]]; then
            patch='{"pass": false, "error": "no output"}'
        elif printf '%s' "$output" | jq -e -s 'length == 1' > /dev/null 2>&1; then
            # Exactly one JSON document: store it as-is. Slurping rejects
            # both whitespace-only output and several concatenated values,
            # which plain `jq .` accepts but --argjson does not.
            patch="$output"
        else
            # Not JSON: keep the first 500 characters as the error text.
            patch=$(jq -n --arg e "${output:0:500}" '{"pass": false, "error": $e}')
        fi

        local merged
        if merged=$(printf '%s' "$eval_results" \
            | jq --arg k "$key" --argjson v "$patch" '.[$k] = $v'); then
            eval_results="$merged"
        fi
        return 0
    }

    # --- Structural checks ---
    # A failing check script is not fatal; whatever it printed (stdout and
    # stderr) is recorded by merge_result.
    if [[ -f "$task_dir/eval/structural.sh" ]]; then
        local structural_output
        structural_output=$(bash "$task_dir/eval/structural.sh" "$workspace" "$language" 2>&1) || true
        merge_result "structural" "$structural_output"
    fi

    # --- Functional tests ---
    if [[ -d "$task_dir/eval/tests" ]]; then
        local functional_output
        functional_output=$(run_functional_tests "$task_dir" "$workspace" "$language" "$run_dir" 2>&1) || true
        merge_result "functional" "$functional_output"
    fi

    # --- Quality checks ---
    if [[ -f "$task_dir/eval/quality.sh" ]]; then
        local quality_output
        quality_output=$(bash "$task_dir/eval/quality.sh" "$workspace" "$language" 2>&1) || true
        merge_result "quality" "$quality_output"
    fi

    # --- Compute aggregate score ---
    local scoring_file="$task_dir/scoring.yaml"
    if [[ -f "$scoring_file" ]]; then
        local score scored
        if score=$(compute_score "$eval_results" "$scoring_file") \
            && scored=$(printf '%s' "$eval_results" \
                | jq --argjson score "$score" '.score = $score' 2> /dev/null); then
            eval_results="$scored"
        else
            # Keep the individual check results; only the aggregate is lost.
            printf 'evaluate: could not compute score from %s; leaving score null\n' \
                "$scoring_file" >&2
        fi
    fi

    printf '%s\n' "$eval_results" | jq '.' > "$run_dir/eval_results.json"
}

#######################################
# Pick the functional test runner by which file exists in eval/tests.
# Precedence: functional.spec.ts (Playwright), functional.test.ts (vitest),
# functional.sh (plain shell).
# Arguments:
#   $1 - task directory
#   $2 - workspace directory
#   $3 - language, forwarded to functional.sh only
#   $4 - run directory, forwarded to the Playwright/vitest runners
# Outputs:
#   The selected runner's stdout, or a fixed JSON error object when none
#   of the three files exists.
# Returns:
#   The selected runner's exit status.
#######################################
run_functional_tests() {
    local task_dir="$1"
    local workspace="$2"
    local language="$3"
    local run_dir="$4"

    local tests_dir="$task_dir/eval/tests"

    if [[ -f "$tests_dir/functional.spec.ts" ]]; then
        run_playwright_tests "$task_dir" "$workspace" "$run_dir"
    elif [[ -f "$tests_dir/functional.test.ts" ]]; then
        run_vitest_tests "$task_dir" "$workspace" "$run_dir"
    elif [[ -f "$tests_dir/functional.sh" ]]; then
        bash "$tests_dir/functional.sh" "$workspace" "$language"
    else
        echo '{"pass": false, "error": "no test files found"}'
    fi
}

#######################################
# Run the task's Playwright specs from inside the workspace and print a
# summary object.
# The body is a subshell, so the `cd` does not change the caller's working
# directory.
# Arguments:
#   $1 - task directory; eval/tests/*.spec.ts and eval/playwright.config.ts
#        are read from it
#   $2 - workspace directory
#   $3 - run directory (accepted for the dispatcher's call shape; unused)
# Outputs:
#   JSON with framework, total, passed, failed, skipped, score and pass,
#   or a JSON error object when the reporter output cannot be summarised.
# Returns:
#   1 if the workspace cannot be entered, 0 otherwise.
#######################################
run_playwright_tests() (
    local task_dir="$1"
    local workspace="$2"

    cd "$workspace" || return 1

    # Best effort: install failures are ignored here and show up as an
    # unparseable test run below.
    npm install --save-dev @playwright/test > /dev/null 2>&1
    npx playwright install chromium > /dev/null 2>&1

    # Copy test files into a temporary test directory
    local test_dir="$workspace/__eval_tests__"
    mkdir -p "$test_dir"
    cp "$task_dir/eval/tests/"*.spec.ts "$test_dir/" 2> /dev/null || true

    # Run Playwright tests with JSON reporter; a non-zero exit only means
    # some tests failed, which the JSON already reports.
    local result
    result=$(npx playwright test --config="$task_dir/eval/playwright.config.ts" \
        --reporter=json \
        "$test_dir" 2> /dev/null) || true

    # Clean up eval test files
    rm -rf -- "${test_dir:?}"

    # Parse Playwright JSON output into our format. -e makes jq fail on
    # empty input, so silent runs reach the error branch instead of
    # printing nothing.
    local summary
    if summary=$(printf '%s' "$result" | jq -e '{
        framework: "playwright",
        total: (.stats.expected + .stats.unexpected + .stats.skipped),
        passed: .stats.expected,
        failed: .stats.unexpected,
        skipped: .stats.skipped,
        score: (if (.stats.expected + .stats.unexpected) > 0
                then (.stats.expected / (.stats.expected + .stats.unexpected))
                else 0 end),
        pass: (.stats.unexpected == 0)
    }' 2> /dev/null); then
        printf '%s\n' "$summary"
    else
        echo '{"framework": "playwright", "pass": false, "error": "playwright output not parseable"}'
    fi
)

#######################################
# Run the task's vitest tests from inside the workspace and print a
# summary object.
# The body is a subshell, so the `cd` does not change the caller's working
# directory.
# Arguments:
#   $1 - task directory; eval/tests/*.test.ts are read from it
#   $2 - workspace directory
#   $3 - run directory (accepted for the dispatcher's call shape; unused)
# Outputs:
#   JSON with framework, total, passed, failed, skipped, score and pass,
#   or a JSON error object when the reporter output cannot be summarised.
# Returns:
#   1 if the workspace cannot be entered, 0 otherwise.
#######################################
run_vitest_tests() (
    local task_dir="$1"
    local workspace="$2"

    cd "$workspace" || return 1

    # Best effort: an install failure shows up as an unparseable run below.
    npm install --save-dev vitest > /dev/null 2>&1

    # Copy test files
    local test_dir="$workspace/__eval_tests__"
    mkdir -p "$test_dir"
    cp "$task_dir/eval/tests/"*.test.ts "$test_dir/" 2> /dev/null || true

    # Run vitest with JSON reporter; failing tests are reported in the JSON.
    local result
    result=$(npx vitest run --reporter=json "$test_dir" 2> /dev/null) || true

    rm -rf -- "${test_dir:?}"

    # -e makes jq fail on empty input, so silent runs reach the error branch.
    local summary
    if summary=$(printf '%s' "$result" | jq -e '{
        framework: "vitest",
        total: .numTotalTests,
        passed: .numPassedTests,
        failed: .numFailedTests,
        skipped: (.numPendingTests + .numTodoTests),
        score: (if .numTotalTests > 0
                then (.numPassedTests / .numTotalTests)
                else 0 end),
        pass: (.numFailedTests == 0)
    }' 2> /dev/null); then
        printf '%s\n' "$summary"
    else
        echo '{"framework": "vitest", "pass": false, "error": "vitest output not parseable"}'
    fi
)

#######################################
# Weighted sum of the functional, structural and quality scores.
# Arguments:
#   $1 - evaluation results as JSON (the object evaluate builds)
#   $2 - path to scoring.yaml; weights.functional, weights.structural and
#        weights.quality must all be present and numeric
# Outputs:
#   The weighted score as a JSON number on stdout.
# Returns:
#   Non-zero when the weights cannot be read or jq fails.
#######################################
compute_score() {
    local eval_results="$1"
    local scoring_file="$2"

    # Read all three weights in one Python run. The path is passed as an
    # argument rather than pasted into the Python source, so quotes or
    # other special characters in it cannot break or alter the program.
    local weights
    weights=$(python3 -c '
import json
import sys

import yaml

with open(sys.argv[1]) as fh:
    doc = yaml.safe_load(fh)
weights = doc["weights"]
print(json.dumps({k: float(weights[k]) for k in ("functional", "structural", "quality")}))
' "$scoring_file") || return 1

    # A section counts as 0 unless it is an object with a numeric .score
    # (covers checks that did not run, failed, or printed other JSON).
    printf '%s' "$eval_results" | jq --argjson w "$weights" '
        def section_score($name): (.[$name] | objects | .score | numbers) // 0;
        section_score("functional") * $w.functional
        + section_score("structural") * $w.structural
        + section_score("quality") * $w.quality
    '
}