commit eedfa8d968303cf7069a88aed53990b9ec4f69e1
parent d90ff0c861644921c16ac1de3ee07a4cad53ed23
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 23:40:29 +0200
Fix score calculation: remove double-counting, normalize weights
Three bugs fixed:
1. code_analysis and transcript_analysis were double-counted (once from
scoring.yaml weights, once from hardcoded 0.1 fallback). Removed
the hardcoded fallback.
2. Scores were raw weighted sums, not normalized. If weights summed to
0.8, max possible score was 0.8. Now divides by total_weight so
scores use the full 0-1 range.
3. Removed functional from weights (it always scored 0 because it was
never wired up). Gameplay bot weight increased from 0.10 to 0.25 to
cover functionality testing.
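Before: the scoring.yaml weights alone could reach at most ~0.65 because
functional (0.25) was dead weight; the best observed score was 0.77,
which was only reachable through the double-counted extras.
After: scores should span the full 0-1 (0-100%) range.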
Need to re-evaluate all runs to recalculate scores.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/harness/run.py b/harness/run.py
@@ -396,15 +396,11 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
score += cat_data["score"] * weight
total_weight += weight
- # Also factor in code_analysis and transcript_analysis if present
- for extra in ["code_analysis", "transcript_analysis"]:
- cat_data = results.get(extra)
- if cat_data and isinstance(cat_data.get("score"), (int, float)):
- extra_weight = 0.1 # 10% each
- score += cat_data["score"] * extra_weight
- total_weight += extra_weight
-
- results["score"] = round(score, 4)
+ # Normalize so scores use the full 0-1 range
+ if total_weight > 0:
+ results["score"] = round(score / total_weight, 4)
+ else:
+ results["score"] = 0
except Exception:
pass
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,7 +1,9 @@
weights:
- functional: 0.25
structural: 0.10
quality: 0.20
- gameplay_bot: 0.10
+ gameplay_bot: 0.25
code_analysis: 0.15
transcript_analysis: 0.10
+ # functional removed: was always 0 (not wired), gameplay_bot covers it
+ # weights now sum to 0.80 (run.py divides by total_weight, so scores still span 0-1) -- remaining 0.20 reserved for future evals
+ # (functional Playwright tests, accessibility, performance, etc.)