commit afa692e0d3097240748b70e991a55fe00a7fb034
parent cfb04f1f1d0b27c76825f4289a9de007f7b8bf00
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 10:42:18 +0200
Outcome = gameplay + SonarQube, not gameplay + lint/typecheck
SonarQube measures actual code quality (cognitive complexity, bugs,
code smells). The old "quality" score (lint, typecheck, bundle size)
really answers "does the project build cleanly" - an output metric,
not an outcome.
Outcome (headline): 50% gameplay_bot + 50% sonarqube
Output (tracked): build quality (lint/typecheck), structural, code
analysis, transcript analysis
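Run detail updated: "Code Quality (SonarQube)" now appears in the
outcome section and "Build Quality" in the output section. The
"Functional" bar and the conditional "SonarQube" bar are removed from
the output section.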
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 11 insertions(+), 13 deletions(-)
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -252,8 +252,8 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
</div>
);
})()}
- <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
- <ScoreBar label="Quality" score={eval_results.quality?.score} />
+ <ScoreBar label="Gameplay" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
+ <ScoreBar label="Code Quality (SonarQube)" score={(eval_results as Record<string, any>).sonarqube?.score} />
{/* Separator */}
<div style={{ borderTop: "1px solid var(--border)", margin: "10px 0" }} />
@@ -262,12 +262,9 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
<div style={{ fontSize: "0.6rem", fontWeight: 700, textTransform: "uppercase", letterSpacing: "0.08em", color: "var(--text-muted)", marginBottom: "8px" }}>
Output Metrics
</div>
+ <ScoreBar label="Build Quality" score={eval_results.quality?.score} />
<ScoreBar label="Structural" score={eval_results.structural?.score} />
- <ScoreBar label="Functional" score={eval_results.functional?.score} />
<ScoreBar label="Code Analysis" score={(eval_results as Record<string, any>).code_analysis?.score} />
- {(eval_results as Record<string, any>).sonarqube?.score != null && (
- <ScoreBar label="SonarQube" score={(eval_results as Record<string, any>).sonarqube.score} />
- )}
<ScoreBar label="Transcript" score={(eval_results as Record<string, any>).transcript_analysis?.score} />
</>
)}
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,11 +1,12 @@
# Outcome score (the headline number)
+# gameplay_bot: does the game actually work? (16 Playwright tests)
+# sonarqube: is the code quality good? (cognitive complexity, bugs, smells)
outcome_weights:
gameplay_bot: 0.50
- quality: 0.50
+ sonarqube: 0.50
-# Output metrics (tracked, displayed, but don't affect headline score)
-# These are computed and stored but not blended into the outcome score:
-# - structural
-# - code_analysis
-# - transcript_analysis
-# - sonarqube
+# Output metrics (tracked, displayed, but don't affect headline score):
+# - quality (lint, typecheck, bundle size - "does the project build cleanly")
+# - structural (entry point exists, build succeeds)
+# - code_analysis (function length, nesting, naming, separation of concerns)
+# - transcript_analysis (agent efficiency, wasted turns)