commit a17094f75481fdd28a4029943e449f81dcce0273
parent 626a12905a15dc3abd353945cc6a774083a19f2f
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 10:29:28 +0200
Restructure scoring: outcome vs output, flexible scatter, methodology nav
Scoring:
- Headline score = 50% gameplay bot + 50% quality (outcomes)
- Structural, code analysis, transcript, SonarQube tracked as output
metrics but don't affect headline score
- Removed functional eval (dead code, never wired)
Run detail:
- Large outcome score at top with color coding
- "OUTCOME" section: gameplay bot + quality bars
- "OUTPUT METRICS" section: structural, code analysis, SonarQube, transcript
- Agent behavior card relabeled "Agent Process"
Scatter plots:
- Dropdown selectors for x and y axes
- Available: outcome, gameplay, quality, code quality, structural,
SonarQube, cost, turns, wall time, transcript
- Default: cost vs outcome, turns vs outcome
Navigation:
- Vertical divider before Methodology link
- Methodology link added to the nav (the placeholder page itself is not part of this diff; content coming)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
7 files changed, 166 insertions(+), 60 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -33,13 +33,15 @@ The grid is a cartesian product of configuration variables. You define the axes
All evaluation is deterministic code. No LLM grading.
-Categories (weights in `tasks/tetris/scoring.yaml`):
-- **Functional** (25%): gameplay bot (16 Playwright tests)
-- **Quality** (20%): lint, type check, bundle size
-- **Code analysis** (15%): file count, function length, nesting depth, naming consistency, separation of concerns, duplication, HTML validation
-- **Structural** (10%): entry point exists, build succeeds
-- **Gameplay bot** (10%): auto-calibrating Tetris player that tests all mechanics
-- **Transcript analysis** (10%): agent efficiency, wasted turns, self-testing
+Outcome score (the headline number, defined in `tasks/tetris/scoring.yaml`):
+- **Gameplay bot** (50%): auto-calibrating Tetris player that tests all mechanics
+- **Quality** (50%): lint, type check, bundle size
+
+Output metrics (tracked and displayed, but not part of the headline score):
+- **Structural**: entry point exists, build succeeds
+- **Code analysis**: file count, function length, nesting depth, naming consistency, separation of concerns, duplication, HTML validation
+- **Transcript analysis**: agent efficiency, wasted turns, self-testing
+- **SonarQube**: automated code quality scan
## Dashboard
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -232,16 +232,43 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
{/* Scores */}
<div className="card" style={{ padding: "16px" }}>
- <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Evaluation</h3>
{eval_results && (
<>
- <ScoreBar label="Overall" score={eval_results.score} />
+ {/* OUTCOME */}
+ <div style={{ fontSize: "0.6rem", fontWeight: 700, textTransform: "uppercase", letterSpacing: "0.08em", color: "var(--text-muted)", marginBottom: "8px" }}>
+ Outcome
+ </div>
+ {(() => {
+ const pct = eval_results.score != null ? Math.round(eval_results.score * 100) : null;
+ const color = pct != null
+ ? pct >= 70 ? "var(--green)" : pct >= 40 ? "var(--yellow)" : "var(--red)"
+ : "var(--text-muted)";
+ return (
+ <div style={{ textAlign: "center", marginBottom: "12px" }}>
+ <div style={{ fontFamily: "var(--font-mono)", fontWeight: 700, fontSize: "2.25rem", color }}>
+ {pct != null ? `${pct}%` : "-"}
+ </div>
+ <div style={{ fontSize: "0.65rem", color: "var(--text-muted)" }}>overall score</div>
+ </div>
+ );
+ })()}
+ <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
+ <ScoreBar label="Quality" score={eval_results.quality?.score} />
+
+ {/* Separator */}
+ <div style={{ borderTop: "1px solid var(--border)", margin: "10px 0" }} />
+
+ {/* OUTPUT METRICS */}
+ <div style={{ fontSize: "0.6rem", fontWeight: 700, textTransform: "uppercase", letterSpacing: "0.08em", color: "var(--text-muted)", marginBottom: "8px" }}>
+ Output Metrics
+ </div>
<ScoreBar label="Structural" score={eval_results.structural?.score} />
<ScoreBar label="Functional" score={eval_results.functional?.score} />
- <ScoreBar label="Quality" score={eval_results.quality?.score} />
<ScoreBar label="Code Analysis" score={(eval_results as Record<string, any>).code_analysis?.score} />
+ {(eval_results as Record<string, any>).sonarqube?.score != null && (
+ <ScoreBar label="SonarQube" score={(eval_results as Record<string, any>).sonarqube.score} />
+ )}
<ScoreBar label="Transcript" score={(eval_results as Record<string, any>).transcript_analysis?.score} />
- <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
</>
)}
</div>
@@ -295,7 +322,7 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
{/* Transcript analysis details */}
{(eval_results as Record<string, any>)?.transcript_analysis && !(eval_results as Record<string, any>).transcript_analysis.error && (
<div className="card" style={{ padding: "16px" }}>
- <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Agent Behavior</h4>
+ <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Agent Process</h4>
{(() => {
const ta = (eval_results as Record<string, any>).transcript_analysis;
return (
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -14,33 +14,72 @@ import { groupIntoCells } from "../lib/analysis";
interface ScatterPlotProps {
runs: Run[];
- xMetric: string;
- yMetric: string;
+ defaultX?: string;
+ defaultY?: string;
}
-type CellMetricKey = "cost" | "score" | "turns" | "wall_time";
+type CellMetricKey =
+ | "cost"
+ | "score"
+ | "turns"
+ | "wall_time"
+ | "gameplay"
+ | "quality"
+ | "code_quality"
+ | "structural"
+ | "sonarqube"
+ | "transcript";
+
+interface MetricDef {
+ label: string;
+ cellKey: CellMetricKey;
+ scale: number;
+ format: (v: number) => string;
+}
-const METRIC_CONFIG: Record<
- string,
- {
- label: string;
- cellKey: CellMetricKey;
- scale: number;
- format: (v: number) => string;
- }
-> = {
+const METRIC_CONFIG: Record<string, MetricDef> = {
cost: {
label: "Cost ($)",
cellKey: "cost",
scale: 1,
format: (v) => `$${v.toFixed(2)}`,
},
- score: {
- label: "Score (%)",
+ outcome: {
+ label: "Outcome Score (%)",
cellKey: "score",
scale: 100,
format: (v) => `${v.toFixed(0)}%`,
},
+ gameplay: {
+ label: "Gameplay (%)",
+ cellKey: "gameplay",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
+ quality: {
+ label: "Quality (%)",
+ cellKey: "quality",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
+ code_quality: {
+ label: "Code Quality (%)",
+ cellKey: "code_quality",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
+ structural: {
+ label: "Structural (%)",
+ cellKey: "structural",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
+ sonarqube: {
+ label: "SonarQube (%)",
+ cellKey: "sonarqube",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
turns: {
label: "Turns",
cellKey: "turns",
@@ -53,8 +92,19 @@ const METRIC_CONFIG: Record<
scale: 1,
format: (v) => `${Math.round(v)}s`,
},
+ transcript: {
+ label: "Transcript (%)",
+ cellKey: "transcript",
+ scale: 100,
+ format: (v) => `${v.toFixed(0)}%`,
+ },
};
+const METRIC_OPTIONS = Object.entries(METRIC_CONFIG).map(([key, conf]) => ({
+ value: key,
+ label: conf.label,
+}));
+
const MODEL_COLORS: Record<string, string> = {
haiku: "hsl(193 44% 67%)",
sonnet: "hsl(40 71% 73%)",
@@ -300,11 +350,25 @@ function CentroidTooltip({ data }: { data: CentroidDatum }) {
);
}
+const selectStyle: React.CSSProperties = {
+ background: "hsl(217 16% 15.5%)",
+ color: "hsl(213 14% 80%)",
+ border: "1px solid hsl(217 17% 28%)",
+ borderRadius: "2px",
+ fontFamily: "'JetBrains Mono', monospace",
+ fontSize: "11px",
+ padding: "4px 6px",
+ cursor: "pointer",
+};
+
export default function ScatterPlot({
runs,
- xMetric,
- yMetric,
+ defaultX = "cost",
+ defaultY = "outcome",
}: ScatterPlotProps) {
+ const [xMetric, setXMetric] = React.useState(defaultX);
+ const [yMetric, setYMetric] = React.useState(defaultY);
+
const xConf = METRIC_CONFIG[xMetric];
const yConf = METRIC_CONFIG[yMetric];
if (!xConf || !yConf) return null;
@@ -356,8 +420,26 @@ export default function ScatterPlot({
return (
<div className="card" style={{ position: "relative" }}>
- <h3 style={{ marginBottom: "16px" }}>
- {xConf.label} vs {yConf.label}{" "}
+ <div style={{ display: "flex", alignItems: "center", gap: "8px", marginBottom: "16px", flexWrap: "wrap" }}>
+ <select
+ value={xMetric}
+ onChange={(e) => setXMetric(e.target.value)}
+ style={selectStyle}
+ >
+ {METRIC_OPTIONS.map((opt) => (
+ <option key={opt.value} value={opt.value}>{opt.label}</option>
+ ))}
+ </select>
+ <span style={{ fontSize: "12px", color: "hsl(213 14% 55%)" }}>vs</span>
+ <select
+ value={yMetric}
+ onChange={(e) => setYMetric(e.target.value)}
+ style={selectStyle}
+ >
+ {METRIC_OPTIONS.map((opt) => (
+ <option key={opt.value} value={opt.value}>{opt.label}</option>
+ ))}
+ </select>
<span
style={{
fontSize: "12px",
@@ -367,7 +449,7 @@ export default function ScatterPlot({
>
({totalCells} cells)
</span>
- </h3>
+ </div>
{/* Legend */}
<div
diff --git a/dashboard/src/layouts/Base.astro b/dashboard/src/layouts/Base.astro
@@ -49,6 +49,8 @@ try {
<a href="/insights">Insights</a>
<a href="/explore">Explore</a>
<a href="/compare">Compare</a>
+ <span style="border-left: 1px solid hsl(var(--border)); height: 16px;"></span>
+ <a href="/methodology">Methodology</a>
</nav>
<button
id="theme-toggle"
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -22,8 +22,8 @@ const runs = loadAllRuns();
</div>
<div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
- <ScatterPlot client:load runs={runs} xMetric="cost" yMetric="score" />
- <ScatterPlot client:load runs={runs} xMetric="turns" yMetric="score" />
+ <ScatterPlot client:load runs={runs} defaultX="cost" defaultY="outcome" />
+ <ScatterPlot client:load runs={runs} defaultX="turns" defaultY="outcome" />
</div>
<div style="margin-top: 32px;">
diff --git a/harness/run.py b/harness/run.py
@@ -256,11 +256,11 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
results = {
"structural": None,
- "functional": None,
"quality": None,
"code_analysis": None,
"transcript_analysis": None,
"gameplay_bot": None,
+ "outcome_score": None,
"score": None,
}
@@ -270,17 +270,6 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
output = run_eval_script(structural_sh, workspace, language)
results["structural"] = safe_parse_json(output)
- # Functional
- tests_dir = task_dir / "eval" / "tests"
- if tests_dir.is_dir():
- if (tests_dir / "functional.sh").exists():
- output = run_eval_script(tests_dir / "functional.sh", workspace, language)
- results["functional"] = safe_parse_json(output)
- elif (tests_dir / "functional.spec.ts").exists():
- results["functional"] = {"pass": False, "error": "playwright eval not yet wired", "score": 0}
- elif (tests_dir / "functional.test.ts").exists():
- results["functional"] = {"pass": False, "error": "vitest eval not yet wired", "score": 0}
-
# Quality (lint, typecheck, bundle size)
quality_sh = task_dir / "eval" / "quality.sh"
if quality_sh.exists():
@@ -396,27 +385,29 @@ def evaluate(task_dir: Path, workspace: Path, cell: dict, run_dir: Path):
except Exception as e:
results["sonarqube"] = {"error": str(e), "score": 0}
- # Compute weighted score from scoring.yaml
+ # Compute outcome score from scoring.yaml (gameplay_bot + quality only)
try:
scoring_file = task_dir / "scoring.yaml"
if scoring_file.exists():
import yaml
scoring = yaml.safe_load(scoring_file.read_text())
- weights = scoring.get("weights", {})
+ outcome_weights = scoring.get("outcome_weights", {})
score = 0.0
total_weight = 0.0
- for category, weight in weights.items():
+ for category, weight in outcome_weights.items():
cat_data = results.get(category)
if cat_data and isinstance(cat_data.get("score"), (int, float)):
score += cat_data["score"] * weight
total_weight += weight
- # Normalize so scores use the full 0-1 range
if total_weight > 0:
- results["score"] = round(score / total_weight, 4)
+ results["outcome_score"] = round(score / total_weight, 4)
else:
- results["score"] = 0
+ results["outcome_score"] = 0
+
+ # Alias so existing code that reads "score" keeps working
+ results["score"] = results["outcome_score"]
except Exception:
pass
diff --git a/tasks/tetris/scoring.yaml b/tasks/tetris/scoring.yaml
@@ -1,9 +1,11 @@
-weights:
- structural: 0.10
- quality: 0.20
- gameplay_bot: 0.25
- code_analysis: 0.15
- transcript_analysis: 0.10
- # functional removed: was always 0 (not wired), gameplay_bot covers it
- # weights now sum to 0.80 -- remaining 0.20 reserved for future evals
- # (functional Playwright tests, accessibility, performance, etc.)
+# Outcome score (the headline number)
+outcome_weights:
+ gameplay_bot: 0.50
+ quality: 0.50
+
+# Output metrics (tracked, displayed, but don't affect headline score)
+# These are computed and stored but not blended into the outcome score:
+# - structural
+# - code_analysis
+# - transcript_analysis
+# - sonarqube