commit a25191cd2a25892e07b8dd4a14baa6f7c6035e42
parent 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 09:11:00 +0200
Add n= (sample size) to chart labels; add per-dimension metric selection
Charts now show sample size (e.g., "haiku (n=54)").
Insights metric switcher expanded: Overall, Gameplay, Code, Structural,
Lint/Types, Agent Efficiency, Cost, Turns, Time.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -37,7 +37,7 @@ function aggregateByModel(runs: Run[]): ModelScore[] {
}
return Object.entries(byModel).map(([model, data]) => ({
- model,
+ model: `${model} (n=${data.scores.length})`,
avg_score: data.scores.length > 0
? Math.round(
(data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
@@ -75,7 +75,7 @@ function aggregateByTask(runs: Run[]): TaskScore[] {
}
return Object.entries(byTask).map(([task, data]) => ({
- task,
+ task: `${task} (n=${data.total})`,
avg_score: data.scores.length > 0
? Math.round(
(data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx
@@ -9,10 +9,15 @@ interface InsightsProps {
}
const METRICS = [
- { key: "score", label: "Score" },
+ { key: "score", label: "Overall" },
+ { key: "gameplay", label: "Gameplay" },
+ { key: "code_quality", label: "Code" },
+ { key: "structural", label: "Structural" },
+ { key: "quality", label: "Lint/Types" },
+ { key: "transcript", label: "Agent Eff." },
{ key: "cost", label: "Cost" },
{ key: "turns", label: "Turns" },
- { key: "wall_time", label: "Wall Time" },
+ { key: "wall_time", label: "Time" },
];
export default function Insights({ runs }: InsightsProps) {
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -47,6 +47,11 @@ const METRICS: Record<string, MetricExtractor> = {
cost: (r) => r.claude_output?.total_cost_usd ?? null,
turns: (r) => r.claude_output?.num_turns ?? null,
wall_time: (r) => r.meta.wall_time_seconds ?? null,
+ gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
+ code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
+ structural: (r) => r.eval_results?.structural?.score ?? null,
+ quality: (r) => r.eval_results?.quality?.score ?? null,
+ transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
};
export function computeMainEffects(