Add n= to chart labels, per-dimension metric selection - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit a25191cd2a25892e07b8dd4a14baa6f7c6035e42
parent 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 09:11:00 +0200

Add n= to chart labels, per-dimension metric selection

Charts now show sample size (e.g., "haiku (n=54)").
Insights metric switcher expanded: Overall, Gameplay, Code, Structural,
Lint/Types, Agent Efficiency, Cost, Turns, Time.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx  | 4 ++--
M dashboard/src/components/Insights.tsx  | 9 +++++++--
M dashboard/src/lib/analysis.ts  | 5 +++++

3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -37,7 +37,7 @@ function aggregateByModel(runs: Run[]): ModelScore[] {
   }
 
   return Object.entries(byModel).map(([model, data]) => ({
-    model,
+    model: `${model} (n=${data.scores.length})`,
     avg_score: data.scores.length > 0
       ? Math.round(
           (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
@@ -75,7 +75,7 @@ function aggregateByTask(runs: Run[]): TaskScore[] {
   }
 
   return Object.entries(byTask).map(([task, data]) => ({
-    task,
+    task: `${task} (n=${data.total})`,
     avg_score: data.scores.length > 0
       ? Math.round(
           (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx
@@ -9,10 +9,15 @@ interface InsightsProps {
 }
 
 const METRICS = [
-  { key: "score", label: "Score" },
+  { key: "score", label: "Overall" },
+  { key: "gameplay", label: "Gameplay" },
+  { key: "code_quality", label: "Code" },
+  { key: "structural", label: "Structural" },
+  { key: "quality", label: "Lint/Types" },
+  { key: "transcript", label: "Agent Eff." },
   { key: "cost", label: "Cost" },
   { key: "turns", label: "Turns" },
-  { key: "wall_time", label: "Wall Time" },
+  { key: "wall_time", label: "Time" },
 ];
 
 export default function Insights({ runs }: InsightsProps) {
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -47,6 +47,11 @@ const METRICS: Record<string, MetricExtractor> = {
   cost: (r) => r.claude_output?.total_cost_usd ?? null,
   turns: (r) => r.claude_output?.num_turns ?? null,
   wall_time: (r) => r.meta.wall_time_seconds ?? null,
+  gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
+  code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
+  structural: (r) => r.eval_results?.structural?.score ?? null,
+  quality: (r) => r.eval_results?.quality?.score ?? null,
+  transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
 };
 
 export function computeMainEffects(

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M dashboard/src/components/Charts.tsx  | 4 ++--
M dashboard/src/components/Insights.tsx  | 9 +++++++--
M dashboard/src/lib/analysis.ts  | 5 +++++