loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit a25191cd2a25892e07b8dd4a14baa6f7c6035e42
parent 43d776d66e46c1b135bd5b3b01aac17ded5ee2e5
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 09:11:00 +0200

Add n= to chart labels, per-dimension metric selection

Charts now show sample size (e.g., "haiku (n=54)").
Insights metric switcher expanded: Overall, Gameplay, Code, Structural,
Lint/Types, Agent Efficiency, Cost, Turns, Time.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx   | 4 ++--
M dashboard/src/components/Insights.tsx | 9 +++++++--
M dashboard/src/lib/analysis.ts         | 5 +++++
3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -37,7 +37,7 @@ function aggregateByModel(runs: Run[]): ModelScore[] {
   }
 
   return Object.entries(byModel).map(([model, data]) => ({
-    model,
+    model: `${model} (n=${data.scores.length})`,
     avg_score: data.scores.length > 0
       ? Math.round(
           (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
@@ -75,7 +75,7 @@ function aggregateByTask(runs: Run[]): TaskScore[] {
   }
 
   return Object.entries(byTask).map(([task, data]) => ({
-    task,
+    task: `${task} (n=${data.total})`,
     avg_score: data.scores.length > 0
       ? Math.round(
           (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
diff --git a/dashboard/src/components/Insights.tsx b/dashboard/src/components/Insights.tsx
@@ -9,10 +9,15 @@ interface InsightsProps {
 }
 
 const METRICS = [
-  { key: "score", label: "Score" },
+  { key: "score", label: "Overall" },
+  { key: "gameplay", label: "Gameplay" },
+  { key: "code_quality", label: "Code" },
+  { key: "structural", label: "Structural" },
+  { key: "quality", label: "Lint/Types" },
+  { key: "transcript", label: "Agent Eff." },
   { key: "cost", label: "Cost" },
   { key: "turns", label: "Turns" },
-  { key: "wall_time", label: "Wall Time" },
+  { key: "wall_time", label: "Time" },
 ];
 
 export default function Insights({ runs }: InsightsProps) {
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -47,6 +47,11 @@ const METRICS: Record<string, MetricExtractor> = {
   cost: (r) => r.claude_output?.total_cost_usd ?? null,
   turns: (r) => r.claude_output?.num_turns ?? null,
   wall_time: (r) => r.meta.wall_time_seconds ?? null,
+  gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
+  code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
+  structural: (r) => r.eval_results?.structural?.score ?? null,
+  quality: (r) => r.eval_results?.quality?.score ?? null,
+  transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
 };
 
 export function computeMainEffects(

Impressum · Datenschutz