loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to show what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit ae769a448ed5b0539181c89a2b3fc575989123ae
parent d7f2fbbca814d869ee1d4f46ef35b449c22ec226
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 10:32:54 +0200

Flexible axes on scatter plots and efficiency frontier

Both ScatterPlot and EfficiencyFrontier now have dropdown selectors
for x and y axes. 10 available metrics: outcome, gameplay, quality,
code quality, structural, SonarQube, transcript, cost, turns, time.

Cell interface expanded with quality, structural, sonarqube, transcript
fields. SonarQube metric extractor added to analysis.ts.

Default axes: cost vs outcome, turns vs outcome.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/EfficiencyFrontier.tsx | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/lib/analysis.ts | 9 +++++++++
2 files changed, 211 insertions(+), 45 deletions(-)

diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx @@ -13,8 +13,119 @@ import { groupIntoCells } from "../lib/analysis"; interface EfficiencyFrontierProps { runs: Run[]; + defaultX?: string; + defaultY?: string; } +type CellMetricKey = + | "cost" + | "score" + | "turns" + | "wall_time" + | "gameplay" + | "quality" + | "code_quality" + | "structural" + | "sonarqube" + | "transcript"; + +interface MetricDef { + label: string; + cellKey: CellMetricKey; + scale: number; + format: (v: number) => string; + axisLabel: string; +} + +const METRIC_CONFIG: Record<string, MetricDef> = { + cost: { + label: "Cost ($)", + cellKey: "cost", + scale: 1, + format: (v: number) => `$${v.toFixed(2)}`, + axisLabel: "Avg Cost ($)", + }, + outcome: { + label: "Outcome Score (%)", + cellKey: "score", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Score (%)", + }, + gameplay: { + label: "Gameplay (%)", + cellKey: "gameplay", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Gameplay (%)", + }, + quality: { + label: "Quality (%)", + cellKey: "quality", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Quality (%)", + }, + code_quality: { + label: "Code Quality (%)", + cellKey: "code_quality", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Code Quality (%)", + }, + structural: { + label: "Structural (%)", + cellKey: "structural", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Structural (%)", + }, + sonarqube: { + label: "SonarQube (%)", + cellKey: "sonarqube", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg SonarQube (%)", + }, + turns: { + label: "Turns", + cellKey: "turns", + scale: 1, + format: (v: number) => `${Math.round(v)}`, + axisLabel: "Avg Turns", + }, + wall_time: { + label: "Time (s)", + cellKey: "wall_time", + 
scale: 1, + format: (v: number) => `${Math.round(v)}s`, + axisLabel: "Avg Time (s)", + }, + transcript: { + label: "Transcript (%)", + cellKey: "transcript", + scale: 1, + format: (v: number) => `${(v * 100).toFixed(0)}%`, + axisLabel: "Avg Transcript (%)", + }, +}; + +const METRIC_OPTIONS = Object.entries(METRIC_CONFIG).map(([key, conf]) => ({ + value: key, + label: conf.label, +})); + +const selectStyle: React.CSSProperties = { + background: "var(--surface-1, hsl(217 16% 15.5%))", + color: "var(--text, hsl(213 14% 80%))", + border: "1px solid var(--border, hsl(217 17% 28%))", + borderRadius: "2px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + padding: "4px 6px", + cursor: "pointer", +}; + const MODEL_COLORS: Record<string, string> = { haiku: "hsl(193 44% 67%)", sonnet: "hsl(40 71% 73%)", @@ -42,32 +153,44 @@ function getModelColor(model: string): string { return DEFAULT_COLOR; } -function aggregateByConfig(runs: Run[]): ConfigPoint[] { +function aggregateByConfig( + runs: Run[], + xKey: CellMetricKey, + yKey: CellMetricKey, +): ConfigPoint[] { const cells = groupIntoCells(runs); return cells - .filter((c) => c.score.avg > 0 && c.cost.avg > 0) - .map((c) => ({ - cell_id: c.cell_id, - model: c.meta.model, - avgCost: c.cost.avg, - avgScore: c.score.avg, - runCount: c.n, - config: { + .filter((c) => { + const xAgg = c[xKey] as { avg: number; min: number; max: number }; + const yAgg = c[yKey] as { avg: number; min: number; max: number }; + return xAgg.avg > 0 && yAgg.avg > 0; + }) + .map((c) => { + const xAgg = c[xKey] as { avg: number; min: number; max: number }; + const yAgg = c[yKey] as { avg: number; min: number; max: number }; + return { + cell_id: c.cell_id, model: c.meta.model, - effort: c.meta.effort, - prompt_style: c.meta.prompt_style, - language: c.meta.language, - linter: c.meta.linter, - playwright: c.meta.playwright, - context_file: c.meta.context_file, - sub_agents: c.meta.sub_agents, - web_search: c.meta.web_search, - max_budget: 
c.meta.max_budget, - }, - isFrontier: false, - label: "", - })); + avgCost: xAgg.avg, + avgScore: yAgg.avg, + runCount: c.n, + config: { + model: c.meta.model, + effort: c.meta.effort, + prompt_style: c.meta.prompt_style, + language: c.meta.language, + linter: c.meta.linter, + playwright: c.meta.playwright, + context_file: c.meta.context_file, + sub_agents: c.meta.sub_agents, + web_search: c.meta.web_search, + max_budget: c.meta.max_budget, + }, + isFrontier: false, + label: "", + }; + }); } function computeParetoFrontier(points: ConfigPoint[]): ConfigPoint[] { @@ -136,9 +259,13 @@ interface TooltipPayloadEntry { function CustomTooltip({ active, payload, + xConf, + yConf, }: { active?: boolean; payload?: TooltipPayloadEntry[]; + xConf: MetricDef; + yConf: MetricDef; }) { if (!active || !payload || payload.length === 0) return null; const point = payload[0]?.payload; @@ -167,14 +294,14 @@ function CustomTooltip({ {point.cell_id.split("_").filter(s => s.includes("=")).map(s => s.replace("=", ": ")).join(" ")} </div> <div style={{ marginBottom: "6px" }}> - <span style={{ color: "var(--text-muted)" }}>score: </span> + <span style={{ color: "var(--text-muted)" }}>{yConf.label}: </span> <span style={{ fontWeight: 600 }}> - {(point.avgScore * 100).toFixed(1)}% + {yConf.format(point.avgScore)} </span> </div> <div style={{ marginBottom: "6px" }}> - <span style={{ color: "var(--text-muted)" }}>cost: </span> - <span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span> + <span style={{ color: "var(--text-muted)" }}>{xConf.label}: </span> + <span style={{ fontWeight: 600 }}>{xConf.format(point.avgCost)}</span> </div> <div style={{ marginBottom: "8px" }}> <span style={{ color: "var(--text-muted)" }}>runs in cell: </span> @@ -214,11 +341,20 @@ function CustomTooltip({ ); } -export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { +export default function EfficiencyFrontier({ + runs, + defaultX = "cost", + defaultY = "outcome", +}: 
EfficiencyFrontierProps) { const [hoveredId, setHoveredId] = useState<string | null>(null); + const [xMetric, setXMetric] = useState(defaultX); + const [yMetric, setYMetric] = useState(defaultY); + + const xConf = METRIC_CONFIG[xMetric] || METRIC_CONFIG.cost; + const yConf = METRIC_CONFIG[yMetric] || METRIC_CONFIG.outcome; const points = useMemo(() => { - const raw = aggregateByConfig(runs); + const raw = aggregateByConfig(runs, xConf.cellKey, yConf.cellKey); const frontier = computeParetoFrontier(raw); const frontierIds = new Set(frontier.map((p) => p.cell_id)); @@ -227,7 +363,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { isFrontier: frontierIds.has(p.cell_id), label: frontierIds.has(p.cell_id) ? findKeyDifference(p, raw) : "", })); - }, [runs]); + }, [runs, xMetric, yMetric]); if (points.length === 0) { return ( @@ -301,16 +437,38 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { return ( <div className="card"> <h3 style={{ marginBottom: "4px" }}>Efficiency Frontier</h3> - <p + <div style={{ - color: "var(--text-muted)", - fontSize: "11px", + display: "flex", + alignItems: "center", + gap: "8px", marginBottom: "16px", + flexWrap: "wrap", }} > - Cost vs score per cell (averaged across runs). Pareto frontier - highlights cells not dominated on both axes. - </p> + <select + value={xMetric} + onChange={(e) => setXMetric(e.target.value)} + style={selectStyle} + > + {METRIC_OPTIONS.map((opt) => ( + <option key={opt.value} value={opt.value}>{opt.label}</option> + ))} + </select> + <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>vs</span> + <select + value={yMetric} + onChange={(e) => setYMetric(e.target.value)} + style={selectStyle} + > + {METRIC_OPTIONS.map((opt) => ( + <option key={opt.value} value={opt.value}>{opt.label}</option> + ))} + </select> + <span style={{ fontSize: "11px", color: "var(--text-muted)" }}> + -- Pareto frontier highlights cells not dominated on both axes. 
+ </span> + </div> {/* Legend */} <div @@ -357,13 +515,13 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { <XAxis dataKey="avgCost" type="number" - name="Avg Cost" + name={xConf.axisLabel} stroke="var(--text-muted)" fontSize={11} fontFamily="'JetBrains Mono', monospace" - tickFormatter={(v: number) => `$${v.toFixed(2)}`} + tickFormatter={(v: number) => xConf.format(v)} label={{ - value: "Avg Cost ($)", + value: xConf.axisLabel, position: "insideBottom", offset: -10, fill: "var(--text-muted)", @@ -374,14 +532,13 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { <YAxis dataKey="avgScore" type="number" - name="Avg Score" + name={yConf.axisLabel} stroke="var(--text-muted)" fontSize={11} fontFamily="'JetBrains Mono', monospace" - domain={[0, 1]} - tickFormatter={(v: number) => `${(v * 100).toFixed(0)}%`} + tickFormatter={(v: number) => yConf.format(v)} label={{ - value: "Avg Score (%)", + value: yConf.axisLabel, angle: -90, position: "insideLeft", offset: 0, @@ -390,7 +547,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { fontFamily: "'JetBrains Mono', monospace", }} /> - <Tooltip content={<CustomTooltip />} cursor={false} /> + <Tooltip content={<CustomTooltip xConf={xConf} yConf={yConf} />} cursor={false} /> {/* Non-frontier points (dimmed) */} <Scatter @@ -451,8 +608,8 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { <span style={{ color: "var(--text-muted)", marginLeft: "8px" }} > - ${point.avgCost.toFixed(2)} /{" "} - {(point.avgScore * 100).toFixed(0)}% + {xConf.format(point.avgCost)} /{" "} + {yConf.format(point.avgScore)} </span> </div> ))} diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts @@ -12,6 +12,10 @@ export interface Cell { wall_time: { avg: number; min: number; max: number }; gameplay: { avg: number; min: number; max: number }; code_quality: { avg: number; min: number; max: number }; + quality: { 
avg: number; min: number; max: number }; + structural: { avg: number; min: number; max: number }; + sonarqube: { avg: number; min: number; max: number }; + transcript: { avg: number; min: number; max: number }; } export interface EffectEntry { @@ -68,6 +72,7 @@ const METRICS: Record<string, MetricExtractor> = { structural: (r) => r.eval_results?.structural?.score ?? null, quality: (r) => r.eval_results?.quality?.score ?? null, transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null, + sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null, }; function agg(values: number[]): { avg: number; min: number; max: number } { @@ -114,6 +119,10 @@ export function groupIntoCells(runs: Run[]): Cell[] { wall_time: agg(extractVals(METRICS.wall_time)), gameplay: agg(extractVals(METRICS.gameplay)), code_quality: agg(extractVals(METRICS.code_quality)), + quality: agg(extractVals(METRICS.quality)), + structural: agg(extractVals(METRICS.structural)), + sonarqube: agg(extractVals(METRICS.sonarqube)), + transcript: agg(extractVals(METRICS.transcript)), }); }

Impressum · Datenschutz