Flexible axes on scatter plots and efficiency frontier - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit ae769a448ed5b0539181c89a2b3fc575989123ae
parent d7f2fbbca814d869ee1d4f46ef35b449c22ec226
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 10:32:54 +0200

Flexible axes on scatter plots and efficiency frontier

Both ScatterPlot and EfficiencyFrontier now have dropdown selectors
for the x and y axes. Ten metrics are available: outcome, gameplay,
quality, code quality, structural, SonarQube, transcript, cost, turns,
and time.

The Cell interface is expanded with quality, structural, sonarqube, and
transcript fields. A SonarQube metric extractor is added to analysis.ts.

Default axes: cost vs outcome, turns vs outcome.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/EfficiencyFrontier.tsx  | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/lib/analysis.ts  | 9 +++++++++

2 files changed, 211 insertions(+), 45 deletions(-)
diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx
@@ -13,8 +13,119 @@ import { groupIntoCells } from "../lib/analysis";
 
 interface EfficiencyFrontierProps {
   runs: Run[];
+  defaultX?: string;
+  defaultY?: string;
 }
 
+type CellMetricKey =
+  | "cost"
+  | "score"
+  | "turns"
+  | "wall_time"
+  | "gameplay"
+  | "quality"
+  | "code_quality"
+  | "structural"
+  | "sonarqube"
+  | "transcript";
+
+interface MetricDef {
+  label: string;
+  cellKey: CellMetricKey;
+  scale: number;
+  format: (v: number) => string;
+  axisLabel: string;
+}
+
+const METRIC_CONFIG: Record<string, MetricDef> = {
+  cost: {
+    label: "Cost ($)",
+    cellKey: "cost",
+    scale: 1,
+    format: (v: number) => `$${v.toFixed(2)}`,
+    axisLabel: "Avg Cost ($)",
+  },
+  outcome: {
+    label: "Outcome Score (%)",
+    cellKey: "score",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Score (%)",
+  },
+  gameplay: {
+    label: "Gameplay (%)",
+    cellKey: "gameplay",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Gameplay (%)",
+  },
+  quality: {
+    label: "Quality (%)",
+    cellKey: "quality",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Quality (%)",
+  },
+  code_quality: {
+    label: "Code Quality (%)",
+    cellKey: "code_quality",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Code Quality (%)",
+  },
+  structural: {
+    label: "Structural (%)",
+    cellKey: "structural",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Structural (%)",
+  },
+  sonarqube: {
+    label: "SonarQube (%)",
+    cellKey: "sonarqube",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg SonarQube (%)",
+  },
+  turns: {
+    label: "Turns",
+    cellKey: "turns",
+    scale: 1,
+    format: (v: number) => `${Math.round(v)}`,
+    axisLabel: "Avg Turns",
+  },
+  wall_time: {
+    label: "Time (s)",
+    cellKey: "wall_time",
+    scale: 1,
+    format: (v: number) => `${Math.round(v)}s`,
+    axisLabel: "Avg Time (s)",
+  },
+  transcript: {
+    label: "Transcript (%)",
+    cellKey: "transcript",
+    scale: 1,
+    format: (v: number) => `${(v * 100).toFixed(0)}%`,
+    axisLabel: "Avg Transcript (%)",
+  },
+};
+
+const METRIC_OPTIONS = Object.entries(METRIC_CONFIG).map(([key, conf]) => ({
+  value: key,
+  label: conf.label,
+}));
+
+const selectStyle: React.CSSProperties = {
+  background: "var(--surface-1, hsl(217 16% 15.5%))",
+  color: "var(--text, hsl(213 14% 80%))",
+  border: "1px solid var(--border, hsl(217 17% 28%))",
+  borderRadius: "2px",
+  fontFamily: "'JetBrains Mono', monospace",
+  fontSize: "11px",
+  padding: "4px 6px",
+  cursor: "pointer",
+};
+
 const MODEL_COLORS: Record<string, string> = {
   haiku: "hsl(193 44% 67%)",
   sonnet: "hsl(40 71% 73%)",
@@ -42,32 +153,44 @@ function getModelColor(model: string): string {
   return DEFAULT_COLOR;
 }
 
-function aggregateByConfig(runs: Run[]): ConfigPoint[] {
+function aggregateByConfig(
+  runs: Run[],
+  xKey: CellMetricKey,
+  yKey: CellMetricKey,
+): ConfigPoint[] {
   const cells = groupIntoCells(runs);
 
   return cells
-    .filter((c) => c.score.avg > 0 && c.cost.avg > 0)
-    .map((c) => ({
-      cell_id: c.cell_id,
-      model: c.meta.model,
-      avgCost: c.cost.avg,
-      avgScore: c.score.avg,
-      runCount: c.n,
-      config: {
+    .filter((c) => {
+      const xAgg = c[xKey] as { avg: number; min: number; max: number };
+      const yAgg = c[yKey] as { avg: number; min: number; max: number };
+      return xAgg.avg > 0 && yAgg.avg > 0;
+    })
+    .map((c) => {
+      const xAgg = c[xKey] as { avg: number; min: number; max: number };
+      const yAgg = c[yKey] as { avg: number; min: number; max: number };
+      return {
+        cell_id: c.cell_id,
         model: c.meta.model,
-        effort: c.meta.effort,
-        prompt_style: c.meta.prompt_style,
-        language: c.meta.language,
-        linter: c.meta.linter,
-        playwright: c.meta.playwright,
-        context_file: c.meta.context_file,
-        sub_agents: c.meta.sub_agents,
-        web_search: c.meta.web_search,
-        max_budget: c.meta.max_budget,
-      },
-      isFrontier: false,
-      label: "",
-    }));
+        avgCost: xAgg.avg,
+        avgScore: yAgg.avg,
+        runCount: c.n,
+        config: {
+          model: c.meta.model,
+          effort: c.meta.effort,
+          prompt_style: c.meta.prompt_style,
+          language: c.meta.language,
+          linter: c.meta.linter,
+          playwright: c.meta.playwright,
+          context_file: c.meta.context_file,
+          sub_agents: c.meta.sub_agents,
+          web_search: c.meta.web_search,
+          max_budget: c.meta.max_budget,
+        },
+        isFrontier: false,
+        label: "",
+      };
+    });
 }
 
 function computeParetoFrontier(points: ConfigPoint[]): ConfigPoint[] {
@@ -136,9 +259,13 @@ interface TooltipPayloadEntry {
 function CustomTooltip({
   active,
   payload,
+  xConf,
+  yConf,
 }: {
   active?: boolean;
   payload?: TooltipPayloadEntry[];
+  xConf: MetricDef;
+  yConf: MetricDef;
 }) {
   if (!active || !payload || payload.length === 0) return null;
   const point = payload[0]?.payload;
@@ -167,14 +294,14 @@ function CustomTooltip({
         {point.cell_id.split("_").filter(s => s.includes("=")).map(s => s.replace("=", ": ")).join(" ")}
       </div>
       <div style={{ marginBottom: "6px" }}>
-        <span style={{ color: "var(--text-muted)" }}>score: </span>
+        <span style={{ color: "var(--text-muted)" }}>{yConf.label}: </span>
         <span style={{ fontWeight: 600 }}>
-          {(point.avgScore * 100).toFixed(1)}%
+          {yConf.format(point.avgScore)}
         </span>
       </div>
       <div style={{ marginBottom: "6px" }}>
-        <span style={{ color: "var(--text-muted)" }}>cost: </span>
-        <span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span>
+        <span style={{ color: "var(--text-muted)" }}>{xConf.label}: </span>
+        <span style={{ fontWeight: 600 }}>{xConf.format(point.avgCost)}</span>
       </div>
       <div style={{ marginBottom: "8px" }}>
         <span style={{ color: "var(--text-muted)" }}>runs in cell: </span>
@@ -214,11 +341,20 @@ function CustomTooltip({
   );
 }
 
-export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
+export default function EfficiencyFrontier({
+  runs,
+  defaultX = "cost",
+  defaultY = "outcome",
+}: EfficiencyFrontierProps) {
   const [hoveredId, setHoveredId] = useState<string | null>(null);
+  const [xMetric, setXMetric] = useState(defaultX);
+  const [yMetric, setYMetric] = useState(defaultY);
+
+  const xConf = METRIC_CONFIG[xMetric] || METRIC_CONFIG.cost;
+  const yConf = METRIC_CONFIG[yMetric] || METRIC_CONFIG.outcome;
 
   const points = useMemo(() => {
-    const raw = aggregateByConfig(runs);
+    const raw = aggregateByConfig(runs, xConf.cellKey, yConf.cellKey);
     const frontier = computeParetoFrontier(raw);
     const frontierIds = new Set(frontier.map((p) => p.cell_id));
 
@@ -227,7 +363,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
       isFrontier: frontierIds.has(p.cell_id),
       label: frontierIds.has(p.cell_id) ? findKeyDifference(p, raw) : "",
     }));
-  }, [runs]);
+  }, [runs, xMetric, yMetric]);
 
   if (points.length === 0) {
     return (
@@ -301,16 +437,38 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
   return (
     <div className="card">
       <h3 style={{ marginBottom: "4px" }}>Efficiency Frontier</h3>
-      <p
+      <div
         style={{
-          color: "var(--text-muted)",
-          fontSize: "11px",
+          display: "flex",
+          alignItems: "center",
+          gap: "8px",
           marginBottom: "16px",
+          flexWrap: "wrap",
         }}
       >
-        Cost vs score per cell (averaged across runs). Pareto frontier
-        highlights cells not dominated on both axes.
-      </p>
+        <select
+          value={xMetric}
+          onChange={(e) => setXMetric(e.target.value)}
+          style={selectStyle}
+        >
+          {METRIC_OPTIONS.map((opt) => (
+            <option key={opt.value} value={opt.value}>{opt.label}</option>
+          ))}
+        </select>
+        <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>vs</span>
+        <select
+          value={yMetric}
+          onChange={(e) => setYMetric(e.target.value)}
+          style={selectStyle}
+        >
+          {METRIC_OPTIONS.map((opt) => (
+            <option key={opt.value} value={opt.value}>{opt.label}</option>
+          ))}
+        </select>
+        <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>
+          -- Pareto frontier highlights cells not dominated on both axes.
+        </span>
+      </div>
 
       {/* Legend */}
       <div
@@ -357,13 +515,13 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
           <XAxis
             dataKey="avgCost"
             type="number"
-            name="Avg Cost"
+            name={xConf.axisLabel}
             stroke="var(--text-muted)"
             fontSize={11}
             fontFamily="'JetBrains Mono', monospace"
-            tickFormatter={(v: number) => `$${v.toFixed(2)}`}
+            tickFormatter={(v: number) => xConf.format(v)}
             label={{
-              value: "Avg Cost ($)",
+              value: xConf.axisLabel,
               position: "insideBottom",
               offset: -10,
               fill: "var(--text-muted)",
@@ -374,14 +532,13 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
           <YAxis
             dataKey="avgScore"
             type="number"
-            name="Avg Score"
+            name={yConf.axisLabel}
             stroke="var(--text-muted)"
             fontSize={11}
             fontFamily="'JetBrains Mono', monospace"
-            domain={[0, 1]}
-            tickFormatter={(v: number) => `${(v * 100).toFixed(0)}%`}
+            tickFormatter={(v: number) => yConf.format(v)}
             label={{
-              value: "Avg Score (%)",
+              value: yConf.axisLabel,
               angle: -90,
               position: "insideLeft",
               offset: 0,
@@ -390,7 +547,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
               fontFamily: "'JetBrains Mono', monospace",
             }}
           />
-          <Tooltip content={<CustomTooltip />} cursor={false} />
+          <Tooltip content={<CustomTooltip xConf={xConf} yConf={yConf} />} cursor={false} />
 
           {/* Non-frontier points (dimmed) */}
           <Scatter
@@ -451,8 +608,8 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
                 <span
                   style={{ color: "var(--text-muted)", marginLeft: "8px" }}
                 >
-                  ${point.avgCost.toFixed(2)} /{" "}
-                  {(point.avgScore * 100).toFixed(0)}%
+                  {xConf.format(point.avgCost)} /{" "}
+                  {yConf.format(point.avgScore)}
                 </span>
               </div>
             ))}
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -12,6 +12,10 @@ export interface Cell {
   wall_time: { avg: number; min: number; max: number };
   gameplay: { avg: number; min: number; max: number };
   code_quality: { avg: number; min: number; max: number };
+  quality: { avg: number; min: number; max: number };
+  structural: { avg: number; min: number; max: number };
+  sonarqube: { avg: number; min: number; max: number };
+  transcript: { avg: number; min: number; max: number };
 }
 
 export interface EffectEntry {
@@ -68,6 +72,7 @@ const METRICS: Record<string, MetricExtractor> = {
   structural: (r) => r.eval_results?.structural?.score ?? null,
   quality: (r) => r.eval_results?.quality?.score ?? null,
   transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
+  sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null,
 };
 
 function agg(values: number[]): { avg: number; min: number; max: number } {
@@ -114,6 +119,10 @@ export function groupIntoCells(runs: Run[]): Cell[] {
       wall_time: agg(extractVals(METRICS.wall_time)),
       gameplay: agg(extractVals(METRICS.gameplay)),
       code_quality: agg(extractVals(METRICS.code_quality)),
+      quality: agg(extractVals(METRICS.quality)),
+      structural: agg(extractVals(METRICS.structural)),
+      sonarqube: agg(extractVals(METRICS.sonarqube)),
+      transcript: agg(extractVals(METRICS.transcript)),
     });
   }

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M dashboard/src/components/EfficiencyFrontier.tsx  | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/lib/analysis.ts  | 9 +++++++++