loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

EfficiencyFrontier.tsx (16798B)


      1 import { useState, useMemo } from "react";
      2 import { getModelColor as sharedGetModelColor, MODEL_COLORS as SHARED_MODEL_COLORS } from "../lib/colors";
      3 import {
      4   ScatterChart,
      5   Scatter,
      6   XAxis,
      7   YAxis,
      8   CartesianGrid,
      9   Tooltip,
     10   ResponsiveContainer,
     11 } from "recharts";
     12 import type { Run } from "../lib/types";
     13 import { groupIntoCells } from "../lib/analysis";
     14 
     15 interface EfficiencyFrontierProps {
     16   runs: Run[];
     17   defaultX?: string;
     18   defaultY?: string;
     19 }
     20 
     21 type CellMetricKey =
     22   | "cost"
     23   | "score"
     24   | "turns"
     25   | "wall_time"
     26   | "gameplay"
     27   | "quality"
     28   | "code_quality"
     29   | "structural"
     30   | "sonarqube"
     31   | "transcript";
     32 
     33 interface MetricDef {
     34   label: string;
     35   cellKey: CellMetricKey;
     36   scale: number;
     37   format: (v: number) => string;
     38   axisLabel: string;
     39 }
     40 
     41 const METRIC_CONFIG: Record<string, MetricDef> = {
     42   cost: {
     43     label: "Cost ($)",
     44     cellKey: "cost",
     45     scale: 1,
     46     format: (v: number) => `$${v.toFixed(2)}`,
     47     axisLabel: "Avg Cost ($)",
     48   },
     49   outcome: {
     50     label: "Outcome Score (%)",
     51     cellKey: "score",
     52     scale: 1,
     53     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     54     axisLabel: "Avg Score (%)",
     55   },
     56   gameplay: {
     57     label: "Gameplay (%)",
     58     cellKey: "gameplay",
     59     scale: 1,
     60     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     61     axisLabel: "Avg Gameplay (%)",
     62   },
     63   quality: {
     64     label: "Quality (%)",
     65     cellKey: "quality",
     66     scale: 1,
     67     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     68     axisLabel: "Avg Quality (%)",
     69   },
     70   code_quality: {
     71     label: "Code Quality (%)",
     72     cellKey: "code_quality",
     73     scale: 1,
     74     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     75     axisLabel: "Avg Code Quality (%)",
     76   },
     77   structural: {
     78     label: "Structural (%)",
     79     cellKey: "structural",
     80     scale: 1,
     81     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     82     axisLabel: "Avg Structural (%)",
     83   },
     84   sonarqube: {
     85     label: "SonarQube (%)",
     86     cellKey: "sonarqube",
     87     scale: 1,
     88     format: (v: number) => `${(v * 100).toFixed(0)}%`,
     89     axisLabel: "Avg SonarQube (%)",
     90   },
     91   turns: {
     92     label: "Turns",
     93     cellKey: "turns",
     94     scale: 1,
     95     format: (v: number) => `${Math.round(v)}`,
     96     axisLabel: "Avg Turns",
     97   },
     98   wall_time: {
     99     label: "Time (s)",
    100     cellKey: "wall_time",
    101     scale: 1,
    102     format: (v: number) => `${Math.round(v)}s`,
    103     axisLabel: "Avg Time (s)",
    104   },
    105   transcript: {
    106     label: "Transcript (%)",
    107     cellKey: "transcript",
    108     scale: 1,
    109     format: (v: number) => `${(v * 100).toFixed(0)}%`,
    110     axisLabel: "Avg Transcript (%)",
    111   },
    112 };
    113 
    114 const METRIC_OPTIONS = Object.entries(METRIC_CONFIG).map(([key, conf]) => ({
    115   value: key,
    116   label: conf.label,
    117 }));
    118 
    119 const selectStyle: React.CSSProperties = {
    120   background: "var(--surface-1, hsl(217 16% 15.5%))",
    121   color: "var(--text, hsl(213 14% 80%))",
    122   border: "1px solid var(--border, hsl(217 17% 28%))",
    123   borderRadius: "2px",
    124   fontFamily: "'JetBrains Mono', monospace",
    125   fontSize: "11px",
    126   padding: "4px 6px",
    127   cursor: "pointer",
    128 };
    129 
    130 interface ConfigPoint {
    131   cell_id: string;
    132   model: string;
    133   avgCost: number;
    134   avgScore: number;
    135   runCount: number;
    136   config: Record<string, string>;
    137   isFrontier: boolean;
    138   label: string;
    139 }
    140 
    141 function getModelColor(model: string): string {
    142   return sharedGetModelColor(model);
    143 }
    144 
    145 function aggregateByConfig(
    146   runs: Run[],
    147   xKey: CellMetricKey,
    148   yKey: CellMetricKey,
    149 ): ConfigPoint[] {
    150   const cells = groupIntoCells(runs);
    151 
    152   return cells
    153     .filter((c) => {
    154       const xAgg = c[xKey] as { avg: number; min: number; max: number };
    155       const yAgg = c[yKey] as { avg: number; min: number; max: number };
    156       return xAgg.avg > 0 && yAgg.avg > 0;
    157     })
    158     .map((c) => {
    159       const xAgg = c[xKey] as { avg: number; min: number; max: number };
    160       const yAgg = c[yKey] as { avg: number; min: number; max: number };
    161       return {
    162         cell_id: c.cell_id,
    163         model: c.meta.model,
    164         avgCost: xAgg.avg,
    165         avgScore: yAgg.avg,
    166         runCount: c.n,
    167         config: {
    168           model: c.meta.model,
    169           effort: c.meta.effort,
    170           prompt_style: c.meta.prompt_style,
    171           language: c.meta.language,
    172           linter: c.meta.linter,
    173           playwright: c.meta.playwright,
    174           context_file: c.meta.context_file,
    175           sub_agents: c.meta.sub_agents,
    176           web_search: c.meta.web_search,
    177           max_budget: c.meta.max_budget,
    178         },
    179         isFrontier: false,
    180         label: "",
    181       };
    182     });
    183 }
    184 
    185 function computeParetoFrontier(points: ConfigPoint[]): ConfigPoint[] {
    186   const frontier: ConfigPoint[] = [];
    187 
    188   for (const p of points) {
    189     let dominated = false;
    190     for (const q of points) {
    191       if (q === p) continue;
    192       if (q.avgScore >= p.avgScore && q.avgCost <= p.avgCost) {
    193         if (q.avgScore > p.avgScore || q.avgCost < p.avgCost) {
    194           dominated = true;
    195           break;
    196         }
    197       }
    198     }
    199     if (!dominated) {
    200       frontier.push(p);
    201     }
    202   }
    203 
    204   frontier.sort((a, b) => a.avgCost - b.avgCost);
    205   return frontier;
    206 }
    207 
    208 function findKeyDifference(
    209   point: ConfigPoint,
    210   allPoints: ConfigPoint[]
    211 ): string {
    212   const configKeys = Object.keys(point.config);
    213   const valueCounts: Record<string, Record<string, number>> = {};
    214 
    215   for (const key of configKeys) {
    216     valueCounts[key] = {};
    217     for (const p of allPoints) {
    218       const val = p.config[key] || "";
    219       valueCounts[key][val] = (valueCounts[key][val] || 0) + 1;
    220     }
    221   }
    222 
    223   let bestKey = "";
    224   let bestRarity = Infinity;
    225 
    226   for (const key of configKeys) {
    227     if (key === "model") continue;
    228     const val = point.config[key];
    229     const count = valueCounts[key][val] || 0;
    230     const total = allPoints.length;
    231     const rarity = count / total;
    232     if (rarity < bestRarity && rarity < 1) {
    233       bestRarity = rarity;
    234       bestKey = key;
    235     }
    236   }
    237 
    238   if (bestKey) {
    239     return `${point.config.model} / ${bestKey}=${point.config[bestKey]}`;
    240   }
    241   return point.config.model;
    242 }
    243 
    244 interface TooltipPayloadEntry {
    245   payload?: ConfigPoint;
    246 }
    247 
    248 function CustomTooltip({
    249   active,
    250   payload,
    251   xConf,
    252   yConf,
    253 }: {
    254   active?: boolean;
    255   payload?: TooltipPayloadEntry[];
    256   xConf: MetricDef;
    257   yConf: MetricDef;
    258 }) {
    259   if (!active || !payload || payload.length === 0) return null;
    260   const point = payload[0]?.payload;
    261   if (!point) return null;
    262 
    263   return (
    264     <div
    265       style={{
    266         background: "var(--surface-1)",
    267         border: "1px solid var(--border)",
    268         padding: "12px",
    269         fontFamily: "'JetBrains Mono', monospace",
    270         fontSize: "11px",
    271         color: "var(--text)",
    272         maxWidth: "300px",
    273       }}
    274     >
    275       <div
    276         style={{
    277           fontWeight: 600,
    278           marginBottom: "8px",
    279           fontSize: "12px",
    280           color: getModelColor(point.model),
    281         }}
    282       >
    283         {point.cell_id.split("_").filter(s => s.includes("=")).map(s => s.replace("=", ": ")).join(" ")}
    284       </div>
    285       <div style={{ marginBottom: "6px" }}>
    286         <span style={{ color: "var(--text-muted)" }}>{yConf.label}: </span>
    287         <span style={{ fontWeight: 600 }}>
    288           {yConf.format(point.avgScore)}
    289         </span>
    290       </div>
    291       <div style={{ marginBottom: "6px" }}>
    292         <span style={{ color: "var(--text-muted)" }}>{xConf.label}: </span>
    293         <span style={{ fontWeight: 600 }}>{xConf.format(point.avgCost)}</span>
    294       </div>
    295       <div style={{ marginBottom: "8px" }}>
    296         <span style={{ color: "var(--text-muted)" }}>runs in cell: </span>
    297         <span>{point.runCount}</span>
    298       </div>
    299       {point.isFrontier && (
    300         <div
    301           style={{
    302             color: "var(--green)",
    303             fontWeight: 600,
    304             fontSize: "10px",
    305             textTransform: "uppercase",
    306             letterSpacing: "1px",
    307             marginBottom: "8px",
    308           }}
    309         >
    310           Pareto Frontier
    311         </div>
    312       )}
    313       <div
    314         style={{
    315           borderTop: "1px solid var(--border)",
    316           paddingTop: "8px",
    317           display: "grid",
    318           gridTemplateColumns: "auto 1fr",
    319           gap: "2px 8px",
    320         }}
    321       >
    322         {Object.entries(point.config).map(([key, val]) => (
    323           <div key={key} style={{ display: "contents" }}>
    324             <span style={{ color: "var(--text-muted)" }}>{key}:</span>
    325             <span>{val}</span>
    326           </div>
    327         ))}
    328       </div>
    329     </div>
    330   );
    331 }
    332 
    333 export default function EfficiencyFrontier({
    334   runs,
    335   defaultX = "cost",
    336   defaultY = "outcome",
    337 }: EfficiencyFrontierProps) {
    338   const [hoveredId, setHoveredId] = useState<string | null>(null);
    339   const [xMetric, setXMetric] = useState(defaultX);
    340   const [yMetric, setYMetric] = useState(defaultY);
    341 
    342   const xConf = METRIC_CONFIG[xMetric] || METRIC_CONFIG.cost;
    343   const yConf = METRIC_CONFIG[yMetric] || METRIC_CONFIG.outcome;
    344 
    345   const points = useMemo(() => {
    346     const raw = aggregateByConfig(runs, xConf.cellKey, yConf.cellKey);
    347     const frontier = computeParetoFrontier(raw);
    348     const frontierIds = new Set(frontier.map((p) => p.cell_id));
    349 
    350     return raw.map((p) => ({
    351       ...p,
    352       isFrontier: frontierIds.has(p.cell_id),
    353       label: frontierIds.has(p.cell_id) ? findKeyDifference(p, raw) : "",
    354     }));
    355   }, [runs, xMetric, yMetric]);
    356 
    357   if (points.length === 0) {
    358     return (
    359       <div
    360         className="card"
    361         style={{
    362           textAlign: "center",
    363           padding: "40px",
    364           color: "var(--text-muted)",
    365         }}
    366       >
    367         Not enough data to compute efficiency frontier.
    368       </div>
    369     );
    370   }
    371 
    372   const frontierPoints = points
    373     .filter((p) => p.isFrontier)
    374     .sort((a, b) => a.avgCost - b.avgCost);
    375   const nonFrontierPoints = points.filter((p) => !p.isFrontier);
    376 
    377   // Custom shape for non-frontier dots (small, dimmed)
    378   const nonFrontierShape = (props: {
    379     cx?: number;
    380     cy?: number;
    381     payload?: ConfigPoint;
    382   }) => {
    383     const { cx, cy, payload } = props;
    384     if (cx == null || cy == null || !payload) return null;
    385     const color = getModelColor(payload.model);
    386     const opacity =
    387       hoveredId === null ? 0.4 : hoveredId === payload.cell_id ? 1 : 0.2;
    388     return (
    389       <circle
    390         cx={cx}
    391         cy={cy}
    392         r={5}
    393         fill={color}
    394         fillOpacity={opacity}
    395         stroke="none"
    396         style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }}
    397       />
    398     );
    399   };
    400 
    401   // Custom shape for frontier dots (large, prominent, green ring)
    402   const frontierShape = (props: {
    403     cx?: number;
    404     cy?: number;
    405     payload?: ConfigPoint;
    406   }) => {
    407     const { cx, cy, payload } = props;
    408     if (cx == null || cy == null || !payload) return null;
    409     const color = getModelColor(payload.model);
    410     const opacity =
    411       hoveredId === null ? 1 : hoveredId === payload.cell_id ? 1 : 0.5;
    412     return (
    413       <circle
    414         cx={cx}
    415         cy={cy}
    416         r={9}
    417         fill={color}
    418         fillOpacity={opacity}
    419         stroke="hsl(92 28% 65%)"
    420         strokeWidth={2}
    421         style={{ cursor: "pointer", transition: "fill-opacity 0.15s" }}
    422       />
    423     );
    424   };
    425 
    426   return (
    427     <div className="card">
    428       <h3 style={{ marginBottom: "4px" }}>Efficiency Frontier</h3>
    429       <div
    430         style={{
    431           display: "flex",
    432           alignItems: "center",
    433           gap: "8px",
    434           marginBottom: "16px",
    435           flexWrap: "wrap",
    436         }}
    437       >
    438         <select
    439           value={xMetric}
    440           onChange={(e) => setXMetric(e.target.value)}
    441           style={selectStyle}
    442         >
    443           {METRIC_OPTIONS.map((opt) => (
    444             <option key={opt.value} value={opt.value}>{opt.label}</option>
    445           ))}
    446         </select>
    447         <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>vs</span>
    448         <select
    449           value={yMetric}
    450           onChange={(e) => setYMetric(e.target.value)}
    451           style={selectStyle}
    452         >
    453           {METRIC_OPTIONS.map((opt) => (
    454             <option key={opt.value} value={opt.value}>{opt.label}</option>
    455           ))}
    456         </select>
    457         <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>
    458           -- Pareto frontier highlights cells not dominated on both axes.
    459         </span>
    460       </div>
    461 
    462       {/* Legend */}
    463       <div
    464         style={{
    465           display: "flex",
    466           gap: "16px",
    467           marginBottom: "12px",
    468           fontSize: "11px",
    469           color: "var(--text-muted)",
    470           flexWrap: "wrap",
    471         }}
    472       >
    473         {Object.entries(SHARED_MODEL_COLORS).filter(([k]) => !k.startsWith("slot-")).map(([model, color]) => (
    474           <div
    475             key={model}
    476             style={{ display: "flex", alignItems: "center", gap: "6px" }}
    477           >
    478             <div
    479               style={{
    480                 width: "8px",
    481                 height: "8px",
    482                 background: color,
    483               }}
    484             />
    485             <span>{model}</span>
    486           </div>
    487         ))}
    488         <div style={{ display: "flex", alignItems: "center", gap: "6px" }}>
    489           <div
    490             style={{
    491               width: "12px",
    492               height: "12px",
    493               border: "2px solid hsl(92 28% 65%)",
    494               background: "transparent",
    495             }}
    496           />
    497           <span>frontier</span>
    498         </div>
    499       </div>
    500 
    501       <ResponsiveContainer width="100%" height={420}>
    502         <ScatterChart margin={{ top: 20, right: 30, bottom: 20, left: 20 }}>
    503           <CartesianGrid strokeDasharray="3 3" stroke="var(--border)" />
    504           <XAxis
    505             dataKey="avgCost"
    506             type="number"
    507             name={xConf.axisLabel}
    508             stroke="var(--text-muted)"
    509             fontSize={11}
    510             fontFamily="'JetBrains Mono', monospace"
    511             tickFormatter={(v: number) => xConf.format(v)}
    512             label={{
    513               value: xConf.axisLabel,
    514               position: "insideBottom",
    515               offset: -10,
    516               fill: "var(--text-muted)",
    517               fontSize: 11,
    518               fontFamily: "'JetBrains Mono', monospace",
    519             }}
    520           />
    521           <YAxis
    522             dataKey="avgScore"
    523             type="number"
    524             name={yConf.axisLabel}
    525             stroke="var(--text-muted)"
    526             fontSize={11}
    527             fontFamily="'JetBrains Mono', monospace"
    528             tickFormatter={(v: number) => yConf.format(v)}
    529             label={{
    530               value: yConf.axisLabel,
    531               angle: -90,
    532               position: "insideLeft",
    533               offset: 0,
    534               fill: "var(--text-muted)",
    535               fontSize: 11,
    536               fontFamily: "'JetBrains Mono', monospace",
    537             }}
    538           />
    539           <Tooltip content={<CustomTooltip xConf={xConf} yConf={yConf} />} cursor={false} />
    540 
    541           {/* Non-frontier points (dimmed) */}
    542           <Scatter
    543             name="cells"
    544             data={nonFrontierPoints}
    545             shape={nonFrontierShape}
    546             isAnimationActive={false}
    547             legendType="none"
    548           />
    549 
    550           {/* Frontier points (prominent) with connecting line */}
    551           <Scatter
    552             name="frontier"
    553             data={frontierPoints}
    554             shape={frontierShape}
    555             isAnimationActive={false}
    556             legendType="none"
    557             line={{ stroke: "hsl(92 28% 65%)", strokeWidth: 1.5, strokeDasharray: "6 3" }}
    558             lineType="joint"
    559             lineJointType="linear"
    560           />
    561         </ScatterChart>
    562       </ResponsiveContainer>
    563 
    564       {/* Frontier labels below the chart */}
    565       {frontierPoints.length > 0 && (
    566         <div
    567           style={{
    568             marginTop: "12px",
    569             display: "flex",
    570             flexWrap: "wrap",
    571             gap: "8px",
    572           }}
    573         >
    574           {frontierPoints
    575             .sort((a, b) => a.avgCost - b.avgCost)
    576             .map((point) => (
    577               <div
    578                 key={point.cell_id}
    579                 onMouseEnter={() => setHoveredId(point.cell_id)}
    580                 onMouseLeave={() => setHoveredId(null)}
    581                 style={{
    582                   padding: "4px 8px",
    583                   background: "var(--surface-2)",
    584                   border: "1px solid var(--border)",
    585                   fontSize: "10px",
    586                   fontFamily: "'JetBrains Mono', monospace",
    587                   color: getModelColor(point.model),
    588                   cursor: "default",
    589                   transition: "border-color 0.15s",
    590                   borderColor:
    591                     hoveredId === point.cell_id
    592                       ? "hsl(92 28% 65%)"
    593                       : "var(--border)",
    594                 }}
    595               >
    596                 {point.label}
    597                 <span
    598                   style={{ color: "var(--text-muted)", marginLeft: "8px" }}
    599                 >
    600                   {xConf.format(point.avgCost)} /{" "}
    601                   {yConf.format(point.avgScore)}
    602                 </span>
    603               </div>
    604             ))}
    605         </div>
    606       )}
    607     </div>
    608   );
    609 }

Impressum · Datenschutz