loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

BumpChart.tsx (17657B)


      1 import { useState, useMemo } from "react";
      2 import {
      3   LineChart,
      4   Line,
      5   XAxis,
      6   YAxis,
      7   CartesianGrid,
      8   Tooltip,
      9   ResponsiveContainer,
     10   ReferenceDot,
     11 } from "recharts";
     12 import type { Run } from "../lib/types";
     13 import { AXIS_NAMES, type AxisName } from "../lib/types";
     14 import { groupIntoCells, type Cell } from "../lib/analysis";
     15 import { getModelColor } from "../lib/colors";
     16 
/** Props accepted by the {@link BumpChart} component. */
interface BumpChartProps {
  /** Benchmark runs to rank; they are grouped into cells before scoring. */
  runs: Run[];
}
     20 
/**
 * Human-readable label for every experiment axis.
 *
 * Typed as a full `Record<AxisName, string>` so that adding a new axis name
 * without a label is a compile error. The labels are shown in the condition
 * dropdown and in the "not enough data" messages.
 */
const AXIS_LABELS: Record<AxisName, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  // Individual tool toggles
  tool_read: "Read Tool",
  tool_write: "Write Tool",
  tool_edit: "Edit Tool",
  tool_glob: "Glob Tool",
  tool_grep: "Grep Tool",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};
     46 
     47 // All axes except "model" since we rank by model
     48 const CONDITION_AXES = AXIS_NAMES.filter((a) => a !== "model");
     49 
     50 interface RankedPoint {
     51   conditionValue: string;
     52   rank: number;
     53   avgScore: number;
     54   model: string;
     55   n: number;
     56 }
     57 
     58 interface CrossingPoint {
     59   conditionValue: string;
     60   x: number;
     61   rank: number;
     62   models: [string, string];
     63 }
     64 
     65 function computeRankings(
     66   runs: Run[],
     67   axis: AxisName
     68 ): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } {
     69   // Group runs into cells, then work with cell averages
     70   const cells = groupIntoCells(runs);
     71   const scoredCells = cells.filter((c) => c.score.avg > 0);
     72 
     73   // Get unique condition values for the selected axis
     74   const conditionValues = Array.from(
     75     new Set(scoredCells.map((c) => String(c.meta[axis])))
     76   ).sort();
     77 
     78   // Get unique models
     79   const models = Array.from(new Set(scoredCells.map((c) => c.meta.model))).sort();
     80 
     81   // For each condition value, compute average of cell averages per model, then rank
     82   const ranked: Record<string, RankedPoint[]> = {};
     83   for (const model of models) {
     84     ranked[model] = [];
     85   }
     86 
     87   const prevRanks: Record<string, number> = {};
     88 
     89   const crossings: CrossingPoint[] = [];
     90 
     91   for (let ci = 0; ci < conditionValues.length; ci++) {
     92     const cv = conditionValues[ci];
     93     const cellsForCondition = scoredCells.filter(
     94       (c) => String(c.meta[axis]) === cv
     95     );
     96 
     97     // Compute average of cell averages per model for this condition
     98     const modelScores: Array<{
     99       model: string;
    100       avgScore: number;
    101       n: number;
    102     }> = [];
    103     for (const model of models) {
    104       const modelCells = cellsForCondition.filter(
    105         (c) => c.meta.model === model
    106       );
    107 
    108       if (modelCells.length > 0) {
    109         const avg = modelCells.reduce((s, c) => s + c.score.avg, 0) / modelCells.length;
    110         modelScores.push({ model, avgScore: avg, n: modelCells.length });
    111       }
    112     }
    113 
    114     // Sort by avgScore descending (higher score = rank 1)
    115     modelScores.sort((a, b) => b.avgScore - a.avgScore);
    116 
    117     // Assign ranks
    118     const currentRanks: Record<string, number> = {};
    119     for (let i = 0; i < modelScores.length; i++) {
    120       const ms = modelScores[i];
    121       const rank = i + 1;
    122       currentRanks[ms.model] = rank;
    123       ranked[ms.model].push({
    124         conditionValue: cv,
    125         rank,
    126         avgScore: ms.avgScore,
    127         model: ms.model,
    128         n: ms.n,
    129       });
    130     }
    131 
    132     // Detect crossings: if any two models swapped relative rank order
    133     if (ci > 0) {
    134       for (let i = 0; i < models.length; i++) {
    135         for (let j = i + 1; j < models.length; j++) {
    136           const m1 = models[i];
    137           const m2 = models[j];
    138           const prev1 = prevRanks[m1];
    139           const prev2 = prevRanks[m2];
    140           const curr1 = currentRanks[m1];
    141           const curr2 = currentRanks[m2];
    142 
    143           if (
    144             prev1 !== undefined &&
    145             prev2 !== undefined &&
    146             curr1 !== undefined &&
    147             curr2 !== undefined
    148           ) {
    149             // Check if they crossed: relative order changed
    150             if (
    151               (prev1 < prev2 && curr1 > curr2) ||
    152               (prev1 > prev2 && curr1 < curr2)
    153             ) {
    154               // Approximate crossing rank as average of the two at the crossing point
    155               const crossRank = (curr1 + curr2) / 2;
    156               crossings.push({
    157                 conditionValue: cv,
    158                 x: ci,
    159                 rank: crossRank,
    160                 models: [m1, m2],
    161               });
    162             }
    163           }
    164         }
    165       }
    166     }
    167 
    168     Object.assign(prevRanks, currentRanks);
    169   }
    170 
    171   return { ranked, crossings };
    172 }
    173 
    174 // eslint-disable-next-line @typescript-eslint/no-explicit-any
    175 type DotProps = { cx?: number; cy?: number; payload?: any; stroke?: string };
    176 
    177 function makeRankDot(
    178   model: string,
    179   color: string,
    180   lookup: Record<string, Record<string, RankedPoint>>
    181 ) {
    182   return function RankDot({ cx, cy, payload }: DotProps) {
    183     if (cx === undefined || cy === undefined || !payload) return null;
    184     const point = lookup[model]?.[payload.conditionValue];
    185     if (!point) return null;
    186     return (
    187       <g>
    188         <circle
    189           cx={cx}
    190           cy={cy}
    191           r={5}
    192           fill={color}
    193           stroke="var(--surface-1)"
    194           strokeWidth={2}
    195         />
    196         <text
    197           x={cx + 10}
    198           y={cy - 8}
    199           fill="var(--text)"
    200           fontSize={10}
    201           fontFamily="'JetBrains Mono', monospace"
    202           textAnchor="start"
    203         >
    204           {(point.avgScore * 100).toFixed(0)}%
    205         </text>
    206       </g>
    207     );
    208   };
    209 }
    210 
    211 function CustomTooltipContent({
    212   active,
    213   payload,
    214   lookup,
    215 }: {
    216   active?: boolean;
    217   // eslint-disable-next-line @typescript-eslint/no-explicit-any
    218   payload?: Array<{ dataKey?: string; payload?: any; stroke: string }>;
    219   label?: string;
    220   lookup: Record<string, Record<string, RankedPoint>>;
    221 }) {
    222   if (!active || !payload || payload.length === 0) return null;
    223 
    224   const conditionValue = payload[0]?.payload?.conditionValue;
    225   if (!conditionValue) return null;
    226 
    227   // Resolve actual RankedPoint data from lookup
    228   const resolved = payload
    229     .filter((entry) => entry.dataKey && lookup[entry.dataKey])
    230     .map((entry) => ({
    231       point: lookup[entry.dataKey!]?.[conditionValue],
    232       stroke: entry.stroke,
    233     }))
    234     .filter((r) => r.point);
    235 
    236   const sorted = [...resolved].sort(
    237     (a, b) => a.point!.rank - b.point!.rank
    238   );
    239 
    240   return (
    241     <div
    242       style={{
    243         background: "var(--surface-1)",
    244         border: "1px solid var(--border)",
    245         padding: "8px 12px",
    246         fontFamily: "'JetBrains Mono', monospace",
    247         fontSize: "11px",
    248       }}
    249     >
    250       <div
    251         style={{
    252           color: "var(--text)",
    253           fontWeight: 600,
    254           marginBottom: "6px",
    255         }}
    256       >
    257         {conditionValue}
    258       </div>
    259       {sorted.map((entry) => (
    260         <div
    261           key={entry.point!.model}
    262           style={{
    263             display: "flex",
    264             alignItems: "center",
    265             gap: "8px",
    266             marginBottom: "2px",
    267           }}
    268         >
    269           <span
    270             style={{
    271               display: "inline-block",
    272               width: 8,
    273               height: 8,
    274               background: entry.stroke,
    275               flexShrink: 0,
    276             }}
    277           />
    278           <span style={{ color: "var(--text-muted)", width: "16px" }}>
    279             #{entry.point!.rank}
    280           </span>
    281           <span style={{ color: "var(--text)" }}>
    282             {entry.point!.model}
    283           </span>
    284           <span style={{ color: "var(--text-muted)", marginLeft: "auto" }}>
    285             {(entry.point!.avgScore * 100).toFixed(1)}% ({entry.point!.n} cells)
    286           </span>
    287         </div>
    288       ))}
    289     </div>
    290   );
    291 }
    292 
    293 export default function BumpChart({ runs }: BumpChartProps) {
    294   // Pre-compute which axes are useful: need 2+ condition values AND 2+ models with scores
    295   const validAxes = useMemo(() => {
    296     const cells = groupIntoCells(runs);
    297     const scoredCells = cells.filter((c) => c.score.avg > 0);
    298     return CONDITION_AXES.filter((axis) => {
    299       const conditionValues = Array.from(
    300         new Set(scoredCells.map((c) => String(c.meta[axis])))
    301       );
    302       if (conditionValues.length < 2) return false;
    303       // Check that at least one condition value has 2+ models with scores
    304       for (const cv of conditionValues) {
    305         const modelsWithScores = new Set(
    306           scoredCells.filter((c) => String(c.meta[axis]) === cv).map((c) => c.meta.model)
    307         );
    308         if (modelsWithScores.size >= 2) return true;
    309       }
    310       return false;
    311     });
    312   }, [runs]);
    313 
    314   const [selectedAxis, setSelectedAxis] = useState<AxisName>(
    315     validAxes.includes("prompt_style") ? "prompt_style" : validAxes[0] ?? "prompt_style"
    316   );
    317 
    318   const { ranked, crossings, conditionValues, models } = useMemo(() => {
    319     const { ranked, crossings } = computeRankings(runs, selectedAxis);
    320     const conditionValues = Array.from(
    321       new Set(runs.map((r) => String(r.meta[selectedAxis])))
    322     ).sort();
    323     const models = Object.keys(ranked).filter(
    324       (m) => ranked[m].length > 0
    325     );
    326     return { ranked, crossings, conditionValues, models };
    327   }, [runs, selectedAxis]);
    328 
    329   // Build a lookup: model -> conditionValue -> RankedPoint
    330   const pointLookup = useMemo(() => {
    331     const lookup: Record<string, Record<string, RankedPoint>> = {};
    332     for (const model of models) {
    333       lookup[model] = {};
    334       for (const point of ranked[model]) {
    335         lookup[model][point.conditionValue] = point;
    336       }
    337     }
    338     return lookup;
    339   }, [models, ranked]);
    340 
    341   // Build recharts data: one entry per condition value
    342   const chartData = useMemo(() => {
    343     return conditionValues.map((cv) => {
    344       const entry: Record<string, unknown> = { conditionValue: cv };
    345       for (const model of models) {
    346         const point = pointLookup[model]?.[cv];
    347         if (point) {
    348           entry[model] = point.rank;
    349         }
    350       }
    351       return entry;
    352     });
    353   }, [conditionValues, models, pointLookup]);
    354 
    355   const maxRank = models.length;
    356 
    357   const scoredCells = groupIntoCells(runs).filter((c) => c.score.avg > 0);
    358 
    359   if (scoredCells.length === 0) {
    360     return (
    361       <div
    362         className="card"
    363         style={{
    364           textAlign: "center",
    365           padding: "40px",
    366           color: "var(--text-muted)",
    367         }}
    368       >
    369         No scored cells available for ranking.
    370       </div>
    371     );
    372   }
    373 
    374   if (validAxes.length === 0) {
    375     return (
    376       <div className="card">
    377         <h3 style={{ margin: 0 }}>Model Rankings by Condition</h3>
    378         <div
    379           style={{
    380             textAlign: "center",
    381             padding: "40px",
    382             color: "var(--text-muted)",
    383             fontSize: "0.8rem",
    384           }}
    385         >
    386           Not enough data to compare models. Rankings need at least 2 condition
    387           values where 2 or more models have scored cells.
    388         </div>
    389       </div>
    390     );
    391   }
    392 
    393   return (
    394     <div className="card">
    395       <div
    396         style={{
    397           display: "flex",
    398           alignItems: "center",
    399           justifyContent: "space-between",
    400           marginBottom: "16px",
    401           flexWrap: "wrap",
    402           gap: "12px",
    403         }}
    404       >
    405         <div>
    406           <h3 style={{ margin: 0 }}>Model Rankings by Condition</h3>
    407           <p
    408             style={{
    409               color: "var(--text-muted)",
    410               fontSize: "0.75rem",
    411               margin: "4px 0 0",
    412             }}
    413           >
    414             Rank 1 = best average cell score. Crossings indicate rank swaps.
    415           </p>
    416         </div>
    417         <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
    418           <label
    419             style={{
    420               fontSize: "0.75rem",
    421               color: "var(--text-muted)",
    422             }}
    423           >
    424             Condition:
    425           </label>
    426           <select
    427             value={selectedAxis}
    428             onChange={(e) => setSelectedAxis(e.target.value as AxisName)}
    429             style={{
    430               background: "var(--surface-2)",
    431               color: "var(--text)",
    432               border: "1px solid var(--border)",
    433               padding: "4px 8px",
    434               fontFamily: "'JetBrains Mono', monospace",
    435               fontSize: "0.75rem",
    436               borderRadius: "0",
    437               cursor: "pointer",
    438             }}
    439           >
    440             {validAxes.map((axis) => (
    441               <option key={axis} value={axis}>
    442                 {AXIS_LABELS[axis]}
    443               </option>
    444             ))}
    445           </select>
    446         </div>
    447       </div>
    448 
    449       {conditionValues.length < 2 || models.length < 2 ? (
    450         <div
    451           style={{
    452             textAlign: "center",
    453             padding: "40px",
    454             color: "var(--text-muted)",
    455             fontSize: "0.8rem",
    456           }}
    457         >
    458           {models.length < 2
    459             ? `Need at least 2 models with scored runs for "${AXIS_LABELS[selectedAxis]}" to show rankings. Currently only: ${models.join(", ") || "none"}`
    460             : `Need at least 2 values for "${AXIS_LABELS[selectedAxis]}" to show rankings. Currently only: ${conditionValues.join(", ") || "none"}`}
    461         </div>
    462       ) : (
    463         <>
    464           <ResponsiveContainer width="100%" height={300}>
    465             <LineChart
    466               data={chartData}
    467               margin={{ top: 20, right: 60, bottom: 10, left: 10 }}
    468             >
    469               <CartesianGrid
    470                 strokeDasharray="3 3"
    471                 stroke="var(--border)"
    472                 vertical={false}
    473               />
    474               <XAxis
    475                 dataKey="conditionValue"
    476                 stroke="var(--text-muted)"
    477                 fontSize={11}
    478                 fontFamily="'JetBrains Mono', monospace"
    479                 tickLine={false}
    480                 axisLine={{ stroke: "var(--border)" }}
    481               />
    482               <YAxis
    483                 domain={[0.5, maxRank + 0.5]}
    484                 ticks={Array.from({ length: maxRank }, (_, i) => i + 1)}
    485                 reversed
    486                 stroke="var(--text-muted)"
    487                 fontSize={11}
    488                 fontFamily="'JetBrains Mono', monospace"
    489                 tickLine={false}
    490                 axisLine={{ stroke: "var(--border)" }}
    491                 label={{
    492                   value: "Rank",
    493                   angle: -90,
    494                   position: "insideLeft",
    495                   fill: "var(--text-muted)",
    496                   fontSize: 11,
    497                   fontFamily: "'JetBrains Mono', monospace",
    498                 }}
    499                 tickFormatter={(v: number) => `#${v}`}
    500               />
    501               <Tooltip
    502                 content={<CustomTooltipContent lookup={pointLookup} />}
    503                 cursor={{ stroke: "var(--border)", strokeDasharray: "3 3" }}
    504               />
    505               {models.map((model) => (
    506                 <Line
    507                   key={model}
    508                   type="linear"
    509                   dataKey={model}
    510                   stroke={getModelColor(model)}
    511                   strokeWidth={2.5}
    512                   dot={makeRankDot(
    513                     model,
    514                     getModelColor(model),
    515                     pointLookup
    516                   )}
    517                   activeDot={false}
    518                   name={model}
    519                   connectNulls
    520                 />
    521               ))}
    522               {crossings.map((crossing, i) => (
    523                 <ReferenceDot
    524                   key={`crossing-${i}`}
    525                   x={crossing.conditionValue}
    526                   y={crossing.rank}
    527                   r={10}
    528                   fill="none"
    529                   stroke="var(--yellow)"
    530                   strokeWidth={1.5}
    531                   strokeDasharray="3 2"
    532                 />
    533               ))}
    534             </LineChart>
    535           </ResponsiveContainer>
    536 
    537           {/* Legend */}
    538           <div
    539             style={{
    540               display: "flex",
    541               alignItems: "center",
    542               justifyContent: "center",
    543               gap: "20px",
    544               marginTop: "12px",
    545               flexWrap: "wrap",
    546             }}
    547           >
    548             {models.map((model) => (
    549               <div
    550                 key={model}
    551                 style={{
    552                   display: "flex",
    553                   alignItems: "center",
    554                   gap: "6px",
    555                   fontSize: "0.75rem",
    556                   fontFamily: "'JetBrains Mono', monospace",
    557                 }}
    558               >
    559                 <span
    560                   style={{
    561                     display: "inline-block",
    562                     width: 12,
    563                     height: 3,
    564                     background: getModelColor(model),
    565                   }}
    566                 />
    567                 <span style={{ color: "var(--text)" }}>{model}</span>
    568               </div>
    569             ))}
    570             {crossings.length > 0 && (
    571               <div
    572                 style={{
    573                   display: "flex",
    574                   alignItems: "center",
    575                   gap: "6px",
    576                   fontSize: "0.75rem",
    577                   fontFamily: "'JetBrains Mono', monospace",
    578                 }}
    579               >
    580                 <span
    581                   style={{
    582                     display: "inline-block",
    583                     width: 12,
    584                     height: 12,
    585                     borderRadius: "50%",
    586                     border: "1.5px dashed var(--yellow)",
    587                   }}
    588                 />
    589                 <span style={{ color: "var(--text-muted)" }}>
    590                   rank swap
    591                 </span>
    592               </div>
    593             )}
    594           </div>
    595         </>
    596       )}
    597     </div>
    598   );
    599 }

Impressum · Datenschutz