loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

Charts.tsx (13409B)


      1 import { useState, useMemo } from "react";
      2 import {
      3   ComposedChart,
      4   Bar,
      5   Scatter,
      6   XAxis,
      7   YAxis,
      8   CartesianGrid,
      9   Tooltip,
     10   ResponsiveContainer,
     11   Cell,
     12 } from "recharts";
     13 import type { Run } from "../lib/types";
     14 import { getModelColor, modelSortOrder } from "../lib/colors";
     15 import { confidenceInterval } from "../lib/analysis";
     16 import ModelSelector from "./ModelSelector";
     17 
/** Props accepted by the default-exported `Charts` component. */
interface ChartsProps {
  /** Benchmark runs to chart; they are filtered by the selected models before aggregation. */
  runs: Run[];
}
     21 
/**
 * One row of chart data: the box-plot summary for a single model.
 * Rows are produced by `aggregateByModel`; score values are per-cell average
 * scores multiplied by 100 and rounded.
 */
interface BoxPlotData {
  /** X-axis key in the form `<model>|(n=<runCount>)`; the axis tick and tooltip split it on "|". */
  label: string;
  /** Smallest per-cell score (0 when there are no scores). */
  min: number;
  /** First quartile of the per-cell scores. */
  q1: number;
  /** Median of the per-cell scores. */
  median: number;
  /** Third quartile of the per-cell scores. */
  q3: number;
  /** Largest per-cell score (0 when there are no scores). */
  max: number;
  /** Number of cells that contributed to this model. */
  cellCount: number;
  /** Total number of runs across those cells. */
  runCount: number;
  /** The per-cell scores the statistics were computed from. */
  scores: number[];
  // Derived fields for the recharts stacked-bar trick: a transparent bar of
  // height `base` lifts the visible bar of height `iqr` so it spans q1..q3.
  base: number;    // invisible bar height = q1
  iqr: number;     // visible box height = q3 - q1
  /** Stroke/fill colour for this model, from `getModelColor`. */
  color: string;
  // Confidence interval of the mean, as returned by `confidenceInterval(scores)`
  // (labelled "95% CI" in the tooltip).
  ciMean: number;
  ciLower: number;
  ciUpper: number;
}
     41 
     42 
// Named HSL colour tokens used for the chart's axes, grid lines, tooltip and
// box-plot strokes. Values are CSS `hsl()` strings.
const SMUI = {
  // Background surfaces
  surface0: "hsl(213 16% 12%)",
  surface1: "hsl(217 16% 15.5%)",
  surface2: "hsl(216 15% 19%)",
  // Lines and secondary text
  border: "hsl(217 17% 28%)",
  muted: "hsl(213 14% 65%)",
  // Accent colours
  frost1: "hsl(176 25% 65%)",
  frost2: "hsl(193 44% 67%)",
  frost3: "hsl(210 34% 63%)",
  frost4: "hsl(213 32% 52%)",
  green: "hsl(92 28% 65%)",
  yellow: "hsl(40 71% 73%)",
  red: "hsl(355 52% 64%)",
  purple: "hsl(311 24% 63%)",
};
     58 
// Per-model colours are resolved with getModelColor (imported from ../lib/colors).
     60 
// Inline style applied to the outer <div> of the custom box-plot tooltip.
const TOOLTIP_STYLE = {
  background: SMUI.surface1,
  border: `1px solid ${SMUI.border}`,
  borderRadius: "0",
  fontFamily: "'JetBrains Mono', monospace",
  fontSize: "11px",
  padding: "8px 12px",
};
     69 
/** Per-cell summary produced by `aggregateCells`: all runs sharing one `meta.cell_id`. */
interface CellAggregate {
  /** The shared `meta.cell_id` of the grouped runs. */
  cell_id: string;
  /** `meta.actual_model` (falling back to `meta.model`) of the first run seen for the cell. */
  model: string;
  /** `meta.task` of the first run seen for the cell. */
  task: string;
  /** Mean of the runs' `eval_results.score` values; 0 when no run has a score. */
  avgScore: number;
  /** Mean of the runs' `claude_output.total_cost_usd` values; 0 when no run has a cost. */
  avgCost: number;
  /** Fraction of all runs in the cell whose `eval_results.functional.pass` is truthy. */
  passRate: number;
  /** Total number of runs in the cell, scored or not. */
  runCount: number;
}
     79 
     80 function aggregateCells(runs: Run[]): CellAggregate[] {
     81   const byCell: Record<string, {
     82     model: string;
     83     task: string;
     84     scores: number[];
     85     costs: number[];
     86     passes: number;
     87     total: number;
     88   }> = {};
     89 
     90   for (const run of runs) {
     91     const cellId = run.meta.cell_id;
     92     if (!byCell[cellId]) {
     93       byCell[cellId] = {
     94         model: run.meta.actual_model || run.meta.model,
     95         task: run.meta.task,
     96         scores: [],
     97         costs: [],
     98         passes: 0,
     99         total: 0,
    100       };
    101     }
    102 
    103     byCell[cellId].total++;
    104     if (run.eval_results?.score != null) {
    105       byCell[cellId].scores.push(run.eval_results.score);
    106     }
    107     if (run.claude_output?.total_cost_usd != null) {
    108       byCell[cellId].costs.push(run.claude_output.total_cost_usd);
    109     }
    110     if (run.eval_results?.functional?.pass) {
    111       byCell[cellId].passes++;
    112     }
    113   }
    114 
    115   return Object.entries(byCell).map(([cell_id, data]) => ({
    116     cell_id,
    117     model: data.model,
    118     task: data.task,
    119     avgScore: data.scores.length > 0
    120       ? data.scores.reduce((a, b) => a + b, 0) / data.scores.length
    121       : 0,
    122     avgCost: data.costs.length > 0
    123       ? data.costs.reduce((a, b) => a + b, 0) / data.costs.length
    124       : 0,
    125     passRate: data.total > 0
    126       ? data.passes / data.total
    127       : 0,
    128     runCount: data.total,
    129   }));
    130 }
    131 
    132 // MODEL_ORDER imported via modelSortOrder from ../lib/colors
    133 
    134 function quantile(sorted: number[], q: number): number {
    135   if (sorted.length === 0) return 0;
    136   if (sorted.length === 1) return sorted[0];
    137   const pos = q * (sorted.length - 1);
    138   const lo = Math.floor(pos);
    139   const hi = Math.ceil(pos);
    140   const frac = pos - lo;
    141   return sorted[lo] + frac * (sorted[hi] - sorted[lo]);
    142 }
    143 
    144 function computeBoxStats(values: number[]): { min: number; q1: number; median: number; q3: number; max: number } {
    145   const sorted = [...values].sort((a, b) => a - b);
    146   return {
    147     min: sorted.length > 0 ? sorted[0] : 0,
    148     q1: quantile(sorted, 0.25),
    149     median: quantile(sorted, 0.5),
    150     q3: quantile(sorted, 0.75),
    151     max: sorted.length > 0 ? sorted[sorted.length - 1] : 0,
    152   };
    153 }
    154 
    155 function aggregateByModel(runs: Run[]): BoxPlotData[] {
    156   const cells = aggregateCells(runs);
    157   const byModel: Record<string, CellAggregate[]> = {};
    158 
    159   for (const cell of cells) {
    160     if (!byModel[cell.model]) byModel[cell.model] = [];
    161     byModel[cell.model].push(cell);
    162   }
    163 
    164   const sortedEntries = Object.entries(byModel).sort(([a], [b]) =>
    165     modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b)
    166   );
    167 
    168   return sortedEntries.map(([model, modelCells]) => {
    169     const scores = modelCells.map((c) => Math.round(c.avgScore * 100));
    170     const totalRuns = modelCells.reduce((sum, c) => sum + c.runCount, 0);
    171     const stats = computeBoxStats(scores);
    172     const baseModel = model;
    173     const ci = confidenceInterval(scores);
    174     return {
    175       label: `${model}|(n=${totalRuns})`,
    176       ...stats,
    177       base: stats.q1,
    178       iqr: stats.q3 - stats.q1,
    179       cellCount: modelCells.length,
    180       runCount: totalRuns,
    181       scores,
    182       color: getModelColor(baseModel),
    183       ciMean: ci.mean,
    184       ciLower: ci.lower,
    185       ciUpper: ci.upper,
    186     };
    187   });
    188 }
    189 
    190 
    191 // Custom shape: draws a box from q1 to q3 with whiskers from min to max and a median line
    192 function BoxPlotShape(props: any) {
    193   const { x, y, width, height, payload } = props as {
    194     x: number; y: number; width: number; height: number;
    195     payload: BoxPlotData;
    196   };
    197   if (!payload || height === undefined) return null;
    198 
    199   const { min, median, max, color, cellCount, ciLower, ciUpper, ciMean } = payload;
    200   const lowN = cellCount < 3;
    201   const boxOpacity = lowN ? 0.4 : 1;
    202   // The bar is rendered from q1 (base) with height iqr (q3-q1).
    203   // y is the top of the bar (q3 in chart coords), y+height is the bottom (q1).
    204   const boxTop = y;
    205   const boxBottom = y + height;
    206   const boxQ3 = payload.q3;
    207   const boxQ1 = payload.q1;
    208   const centerX = x + width / 2;
    209 
    210   // Scale: we need to convert data values to pixel positions.
    211   // We know q1 maps to boxBottom and q3 maps to boxTop.
    212   const dataToY = (val: number): number => {
    213     if (boxQ3 === boxQ1) return boxTop;
    214     return boxTop + ((boxQ3 - val) / (boxQ3 - boxQ1)) * (boxBottom - boxTop);
    215   };
    216 
    217   const minY = dataToY(min);
    218   const maxY = dataToY(max);
    219   const medianY = dataToY(median);
    220   const whiskerHalfW = width * 0.3;
    221 
    222   return (
    223     <g opacity={boxOpacity}>
    224       {/* Whisker line: min to max */}
    225       <line x1={centerX} y1={minY} x2={centerX} y2={maxY} stroke={SMUI.muted} strokeWidth={1} />
    226       {/* Min whisker cap */}
    227       <line x1={centerX - whiskerHalfW} y1={minY} x2={centerX + whiskerHalfW} y2={minY} stroke={SMUI.muted} strokeWidth={1} />
    228       {/* Max whisker cap */}
    229       <line x1={centerX - whiskerHalfW} y1={maxY} x2={centerX + whiskerHalfW} y2={maxY} stroke={SMUI.muted} strokeWidth={1} />
    230       {/* Box (IQR) -- dashed stroke when low sample size */}
    231       <rect x={x} y={boxTop} width={width} height={Math.max(height, 1)} fill={color} fillOpacity={0.3} stroke={color} strokeWidth={1} strokeDasharray={lowN ? "4 2" : undefined} />
    232       {/* Median line */}
    233       <line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} />
    234       {/* 95% CI band on the mean */}
    235       {ciLower !== ciUpper && (() => {
    236         const ciTopY = dataToY(Math.min(ciUpper, max));
    237         const ciBotY = dataToY(Math.max(ciLower, min));
    238         const ciMeanY = dataToY(ciMean);
    239         const ciHalfW = width * 0.45;
    240         return (
    241           <>
    242             {/* Shaded CI band */}
    243             <rect
    244               x={centerX - ciHalfW}
    245               y={ciTopY}
    246               width={ciHalfW * 2}
    247               height={Math.max(ciBotY - ciTopY, 1)}
    248               fill={color}
    249               fillOpacity={0.2}
    250               stroke="none"
    251             />
    252             {/* CI vertical line */}
    253             <line x1={centerX} y1={ciTopY} x2={centerX} y2={ciBotY} stroke={color} strokeWidth={1.5} strokeDasharray="2 2" />
    254             {/* CI top cap */}
    255             <line x1={centerX - 4} y1={ciTopY} x2={centerX + 4} y2={ciTopY} stroke={color} strokeWidth={1.5} />
    256             {/* CI bottom cap */}
    257             <line x1={centerX - 4} y1={ciBotY} x2={centerX + 4} y2={ciBotY} stroke={color} strokeWidth={1.5} />
    258             {/* Mean dot */}
    259             <circle cx={centerX} cy={ciMeanY} r={2.5} fill={color} stroke="none" />
    260           </>
    261         );
    262       })()}
    263     </g>
    264   );
    265 }
    266 
    267 
    268 
    269 
    270 // Custom tooltip for model box plot
    271 function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; payload?: Array<{ payload: BoxPlotData }>; label?: string }) {
    272   if (!active || !payload || payload.length === 0) return null;
    273   const d = payload[0].payload;
    274   return (
    275     <div style={TOOLTIP_STYLE}>
    276       <div style={{ marginBottom: 4, fontWeight: 600 }}>{label?.split("|")[0]}</div>
    277       <div style={{ marginBottom: 4, color: SMUI.muted, fontSize: 10 }}>n={d.runCount} runs across {d.cellCount} cells</div>
    278       <div>Max: {d.max}%</div>
    279       <div>Q3: {Math.round(d.q3)}%</div>
    280       <div>Median: {Math.round(d.median)}%</div>
    281       <div>Q1: {Math.round(d.q1)}%</div>
    282       <div>Min: {d.min}%</div>
    283       {d.ciLower !== d.ciUpper && (
    284         <div style={{ marginTop: 4, borderTop: `1px solid ${SMUI.border}`, paddingTop: 4 }}>
    285           <div style={{ color: SMUI.frost2 }}>Mean: {Math.round(d.ciMean)}%</div>
    286           <div style={{ color: SMUI.frost2 }}>95% CI: [{Math.round(d.ciLower)}% - {Math.round(d.ciUpper)}%]</div>
    287         </div>
    288       )}
    289     </div>
    290   );
    291 }
    292 
    293 
    294 export default function Charts({ runs }: ChartsProps) {
    295   // Extract unique models sorted consistently
    296   const allModels = useMemo(() => {
    297     const models = new Set<string>();
    298     for (const run of runs) {
    299       models.add(run.meta.actual_model || run.meta.model);
    300     }
    301     return [...models].sort((a, b) => modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b));
    302   }, [runs]);
    303 
    304   const [selectedModels, setSelectedModels] = useState<Set<string>>(() => new Set(allModels));
    305 
    306   if (runs.length === 0) {
    307     return (
    308       <div className="card" style={{ textAlign: "center", padding: "40px", color: SMUI.muted }}>
    309         No data to chart yet.
    310       </div>
    311     );
    312   }
    313 
    314   const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model));
    315   const modelData = aggregateByModel(filteredRuns);
    316 
    317   const detectableDifference = useMemo(() => {
    318     const ciWidths = modelData
    319       .filter((d) => d.ciLower !== d.ciUpper)
    320       .map((d) => d.ciUpper - d.ciLower);
    321     if (ciWidths.length === 0) return null;
    322     return Math.round(Math.max(...ciWidths));
    323   }, [modelData]);
    324 
    325   return (
    326     <div className="card">
    327       <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}>
    328         <div>
    329           <h3 style={{ margin: 0 }}>Score Distribution by Model</h3>
    330           <div style={{ fontSize: "10px", color: "var(--text-muted, hsl(213 14% 65%))", fontFamily: "'JetBrains Mono', monospace", marginTop: "2px" }}>
    331             (n={filteredRuns.length} runs across {modelData.reduce((sum, d) => sum + d.cellCount, 0)} cells)
    332           </div>
    333         </div>
    334         <ModelSelector
    335           allModels={allModels}
    336           selectedModels={selectedModels}
    337           onChange={setSelectedModels}
    338         />
    339       </div>
    340       <ResponsiveContainer width="100%" height={270}>
    341         <ComposedChart data={modelData} barCategoryGap="20%">
    342           <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} />
    343           <XAxis
    344             dataKey="label"
    345             stroke={SMUI.muted}
    346             tickLine={false}
    347             axisLine={{ stroke: SMUI.border }}
    348             interval={0}
    349             tick={({ x, y, payload }: any) => {
    350               const [name, count] = (payload.value as string).split("|");
    351               return (
    352                 <g>
    353                   <text x={x} y={y + 12} textAnchor="middle" fill={SMUI.muted} fontSize={10} fontFamily="'JetBrains Mono', monospace">{name}</text>
    354                   <text x={x} y={y + 24} textAnchor="middle" fill={SMUI.muted} fontSize={8} fontFamily="'JetBrains Mono', monospace" opacity={0.6}>{count}</text>
    355                 </g>
    356               );
    357             }}
    358             height={40}
    359           />
    360           <YAxis
    361             stroke={SMUI.muted}
    362             fontSize={11}
    363             fontFamily="'JetBrains Mono', monospace"
    364             domain={[0, 100]}
    365             tickLine={false}
    366             axisLine={false}
    367             yAxisId="score"
    368           />
    369           <Tooltip content={<ModelBoxTooltipContent />} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
    370           {/* Invisible base bar to push the visible box up to q1 */}
    371           <Bar dataKey="base" stackId="box" fill="transparent" barSize={40} yAxisId="score" />
    372           {/* Visible IQR box with custom shape for whiskers and median */}
    373           <Bar dataKey="iqr" stackId="box" barSize={40} yAxisId="score" shape={<BoxPlotShape />}>
    374             {modelData.map((entry) => (
    375               <Cell key={entry.label} fill={entry.color} />
    376             ))}
    377           </Bar>
    378           {/* Hidden scatter to keep recharts scale consistent */}
    379           <Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" />
    380         </ComposedChart>
    381       </ResponsiveContainer>
    382       {detectableDifference != null && (
    383         <div style={{
    384           fontSize: "10px",
    385           fontFamily: "'JetBrains Mono', monospace",
    386           color: SMUI.muted,
    387           marginTop: "4px",
    388           textAlign: "center",
    389         }}>
    390           Detectable difference: differences of ±{detectableDifference}% are statistically significant with current data
    391         </div>
    392       )}
    393     </div>
    394   );
    395 }

Impressum · Datenschutz