loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

TornadoChart.tsx (13712B)


      1 import type { Run } from "../lib/types";
      2 import type { AxisEffect } from "../lib/analysis";
      3 import { groupIntoCells, confidenceInterval } from "../lib/analysis";
      4 
/** Props accepted by the TornadoChart component. */
interface TornadoChartProps {
  /** Per-axis effects to draw; one labelled row group is rendered per entry, in the order given. */
  effects: AxisEffect[];
  /** Metric name shown in the heading and used to select the matching entry of METRIC_EXTRACTORS. */
  metric: string;
  /** Total number of runs; the "(n=… runs across … cells)" line is shown only when this and totalCells are both provided. */
  totalRuns?: number;
  /** Total number of cells; see totalRuns. */
  totalCells?: number;
  /** Raw runs; when provided, confidence intervals are computed from them and drawn as whiskers. */
  runs?: Run[];
}
     12 
/**
 * Human-readable display labels keyed by axis name.
 * The chart falls back to the raw axis name when a key is missing here.
 */
const AXIS_LABELS: Record<string, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Read Tool",
  tool_write: "Write Tool",
  tool_edit: "Edit Tool",
  tool_glob: "Glob Tool",
  tool_grep: "Grep Tool",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};
     38 
     39 // Metric extractors matching analysis.ts
     40 const METRIC_EXTRACTORS: Record<string, (r: Run) => number | null> = {
     41   score: (r) => r.eval_results?.score ?? null,
     42   cost: (r) => r.claude_output?.total_cost_usd ?? null,
     43   turns: (r) => r.claude_output?.num_turns ?? null,
     44   wall_time: (r) => r.meta.wall_time_seconds ?? null,
     45   gameplay: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null,
     46   code_quality: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null,
     47   structural: (r) => r.eval_results?.structural?.score ?? null,
     48   quality: (r) => r.eval_results?.quality?.score ?? null,
     49   transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
     50   sonarqube: (r) => (r.eval_results as Record<string, any>)?.sonarqube?.score ?? null,
     51 };
     52 
// Keys of a run's meta that computeEffectCIs excludes when enumerating axes.
// NOTE(review): these look like identity/bookkeeping fields rather than
// experiment variables — confirm the list stays in sync with analysis.ts.
const SKIP_KEYS = new Set([
  "task", "cell_id", "run_id", "run_number", "runs_per_cell",
  "max_budget_usd", "timeout_seconds", "base_tools", "started_at",
  "completed_at", "wall_time_seconds", "exit_code", "short_id",
  "short_cell_id", "claude_version", "sub_agents", "actual_model",
]);
     59 
/**
 * Confidence interval of one axis value's effect, expressed relative to the
 * grand mean (interval bound minus grand mean).
 * Groups with fewer than two cells get the placeholder
 * { ciLower: 0, ciUpper: 0, crossesZero: true }.
 */
interface EffectCI {
  /** Lower interval bound minus the grand mean. */
  ciLower: number;
  /** Upper interval bound minus the grand mean. */
  ciUpper: number;
  /** True when ciLower <= 0 and ciUpper >= 0, i.e. the interval contains zero. */
  crossesZero: boolean;
}
     65 
     66 function computeEffectCIs(
     67   runs: Run[],
     68   metric: string
     69 ): Record<string, Record<string, EffectCI>> {
     70   const extract = METRIC_EXTRACTORS[metric];
     71   if (!extract || runs.length === 0) return {};
     72 
     73   const cells = groupIntoCells(runs);
     74 
     75   // Get per-cell averages
     76   const cellData: Array<{ meta: Run["meta"]; avg: number }> = [];
     77   for (const cell of cells) {
     78     const vals: number[] = [];
     79     for (const run of cell.runs) {
     80       const v = extract(run);
     81       if (v !== null) vals.push(v);
     82     }
     83     if (vals.length === 0) continue;
     84     cellData.push({ meta: cell.meta, avg: vals.reduce((a, b) => a + b, 0) / vals.length });
     85   }
     86 
     87   if (cellData.length === 0) return {};
     88 
     89   const grandMean = cellData.reduce((s, c) => s + c.avg, 0) / cellData.length;
     90 
     91   const axisKeys = Object.keys(cellData[0].meta).filter((k) => !SKIP_KEYS.has(k));
     92   const result: Record<string, Record<string, EffectCI>> = {};
     93 
     94   for (const axis of axisKeys) {
     95     const groups: Record<string, number[]> = {};
     96     for (const { meta, avg } of cellData) {
     97       const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
     98       (groups[key] ??= []).push(avg);
     99     }
    100 
    101     result[axis] = {};
    102     for (const [val, avgs] of Object.entries(groups)) {
    103       if (avgs.length < 2) {
    104         result[axis][val] = { ciLower: 0, ciUpper: 0, crossesZero: true };
    105         continue;
    106       }
    107       // Compute CI of the effect (mean - grandMean)
    108       const ci = confidenceInterval(avgs);
    109       const effectLower = ci.lower - grandMean;
    110       const effectUpper = ci.upper - grandMean;
    111       result[axis][val] = {
    112         ciLower: effectLower,
    113         ciUpper: effectUpper,
    114         crossesZero: effectLower <= 0 && effectUpper >= 0,
    115       };
    116     }
    117   }
    118 
    119   return result;
    120 }
    121 
    122 export default function TornadoChart({ effects, metric, totalRuns, totalCells, runs }: TornadoChartProps) {
    123   if (effects.length === 0) {
    124     return (
    125       <div
    126         className="card"
    127         style={{
    128           textAlign: "center",
    129           padding: "40px",
    130           color: "var(--text-muted)",
    131         }}
    132       >
    133         Not enough data to compute effects. Run more experiments with varying
    134         configurations.
    135       </div>
    136     );
    137   }
    138 
    139   // Compute CIs if runs are provided
    140   const effectCIs = runs ? computeEffectCIs(runs, metric) : {};
    141 
    142   // Scale must account for variance bands and CI whiskers extending beyond effect bars
    143   const ciExtents = Object.values(effectCIs).flatMap((axisCIs) =>
    144     Object.values(axisCIs).map((ci) => Math.max(Math.abs(ci.ciLower), Math.abs(ci.ciUpper)))
    145   );
    146   const maxExtent = Math.max(
    147     ...effects.flatMap((e) =>
    148       e.values.map((v) => Math.abs(v.effect) + v.variance)
    149     ),
    150     ...ciExtents
    151   );
    152   const scale = maxExtent > 0 ? 200 / maxExtent : 1;
    153 
    154   return (
    155     <div className="card">
    156       <h3 style={{ marginBottom: "4px" }}>Variable Impact on {metric}</h3>
    157       {totalRuns != null && totalCells != null && (
    158         <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))", marginBottom: "4px" }}>
    159           (n={totalRuns} runs across {totalCells} cells)
    160         </div>
    161       )}
    162       <p
    163         style={{
    164           color: "var(--text-muted)",
    165           fontSize: "0.75rem",
    166           marginBottom: "16px",
    167         }}
    168       >
    169         Sorted by effect size. Solid bars show effect (deviation from grand
    170         mean). Shaded bands show within-cell variance.
    171       </p>
    172 
    173       {effects.map((effect) => (
    174         <div
    175           key={effect.axis}
    176           style={{
    177             display: "flex",
    178             alignItems: "center",
    179             marginBottom: "12px",
    180             gap: "12px",
    181           }}
    182         >
    183           {/* Label */}
    184           <div
    185             style={{
    186               width: "120px",
    187               textAlign: "right",
    188               fontSize: "0.8rem",
    189               flexShrink: 0,
    190             }}
    191           >
    192             {AXIS_LABELS[effect.axis] || effect.axis}
    193           </div>
    194 
    195           {/* Bars */}
    196           <div
    197             style={{
    198               flex: 1,
    199               display: "flex",
    200               flexDirection: "column",
    201               gap: "2px",
    202             }}
    203           >
    204             {effect.values.map((entry) => {
    205               const ci = effectCIs[effect.axis]?.[entry.value];
    206               const crossesZero = ci?.crossesZero ?? true;
    207               const effectWidth = Math.abs(entry.effect) * scale;
    208               const varianceBandWidth =
    209                 (Math.abs(entry.effect) + entry.variance) * scale;
    210               const isPositive = entry.effect >= 0;
    211               const isLowN = entry.n < 3;
    212               // CI whisker positions (in px from left edge of bar area)
    213               const ciLowerPx = ci ? Math.abs(ci.ciLower) * scale : 0;
    214               const ciUpperPx = ci ? Math.abs(ci.ciUpper) * scale : 0;
    215               // For the whisker, we show the full CI extent
    216               const ciMaxPx = ci ? Math.max(ciLowerPx, ciUpperPx) : 0;
    217               const ciMinPx = ci ? Math.min(ciLowerPx, ciUpperPx) : 0;
    218               // Dim bars where CI crosses zero (effect not significant)
    219               const notSignificant = ci && crossesZero && !isLowN;
    220               const barContainerWidth = Math.max(varianceBandWidth, effectWidth, ciMaxPx, 2);
    221               return (
    222                 <div
    223                   key={entry.value}
    224                   style={{
    225                     display: "flex",
    226                     alignItems: "center",
    227                     gap: "8px",
    228                     opacity: isLowN ? 0.4 : notSignificant ? 0.5 : 1,
    229                   }}
    230                 >
    231                   <div
    232                     style={{
    233                       width: "50px",
    234                       textAlign: "right",
    235                       fontSize: "0.7rem",
    236                       fontFamily: "var(--font-mono)",
    237                       color: "var(--text-muted)",
    238                       flexShrink: 0,
    239                     }}
    240                   >
    241                     {entry.value}
    242                   </div>
    243                   <div
    244                     style={{
    245                       position: "relative",
    246                       height: "16px",
    247                       width: `${barContainerWidth}px`,
    248                     }}
    249                   >
    250                     {/* Variance band (behind, wider, semi-transparent) */}
    251                     {entry.variance > 0 && (
    252                       <div
    253                         style={{
    254                           position: "absolute",
    255                           top: "1px",
    256                           left: 0,
    257                           height: "14px",
    258                           width: `${Math.max(varianceBandWidth, 2)}px`,
    259                           background: isPositive
    260                             ? "var(--green)"
    261                             : "var(--red)",
    262                           opacity: 0.15,
    263                           borderRadius: "2px",
    264                         }}
    265                       />
    266                     )}
    267                     {/* Effect bar (foreground, solid) */}
    268                     <div
    269                       style={{
    270                         position: "absolute",
    271                         top: 0,
    272                         left: 0,
    273                         height: "16px",
    274                         width: `${Math.max(effectWidth, 2)}px`,
    275                         background: isPositive
    276                           ? "var(--green)"
    277                           : "var(--red)",
    278                         borderRadius: "2px",
    279                         opacity: 0.8,
    280                         ...(isLowN ? { borderStyle: "dashed", borderWidth: "1px", borderColor: isPositive ? "var(--green)" : "var(--red)" } : {}),
    281                       }}
    282                     />
    283                     {/* CI whisker */}
    284                     {ci && !isLowN && ciMaxPx > 0 && (
    285                       <>
    286                         {/* Whisker line */}
    287                         <div
    288                           style={{
    289                             position: "absolute",
    290                             top: "7px",
    291                             left: `${ciMinPx}px`,
    292                             width: `${Math.max(ciMaxPx - ciMinPx, 1)}px`,
    293                             height: "2px",
    294                             background: "var(--text-muted)",
    295                           }}
    296                         />
    297                         {/* Left cap */}
    298                         <div
    299                           style={{
    300                             position: "absolute",
    301                             top: "4px",
    302                             left: `${ciMinPx}px`,
    303                             width: "1px",
    304                             height: "8px",
    305                             background: "var(--text-muted)",
    306                           }}
    307                         />
    308                         {/* Right cap */}
    309                         <div
    310                           style={{
    311                             position: "absolute",
    312                             top: "4px",
    313                             left: `${ciMaxPx}px`,
    314                             width: "1px",
    315                             height: "8px",
    316                             background: "var(--text-muted)",
    317                           }}
    318                         />
    319                       </>
    320                     )}
    321                   </div>
    322                   <div
    323                     style={{
    324                       fontSize: "0.7rem",
    325                       fontFamily: "var(--font-mono)",
    326                       color: isPositive ? "var(--green)" : "var(--red)",
    327                       whiteSpace: "nowrap",
    328                     }}
    329                   >
    330                     {entry.effect >= 0 ? "+" : ""}
    331                     {(entry.effect * 100).toFixed(1)}%
    332                     {entry.variance > 0 && (
    333                       <span
    334                         style={{
    335                           color: "var(--text-muted)",
    336                           marginLeft: "4px",
    337                         }}
    338                       >
    339                         ±{(entry.variance * 100).toFixed(1)}%
    340                       </span>
    341                     )}
    342                     {ci && !isLowN && (
    343                       <span
    344                         style={{
    345                           color: crossesZero ? "var(--yellow)" : "var(--text-muted)",
    346                           marginLeft: "4px",
    347                           fontSize: "0.6rem",
    348                         }}
    349                       >
    350                         CI [{(ci.ciLower * 100).toFixed(1)}, {(ci.ciUpper * 100).toFixed(1)}]
    351                         {crossesZero ? " n.s." : ""}
    352                       </span>
    353                     )}
    354                   </div>
    355                   <div
    356                     style={{
    357                       fontSize: "0.65rem",
    358                       fontFamily: "var(--font-mono)",
    359                       color: isLowN ? "var(--yellow)" : "var(--text-muted)",
    360                       whiteSpace: "nowrap",
    361                       fontWeight: isLowN ? 600 : 400,
    362                     }}
    363                   >
    364                     n={entry.n} cell{entry.n !== 1 ? "s" : ""}
    365                   </div>
    366                 </div>
    367               );
    368             })}
    369           </div>
    370 
    371           {/* Spread */}
    372           <div
    373             style={{
    374               width: "60px",
    375               textAlign: "right",
    376               fontSize: "0.75rem",
    377               fontFamily: "var(--font-mono)",
    378               color: "var(--accent)",
    379               flexShrink: 0,
    380             }}
    381           >
    382             {(effect.spread * 100).toFixed(1)}%
    383           </div>
    384         </div>
    385       ))}
    386     </div>
    387   );
    388 }

Impressum · Datenschutz