loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

CorrelationMatrix.tsx (9500B)


      1 import type { Run } from "../lib/types";
      2 import { groupIntoCells, type Cell } from "../lib/analysis";
      3 
      4 interface CorrelationMatrixProps {
      5   runs: Run[];
      6 }
      7 
      8 const CONFIG_AXES = [
      9   { key: "model", label: "Model" },
     10   { key: "effort", label: "Effort" },
     11   { key: "prompt_style", label: "Prompt Style" },
     12   { key: "language", label: "Language" },
     13   { key: "tool_read", label: "Read Tool" },
     14   { key: "tool_write", label: "Write Tool" },
     15   { key: "tool_edit", label: "Edit Tool" },
     16   { key: "tool_glob", label: "Glob Tool" },
     17   { key: "tool_grep", label: "Grep Tool" },
     18   { key: "linter", label: "Linter" },
     19   { key: "playwright", label: "Playwright" },
     20   { key: "context_file", label: "Context File" },
     21   { key: "web_search", label: "Web Search" },
     22   { key: "max_budget", label: "Budget" },
     23   { key: "tests_provided", label: "Tests Provided" },
     24   { key: "strategy", label: "Strategy" },
     25   { key: "design_guidance", label: "Design Guidance" },
     26   { key: "architecture", label: "Architecture" },
     27   { key: "error_checking", label: "Error Checking" },
     28   { key: "context_noise", label: "Context Noise" },
     29   { key: "renderer", label: "Renderer" },
     30   { key: "provider", label: "Provider" },
     31 ] as const;
     32 
     33 type MetricExtractor = (run: Run) => number | null;
     34 
     35 const OUTCOME_METRICS: Array<{ key: string; label: string; lowerIsBetter: boolean; extract: MetricExtractor }> = [
     36   { key: "overall", label: "Overall", lowerIsBetter: false, extract: (r) => r.eval_results?.score ?? null },
     37   { key: "gameplay", label: "Gameplay", lowerIsBetter: false, extract: (r) => (r.eval_results as Record<string, any>)?.gameplay_bot?.score ?? null },
     38   { key: "code", label: "Code", lowerIsBetter: false, extract: (r) => (r.eval_results as Record<string, any>)?.code_analysis?.score ?? null },
     39   { key: "structural", label: "Structural", lowerIsBetter: false, extract: (r) => r.eval_results?.structural?.score ?? null },
     40   { key: "quality", label: "Quality", lowerIsBetter: false, extract: (r) => r.eval_results?.quality?.score ?? null },
     41   { key: "transcript", label: "Transcript", lowerIsBetter: false, extract: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null },
     42   { key: "cost", label: "Cost \u2193", lowerIsBetter: true, extract: (r) => r.claude_output?.total_cost_usd ?? null },
     43   { key: "turns", label: "Turns \u2193", lowerIsBetter: true, extract: (r) => r.claude_output?.num_turns ?? null },
     44   { key: "time", label: "Time \u2193", lowerIsBetter: true, extract: (r) => r.meta.wall_time_seconds ?? null },
     45 ];
     46 
     47 function computeSpread(cells: Cell[], axisKey: string, extract: MetricExtractor): number | null {
     48   // Compute per-cell metric averages, then group by axis value
     49   const groups: Record<string, number[]> = {};
     50   for (const cell of cells) {
     51     const vals: number[] = [];
     52     for (const run of cell.runs) {
     53       const v = extract(run);
     54       if (v !== null) vals.push(v);
     55     }
     56     if (vals.length === 0) continue;
     57     const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
     58     const groupKey = String((cell.meta as Record<string, unknown>)[axisKey] ?? "unknown");
     59     (groups[groupKey] ??= []).push(cellAvg);
     60   }
     61 
     62   const keys = Object.keys(groups);
     63   if (keys.length < 2) return null;
     64 
     65   const means = keys.map((k) => {
     66     const vals = groups[k];
     67     return vals.reduce((a, b) => a + b, 0) / vals.length;
     68   });
     69 
     70   return Math.max(...means) - Math.min(...means);
     71 }
     72 
     73 export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
     74   if (runs.length === 0) {
     75     return (
     76       <div
     77         className="card"
     78         style={{
     79           textAlign: "center",
     80           padding: "40px",
     81           color: "var(--text-muted)",
     82         }}
     83       >
     84         No data available for correlation analysis.
     85       </div>
     86     );
     87   }
     88 
     89   // Group runs into cells once, then compute spreads from cell averages
     90   const cells = groupIntoCells(runs);
     91 
     92   // Compute the full matrix: rows = config axes, columns = metrics
     93   const matrix: Array<{
     94     key: string;
     95     label: string;
     96     spreads: Array<number | null>;
     97     maxSpread: number;
     98   }> = [];
     99 
    100   for (const axis of CONFIG_AXES) {
    101     const spreads = OUTCOME_METRICS.map((metric) =>
    102       computeSpread(cells, axis.key, metric.extract)
    103     );
    104     const validSpreads = spreads.filter((s): s is number => s !== null);
    105     const maxSpread = validSpreads.length > 0 ? Math.max(...validSpreads) : 0;
    106     matrix.push({ key: axis.key, label: axis.label, spreads, maxSpread });
    107   }
    108 
    109   // Sort rows by maximum spread (most impactful variable first)
    110   matrix.sort((a, b) => b.maxSpread - a.maxSpread);
    111 
    112   // Find global max spread for color scaling
    113   const globalMax = Math.max(...matrix.map((r) => r.maxSpread), 0.001);
    114 
    115   return (
    116     <div className="card" style={{ padding: "20px" }}>
    117       <h3 style={{ marginBottom: "4px" }}>Variable Impact Matrix</h3>
    118       <p
    119         style={{
    120           color: "var(--text-muted)",
    121           fontSize: "0.75rem",
    122           marginBottom: "16px",
    123         }}
    124       >
    125         Effect size (spread) of each configuration variable on each outcome.
    126         Sorted by maximum impact. Stronger color = larger effect.
    127       </p>
    128 
    129       <div style={{ overflowX: "auto" }}>
    130         <table
    131           style={{
    132             borderCollapse: "collapse",
    133             width: "auto",
    134             minWidth: "100%",
    135           }}
    136         >
    137           <thead>
    138             <tr>
    139               <th
    140                 style={{
    141                   padding: "6px 12px",
    142                   fontSize: "11px",
    143                   textAlign: "right",
    144                   background: "var(--surface-2)",
    145                   borderBottom: "1px solid var(--border)",
    146                   borderRight: "1px solid var(--border)",
    147                   position: "sticky",
    148                   left: 0,
    149                   zIndex: 1,
    150                 }}
    151               >
    152                 Variable
    153               </th>
    154               {OUTCOME_METRICS.map((metric) => (
    155                 <th
    156                   key={metric.key}
    157                   style={{
    158                     padding: "6px 8px",
    159                     fontSize: "11px",
    160                     textAlign: "center",
    161                     background: "var(--surface-2)",
    162                     borderBottom: "1px solid var(--border)",
    163                     fontFamily: "var(--font-mono)",
    164                     fontWeight: 500,
    165                     color: "var(--text-muted)",
    166                     textTransform: "uppercase",
    167                     letterSpacing: "0.5px",
    168                     whiteSpace: "nowrap",
    169                   }}
    170                 >
    171                   {metric.label}
    172                 </th>
    173               ))}
    174             </tr>
    175           </thead>
    176           <tbody>
    177             {matrix.map((row) => (
    178               <tr key={row.key} style={{ background: "transparent" }}>
    179                 <td
    180                   style={{
    181                     padding: "5px 12px",
    182                     fontSize: "11px",
    183                     fontFamily: "var(--font-mono)",
    184                     fontWeight: 500,
    185                     textAlign: "right",
    186                     whiteSpace: "nowrap",
    187                     borderBottom: "1px solid var(--border)",
    188                     borderRight: "1px solid var(--border)",
    189                     background: "var(--surface-1)",
    190                     position: "sticky",
    191                     left: 0,
    192                     zIndex: 1,
    193                   }}
    194                 >
    195                   {row.label}
    196                 </td>
    197                 {row.spreads.map((spread, i) => {
    198                   if (spread === null) {
    199                     return (
    200                       <td
    201                         key={OUTCOME_METRICS[i].key}
    202                         style={{
    203                           padding: "5px 8px",
    204                           textAlign: "center",
    205                           fontSize: "11px",
    206                           fontFamily: "var(--font-mono)",
    207                           color: "var(--text-muted)",
    208                           borderBottom: "1px solid var(--border)",
    209                         }}
    210                       >
    211                         --
    212                       </td>
    213                     );
    214                   }
    215 
    216                   const opacity = Math.min(spread / globalMax, 1) * 0.7 + 0.05;
    217                   const isScoreMetric = !["cost", "turns", "time"].includes(
    218                     OUTCOME_METRICS[i].key
    219                   );
    220                   const displayValue = isScoreMetric
    221                     ? `${(spread * 100).toFixed(1)}%`
    222                     : OUTCOME_METRICS[i].key === "cost"
    223                       ? `$${spread.toFixed(2)}`
    224                       : OUTCOME_METRICS[i].key === "time"
    225                         ? `${Math.round(spread)}s`
    226                         : spread.toFixed(1);
    227 
    228                   return (
    229                     <td
    230                       key={OUTCOME_METRICS[i].key}
    231                       style={{
    232                         padding: "5px 8px",
    233                         textAlign: "center",
    234                         fontSize: "11px",
    235                         fontFamily: "var(--font-mono)",
    236                         fontWeight: 600,
    237                         color: "var(--text)",
    238                         borderBottom: "1px solid var(--border)",
    239                         background: `rgba(136, 192, 208, ${opacity})`,
    240                       }}
    241                     >
    242                       {displayValue}
    243                     </td>
    244                   );
    245                 })}
    246               </tr>
    247             ))}
    248           </tbody>
    249         </table>
    250       </div>
    251     </div>
    252   );
    253 }

Impressum · Datenschutz