Variability.tsx - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

Variability.tsx (21521B)
      1 import { useMemo } from "react";
      2 import type { Run, AxisName } from "../lib/types";
      3 import { AXIS_NAMES } from "../lib/types";
      4 import { groupIntoCells } from "../lib/analysis";
      5 import type { Cell } from "../lib/analysis";
      6 
/** Props accepted by the `Variability` view. */
interface VariabilityProps {
  /** Runs to analyse; the component groups them into cells with `groupIntoCells`. */
  runs: Run[];
}
     10 
/**
 * Human-readable labels for axis keys.
 *
 * Looked up by the reliability table, which falls back to the raw axis key
 * when no label is registered here.
 */
const AXIS_LABELS: Record<string, string> = {
  model: "Model",
  effort: "Effort",
  prompt_style: "Prompt Style",
  language: "Language",
  human_language: "Human Language",
  tool_read: "Read Tool",
  tool_write: "Write Tool",
  tool_edit: "Edit Tool",
  tool_glob: "Glob Tool",
  tool_grep: "Grep Tool",
  linter: "Linter",
  playwright: "Playwright",
  context_file: "Context File",
  web_search: "Web Search",
  max_budget: "Budget",
  tests_provided: "Tests Provided",
  strategy: "Strategy",
  design_guidance: "Design Guidance",
  architecture: "Architecture",
  error_checking: "Error Checking",
  context_noise: "Context Noise",
  renderer: "Renderer",
  provider: "Provider",
};
     36 
     37 /* ---------- helpers ---------- */
     38 
     39 function quantile(sorted: number[], q: number): number {
     40   if (sorted.length === 0) return 0;
     41   if (sorted.length === 1) return sorted[0];
     42   const pos = q * (sorted.length - 1);
     43   const lo = Math.floor(pos);
     44   const hi = Math.ceil(pos);
     45   if (lo === hi) return sorted[lo];
     46   return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
     47 }
     48 
     49 function variance(values: number[]): number {
     50   if (values.length < 2) return 0;
     51   const mean = values.reduce((a, b) => a + b, 0) / values.length;
     52   return values.reduce((s, v) => s + (v - mean) ** 2, 0) / values.length;
     53 }
     54 
     55 /* ---------- Section 1: Box Plots ---------- */
     56 
/** Five-number summary of positive cell-average scores for one model. */
interface BoxPlotStats {
  /** Model name taken from `cell.meta.model`. */
  model: string;
  /** Smallest positive cell average. */
  min: number;
  /** First quartile (interpolated). */
  q1: number;
  /** Median (interpolated). */
  median: number;
  /** Third quartile (interpolated). */
  q3: number;
  /** Largest positive cell average. */
  max: number;
  /** All positive cell averages for the model, sorted ascending. */
  points: number[];
  /** Number of cells that contributed a positive score (same as `points.length`). */
  cellCount: number;
}
     67 
     68 function computeBoxPlots(cells: Cell[]): BoxPlotStats[] {
     69   const MODEL_ORDER: Record<string, number> = { haiku: 1, sonnet: 2, opus: 3 };
     70   const models = Array.from(new Set(cells.map((c) => c.meta.model)))
     71     .sort((a, b) => (MODEL_ORDER[a] || 99) - (MODEL_ORDER[b] || 99));
     72   const results: BoxPlotStats[] = [];
     73 
     74   for (const model of models) {
     75     const modelCells = cells.filter((c) => c.meta.model === model);
     76     const scores = modelCells
     77       .map((c) => c.score.avg)
     78       .filter((s) => s > 0)
     79       .sort((a, b) => a - b);
     80 
     81     if (scores.length === 0) continue;
     82 
     83     results.push({
     84       model,
     85       min: scores[0],
     86       q1: quantile(scores, 0.25),
     87       median: quantile(scores, 0.5),
     88       q3: quantile(scores, 0.75),
     89       max: scores[scores.length - 1],
     90       points: scores,
     91       cellCount: scores.length,
     92     });
     93   }
     94 
     95   return results;
     96 }
     97 
/**
 * Renders one horizontal box plot per model on a shared score scale.
 *
 * Each row shows the model label with its median and cell count, a min-max
 * whisker, the Q1-Q3 box, the median marker and one dot per cell score.
 * Shows a placeholder message when no model has a scored cell.
 */
function BoxPlotSection({ cells }: { cells: Cell[] }) {
  const stats = useMemo(() => computeBoxPlots(cells), [cells]);

  if (stats.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        No scored cells available.
      </div>
    );
  }

  // Global scale across all models
  const globalMin = Math.min(...stats.map((s) => s.min));
  const globalMax = Math.max(...stats.map((s) => s.max));
  // "|| 0.01" keeps the divisor non-zero when every score is identical.
  const range = globalMax - globalMin || 0.01;

  // Maps a score to a horizontal position (0-100) on the shared scale.
  const toPercent = (v: number) => ((v - globalMin) / range) * 100;

  return (
    <div>
      {/* Axis labels */}
      <div
        style={{
          display: "flex",
          justifyContent: "space-between",
          marginBottom: "4px",
          paddingLeft: "140px",
          paddingRight: "12px",
        }}
      >
        <span style={axisLabelStyle}>{(globalMin * 100).toFixed(0)}%</span>
        <span style={axisLabelStyle}>{(globalMax * 100).toFixed(0)}%</span>
      </div>

      {stats.map((s) => {
        // Rows backed by fewer than 3 cells are dimmed and their caption is highlighted.
        const isLowN = s.cellCount < 3;
        return (
        <div
          key={s.model}
          style={{
            display: "flex",
            alignItems: "center",
            marginBottom: "16px",
            gap: "12px",
            opacity: isLowN ? 0.4 : 1,
          }}
        >
          {/* Label */}
          <div style={{ width: "120px", textAlign: "right", flexShrink: 0 }}>
            <div style={labelPrimaryStyle}>{s.model}</div>
            <div style={{ ...labelSecondaryStyle, color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)" }}>
              median {(s.median * 100).toFixed(1)}% / n={s.cellCount} cell{s.cellCount !== 1 ? "s" : ""}
            </div>
          </div>

          {/* Box plot */}
          <div
            style={{
              flex: 1,
              position: "relative",
              height: "32px",
              marginRight: "12px",
            }}
          >
            {/* Background track */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: 0,
                right: 0,
                height: "1px",
                background: "hsl(var(--border))",
                transform: "translateY(-50%)",
              }}
            />

            {/* Whisker line: min to max */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.min)}%`,
                width: `${toPercent(s.max) - toPercent(s.min)}%`,
                height: "2px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translateY(-50%)",
              }}
            />

            {/* Min whisker cap */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.min)}%`,
                width: "1px",
                height: "12px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* Max whisker cap */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.max)}%`,
                width: "1px",
                height: "12px",
                background: "var(--accent)",
                opacity: 0.5,
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* IQR box: Q1 to Q3 */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.q1)}%`,
                // Minimum width of 0.5% so the box stays visible when Q1 equals Q3.
                width: `${Math.max(toPercent(s.q3) - toPercent(s.q1), 0.5)}%`,
                height: "18px",
                background: "var(--accent)",
                opacity: 0.2,
                border: "1px solid var(--accent)",
                transform: "translateY(-50%)",
              }}
            />

            {/* Median line */}
            <div
              style={{
                position: "absolute",
                top: "50%",
                left: `${toPercent(s.median)}%`,
                width: "2px",
                height: "22px",
                background: "var(--accent)",
                transform: "translate(-50%, -50%)",
              }}
            />

            {/* Individual cell dots */}
            {s.points.map((p, i) => (
              <div
                key={i}
                style={{
                  position: "absolute",
                  top: "50%",
                  left: `${toPercent(p)}%`,
                  width: "5px",
                  height: "5px",
                  borderRadius: "50%",
                  background: "var(--accent)",
                  opacity: 0.6,
                  transform: "translate(-50%, -50%)",
                  zIndex: 1,
                }}
              />
            ))}
          </div>
        </div>
        );
      })}
    </div>
  );
}
    270 
    271 /* ---------- Section 2: Reliability Ranking ---------- */
    272 
/** One row of the reliability table: a single value of a single axis. */
interface ReliabilityRow {
  /** Axis key (an entry of `AXIS_NAMES`). */
  axis: string;
  /** Stringified axis value shared by the grouped cells ("unknown" when absent). */
  value: string;
  /** Mean of `score.avg` over the group's cells that have a positive average. */
  avgScore: number;
  /** Mean of `score.range` over the same cells. */
  avgRange: number;
  /** Number of cells with a positive average in the group. */
  n: number;
}
    280 
    281 function computeReliability(cells: Cell[]): ReliabilityRow[] {
    282   const rows: ReliabilityRow[] = [];
    283 
    284   for (const axis of AXIS_NAMES) {
    285     const groups: Record<string, { scores: number[]; ranges: number[] }> = {};
    286     for (const cell of cells) {
    287       const val = String(
    288         (cell.meta as Record<string, unknown>)[axis] ?? "unknown"
    289       );
    290       const g = (groups[val] ??= { scores: [], ranges: [] });
    291       if (cell.score.avg > 0) {
    292         g.scores.push(cell.score.avg);
    293         g.ranges.push(cell.score.range);
    294       }
    295     }
    296 
    297     for (const [val, { scores, ranges }] of Object.entries(groups)) {
    298       if (scores.length < 2) continue;
    299       rows.push({
    300         axis,
    301         value: val,
    302         avgScore: scores.reduce((a, b) => a + b, 0) / scores.length,
    303         avgRange: ranges.reduce((a, b) => a + b, 0) / ranges.length,
    304         n: scores.length,
    305       });
    306     }
    307   }
    308 
    309   return rows.sort((a, b) => a.avgRange - b.avgRange);
    310 }
    311 
    312 function reliabilityColor(avgRange: number): string {
    313   if (avgRange <= 0.05) return "var(--green)";
    314   if (avgRange <= 0.12) return "var(--yellow)";
    315   return "var(--red)";
    316 }
    317 
/**
 * Renders the reliability table: one row per (axis, value) pair with its cell
 * count, average score, average range and a bar proportional to that range.
 *
 * Rows arrive pre-sorted from `computeReliability`. Shows a placeholder
 * message when no rows are available.
 */
function ReliabilitySection({ cells }: { cells: Cell[] }) {
  const rows = useMemo(() => computeReliability(cells), [cells]);

  if (rows.length === 0) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough multi-run cells to compute reliability.
      </div>
    );
  }

  // Floor of 0.01 keeps the bar-width divisor non-zero when every avgRange is 0.
  const maxRange = Math.max(...rows.map((r) => r.avgRange), 0.01);

  return (
    <div style={{ overflowX: "auto" }}>
      <table style={{ borderCollapse: "collapse", width: "100%" }}>
        <thead>
          <tr>
            {["VARIABLE", "VALUE", "N", "AVG SCORE", "AVG RANGE", "RELIABILITY"].map(
              (h) => (
                <th key={h} style={thStyle}>
                  {h}
                </th>
              )
            )}
          </tr>
        </thead>
        <tbody>
          {rows.map((row, i) => {
            // Bar length is relative to the largest avgRange in the table.
            const barWidth = (row.avgRange / maxRange) * 100;
            const color = reliabilityColor(row.avgRange);
            // Rows backed by fewer than 3 cells are dimmed and their N is highlighted.
            const isLowN = row.n < 3;
            return (
              <tr
                key={`${row.axis}-${row.value}`}
                style={{
                  borderBottom: "1px solid hsl(var(--border))",
                  // Zebra striping on odd rows.
                  background:
                    i % 2 === 0 ? "transparent" : "hsl(var(--border) / 0.1)",
                  opacity: isLowN ? 0.4 : 1,
                }}
              >
                <td style={tdStyle}>
                  {AXIS_LABELS[row.axis] || row.axis}
                </td>
                <td style={{ ...tdStyle, fontFamily: "var(--font-mono)" }}>
                  {row.value}
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                    color: isLowN ? "var(--yellow, hsl(40 95% 64%))" : "var(--text-muted)",
                    fontWeight: isLowN ? 600 : 400,
                  }}
                >
                  {row.n}
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                  }}
                >
                  {(row.avgScore * 100).toFixed(1)}%
                </td>
                <td
                  style={{
                    ...tdStyle,
                    fontFamily: "var(--font-mono)",
                    textAlign: "right",
                    color,
                  }}
                >
                  {(row.avgRange * 100).toFixed(1)}%
                </td>
                <td style={{ ...tdStyle, width: "200px" }}>
                  <div
                    style={{
                      position: "relative",
                      height: "12px",
                      background: "hsl(var(--border) / 0.2)",
                    }}
                  >
                    <div
                      style={{
                        position: "absolute",
                        top: 0,
                        left: 0,
                        height: "100%",
                        // At least 1% wide so a zero-range row still shows a sliver.
                        width: `${Math.max(barWidth, 1)}%`,
                        background: color,
                        opacity: 0.7,
                      }}
                    />
                  </div>
                </td>
              </tr>
            );
          })}
        </tbody>
      </table>
    </div>
  );
}
    425 
    426 /* ---------- Section 3: Variance Contribution ---------- */
    427 
/** Result of splitting total score variance into within-cell and between-cell parts. */
interface VarianceDecomp {
  /** Variance of all individual positive run scores. */
  totalVariance: number;
  /** Within-cell variance estimate (spread among runs of the same cell). */
  withinVariance: number;
  /** Remainder of `totalVariance` after subtracting `withinVariance`; never negative. */
  betweenVariance: number;
  /** `betweenVariance / totalVariance`. */
  betweenPct: number;
  /** `withinVariance / totalVariance`. */
  withinPct: number;
}
    435 
    436 function computeVarianceDecomp(
    437   runs: Run[],
    438   cells: Cell[]
    439 ): VarianceDecomp | null {
    440   // All individual run scores
    441   const allScores = runs
    442     .map((r) => r.eval_results?.score ?? null)
    443     .filter((s): s is number => s !== null && s > 0);
    444 
    445   if (allScores.length < 2) return null;
    446 
    447   const totalVar = variance(allScores);
    448   if (totalVar === 0) return null;
    449 
    450   // Within-cell variance: average variance within each cell
    451   const cellVariances: number[] = [];
    452   for (const cell of cells) {
    453     const scores = cell.runs
    454       .map((r) => r.eval_results?.score ?? null)
    455       .filter((s): s is number => s !== null && s > 0);
    456     if (scores.length >= 2) {
    457       cellVariances.push(variance(scores));
    458     }
    459   }
    460 
    461   const withinVar =
    462     cellVariances.length > 0
    463       ? cellVariances.reduce((a, b) => a + b, 0) / cellVariances.length
    464       : 0;
    465 
    466   const betweenVar = Math.max(totalVar - withinVar, 0);
    467   const betweenPct = totalVar > 0 ? betweenVar / totalVar : 0;
    468   const withinPct = totalVar > 0 ? withinVar / totalVar : 0;
    469 
    470   return {
    471     totalVariance: totalVar,
    472     withinVariance: withinVar,
    473     betweenVariance: betweenVar,
    474     betweenPct,
    475     withinPct,
    476   };
    477 }
    478 
/**
 * Renders the variance decomposition as a two-segment stacked bar (config
 * choices vs. run-to-run randomness), a legend with both percentages and a
 * one-sentence interpretation.
 *
 * The interpretation switches on `betweenPct`: at least 0.5, at least 0.3, or
 * lower. Shows a placeholder message when no decomposition is available.
 */
function VarianceSection({
  runs,
  cells,
}: {
  runs: Run[];
  cells: Cell[];
}) {
  const decomp = useMemo(
    () => computeVarianceDecomp(runs, cells),
    [runs, cells]
  );

  if (!decomp) {
    return (
      <div style={{ color: "var(--text-muted)", padding: "20px" }}>
        Not enough data to decompose variance.
      </div>
    );
  }

  return (
    <div>
      <div
        style={{
          display: "flex",
          height: "32px",
          marginBottom: "12px",
          border: "1px solid hsl(var(--border))",
        }}
      >
        {/* Between-cell (config choices) */}
        <div
          style={{
            width: `${decomp.betweenPct * 100}%`,
            background: "var(--accent)",
            opacity: 0.7,
            display: "flex",
            alignItems: "center",
            justifyContent: "center",
            fontSize: "11px",
            fontFamily: "var(--font-mono)",
            color: "var(--text)",
            fontWeight: 600,
            // Segments at or below 8% get an explicit 0px minWidth and no inline label.
            minWidth: decomp.betweenPct > 0.08 ? undefined : "0px",
            overflow: "hidden",
            whiteSpace: "nowrap",
          }}
        >
          {decomp.betweenPct > 0.08 &&
            `${(decomp.betweenPct * 100).toFixed(0)}%`}
        </div>
        {/* Within-cell (randomness) */}
        <div
          style={{
            width: `${decomp.withinPct * 100}%`,
            background: "var(--yellow)",
            opacity: 0.5,
            display: "flex",
            alignItems: "center",
            justifyContent: "center",
            fontSize: "11px",
            fontFamily: "var(--font-mono)",
            color: "var(--text)",
            fontWeight: 600,
            // Same 8% threshold as the between-cell segment.
            minWidth: decomp.withinPct > 0.08 ? undefined : "0px",
            overflow: "hidden",
            whiteSpace: "nowrap",
          }}
        >
          {decomp.withinPct > 0.08 &&
            `${(decomp.withinPct * 100).toFixed(0)}%`}
        </div>
      </div>

      <div
        style={{
          display: "flex",
          gap: "24px",
          flexWrap: "wrap",
        }}
      >
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--accent)",
              opacity: 0.7,
            }}
          />
          <span style={legendStyle}>
            CONFIG CHOICES: {(decomp.betweenPct * 100).toFixed(0)}%
          </span>
        </div>
        <div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
          <div
            style={{
              width: "12px",
              height: "12px",
              background: "var(--yellow)",
              opacity: 0.5,
            }}
          />
          <span style={legendStyle}>
            RANDOMNESS: {(decomp.withinPct * 100).toFixed(0)}%
          </span>
        </div>
      </div>

      <p
        style={{
          marginTop: "12px",
          fontSize: "12px",
          color: "var(--text-muted)",
          lineHeight: "1.5",
        }}
      >
        {decomp.betweenPct >= 0.5
          ? `Configuration choices explain ${(decomp.betweenPct * 100).toFixed(0)}% of score variance. The config matters more than run-to-run randomness.`
          : decomp.betweenPct >= 0.3
            ? `Configuration and randomness contribute roughly equally. Scores are moderately sensitive to config choices.`
            : `Run-to-run randomness dominates (${(decomp.withinPct * 100).toFixed(0)}%). Config choices have limited impact on scores -- results are noisy.`}
      </p>
    </div>
  );
}
    605 
    606 /* ---------- shared styles ---------- */
    607 
/** Uppercase monospace eyebrow label shown at the top of each section card. */
const sectionHeaderStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.08em",
  color: "var(--text-muted)",
  marginBottom: "4px",
};

/** Title line of a section card. */
const sectionTitleStyle: React.CSSProperties = {
  fontSize: "16px",
  fontWeight: 600,
  marginBottom: "4px",
};

/** Muted explanatory paragraph under a section title. */
const sectionDescStyle: React.CSSProperties = {
  fontSize: "12px",
  color: "var(--text-muted)",
  marginBottom: "16px",
  lineHeight: "1.4",
};

/** Bordered container used for every section and for the empty state. */
const cardStyle: React.CSSProperties = {
  border: "1px solid hsl(var(--border))",
  padding: "20px",
  marginBottom: "16px",
  background: "var(--surface-1)",
};

/** Min/max percentage labels above the box plots. */
const axisLabelStyle: React.CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
};

/** Model name in a box-plot row label. */
const labelPrimaryStyle: React.CSSProperties = {
  fontSize: "13px",
  fontFamily: "var(--font-mono)",
  fontWeight: 600,
  color: "var(--text)",
};

/** Median / cell-count caption under the model name. */
const labelSecondaryStyle: React.CSSProperties = {
  fontSize: "10px",
  fontFamily: "var(--font-mono)",
  color: "var(--text-muted)",
  marginTop: "2px",
};

/** Header cells of the reliability table. */
const thStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
  padding: "8px 12px",
  textAlign: "left",
  borderBottom: "1px solid hsl(var(--border))",
  fontWeight: 500,
};

/** Base style for body cells of the reliability table. */
const tdStyle: React.CSSProperties = {
  fontSize: "12px",
  padding: "6px 12px",
  color: "var(--text)",
};

/** Legend text under the variance bar. */
const legendStyle: React.CSSProperties = {
  fontSize: "11px",
  fontFamily: "var(--font-mono)",
  textTransform: "uppercase",
  letterSpacing: "0.06em",
  color: "var(--text-muted)",
};
    684 
    685 /* ---------- Main Component ---------- */
    686 
/**
 * Variability view: groups the given runs into cells and renders three cards:
 * score distribution by model, reliability ranking by variable, and the
 * variance decomposition.
 *
 * Renders a single empty-state card when `runs` is empty.
 */
export default function Variability({ runs }: VariabilityProps) {
  // Called before the empty-state return, so the hook runs on every render.
  const cells = useMemo(() => groupIntoCells(runs), [runs]);

  if (runs.length === 0) {
    return (
      <div
        style={{
          ...cardStyle,
          textAlign: "center",
          padding: "40px",
          color: "var(--text-muted)",
        }}
      >
        No runs available for variability analysis.
      </div>
    );
  }

  return (
    <div>
      {/* Overall sample size subtitle */}
      <div style={{ fontSize: "10px", fontFamily: "'JetBrains Mono', monospace", color: "var(--text-muted, hsl(213 14% 65%))", marginBottom: "8px" }}>
        (n={runs.length} runs across {cells.length} cells)
      </div>
      {/* Section 1: Box Plots */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>CONSISTENCY</div>
        <div style={sectionTitleStyle}>Score Distribution by Model</div>
        <p style={sectionDescStyle}>
          Each dot is a cell (unique config). The box spans Q1-Q3; the line
          marks the median. Tighter boxes mean more consistent results across
          configs.
        </p>
        <BoxPlotSection cells={cells} />
      </div>

      {/* Section 2: Reliability Ranking */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>RELIABILITY</div>
        <div style={sectionTitleStyle}>Reliability Ranking by Variable</div>
        <p style={sectionDescStyle}>
          How much do repeat runs of the same config vary? Sorted by average
          range (smallest = most reliable). Green means scores are consistent
          across re-runs; red means volatile.
        </p>
        <ReliabilitySection cells={cells} />
      </div>

      {/* Section 3: Variance Decomposition */}
      <div style={cardStyle}>
        <div style={sectionHeaderStyle}>VARIANCE</div>
        <div style={sectionTitleStyle}>Variance Contribution</div>
        <p style={sectionDescStyle}>
          ANOVA-style decomposition: how much of the total score variance comes
          from config choices (between cells) vs run-to-run randomness (within
          cells)?
        </p>
        <VarianceSection runs={runs} cells={cells} />
      </div>
    </div>
  );
}
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README