loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

StatisticalPowerCard.tsx (4073B)


      1 import { useMemo } from "react";
      2 import type { Run } from "../lib/types";
      3 import { groupIntoCells, confidenceInterval } from "../lib/analysis";
      4 
      5 interface StatisticalPowerCardProps {
      6   runs: Run[];
      7 }
      8 
      9 const SMUI = {
     10   surface1: "hsl(217 16% 15.5%)",
     11   surface2: "hsl(216 15% 19%)",
     12   border: "hsl(217 17% 28%)",
     13   muted: "hsl(213 14% 65%)",
     14   green: "hsl(92 28% 65%)",
     15   yellow: "hsl(40 71% 73%)",
     16   red: "hsl(355 52% 64%)",
     17 };
     18 
     19 export default function StatisticalPowerCard({ runs }: StatisticalPowerCardProps) {
     20   const stats = useMemo(() => {
     21     const cells = groupIntoCells(runs);
     22     const totalRuns = runs.length;
     23     const totalCells = cells.length;
     24     if (totalCells === 0) return null;
     25 
     26     const avgRunsPerCell = totalRuns / totalCells;
     27 
     28     // Compute CI width for each cell with 3+ runs
     29     const ciWidths: number[] = [];
     30     for (const cell of cells) {
     31       const scores = cell.runs
     32         .map((r) => r.eval_results?.score)
     33         .filter((s): s is number => s != null)
     34         .map((s) => s * 100);
     35       if (scores.length >= 2) {
     36         const ci = confidenceInterval(scores);
     37         const width = ci.upper - ci.lower;
     38         if (isFinite(width)) ciWidths.push(width);
     39       }
     40     }
     41 
     42     const avgCiWidth = ciWidths.length > 0
     43       ? ciWidths.reduce((a, b) => a + b, 0) / ciWidths.length
     44       : null;
     45 
     46     // Minimum detectable effect = largest CI half-width across cells
     47     const minDetectable = ciWidths.length > 0
     48       ? Math.max(...ciWidths) / 2
     49       : null;
     50 
     51     return { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable };
     52   }, [runs]);
     53 
     54   if (!stats) return null;
     55 
     56   const { totalRuns, totalCells, avgRunsPerCell, avgCiWidth, minDetectable } = stats;
     57 
     58   // Color indicator based on average CI width
     59   let statusColor = SMUI.green;
     60   let statusLabel = "Strong";
     61   if (avgCiWidth == null) {
     62     statusColor = SMUI.muted;
     63     statusLabel = "Insufficient data";
     64   } else if (avgCiWidth > 10) {
     65     statusColor = SMUI.red;
     66     statusLabel = "Low power";
     67   } else if (avgCiWidth > 5) {
     68     statusColor = SMUI.yellow;
     69     statusLabel = "Moderate";
     70   }
     71 
     72   return (
     73     <div
     74       className="card"
     75       style={{
     76         padding: "16px",
     77         marginBottom: "16px",
     78       }}
     79     >
     80       <div style={{ display: "flex", alignItems: "center", gap: "12px", marginBottom: "12px" }}>
     81         <h3 style={{ margin: 0 }}>Statistical Power</h3>
     82         <span
     83           style={{
     84             fontSize: "10px",
     85             fontFamily: "'JetBrains Mono', monospace",
     86             fontWeight: 600,
     87             color: statusColor,
     88             border: `1px solid ${statusColor}`,
     89             padding: "2px 8px",
     90             letterSpacing: "0.5px",
     91             textTransform: "uppercase",
     92           }}
     93         >
     94           {statusLabel}
     95         </span>
     96       </div>
     97       <div
     98         style={{
     99           display: "flex",
    100           gap: "24px",
    101           flexWrap: "wrap",
    102           fontSize: "13px",
    103           fontFamily: "'JetBrains Mono', monospace",
    104         }}
    105       >
    106         <div>
    107           <span style={{ color: SMUI.muted }}>runs </span>
    108           <span style={{ fontWeight: 600 }}>{totalRuns}</span>
    109         </div>
    110         <div>
    111           <span style={{ color: SMUI.muted }}>cells </span>
    112           <span style={{ fontWeight: 600 }}>{totalCells}</span>
    113         </div>
    114         <div>
    115           <span style={{ color: SMUI.muted }}>avg runs/cell </span>
    116           <span style={{ fontWeight: 600 }}>{avgRunsPerCell.toFixed(1)}</span>
    117         </div>
    118         {avgCiWidth != null && (
    119           <div>
    120             <span style={{ color: SMUI.muted }}>avg 95% CI </span>
    121             <span style={{ fontWeight: 600, color: statusColor }}>
    122               ±{(avgCiWidth / 2).toFixed(1)}%
    123             </span>
    124           </div>
    125         )}
    126         {minDetectable != null && (
    127           <div>
    128             <span style={{ color: SMUI.muted }}>min detectable effect </span>
    129             <span style={{ fontWeight: 600, color: statusColor }}>
    130               ±{minDetectable.toFixed(1)}%
    131             </span>
    132           </div>
    133         )}
    134       </div>
    135     </div>
    136   );
    137 }

Impressum · Datenschutz