loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

Grid.tsx (14070B)


      1 import { useState, useMemo } from "react";
      2 import type { Run, AxisName } from "../lib/types";
      3 import Filters from "./Filters";
      4 
      5 interface GridProps {
      6   runs: Run[];
      7   axisValues: Record<AxisName, string[]>;
      8   tasks: string[];
      9 }
     10 
     11 function scoreClass(score: number | null | undefined): string {
     12   if (score === null || score === undefined) return "";
     13   if (score >= 0.7) return "score-high";
     14   if (score >= 0.4) return "score-mid";
     15   return "score-low";
     16 }
     17 
     18 function formatScore(score: number | null | undefined): string {
     19   if (score === null || score === undefined) return "-";
     20   return (score * 100).toFixed(0) + "%";
     21 }
     22 
     23 function formatCost(cost: number | null | undefined): string {
     24   if (cost === null || cost === undefined) return "-";
     25   return "$" + cost.toFixed(2);
     26 }
     27 
     28 function formatRunId(run: Run): React.ReactNode {
     29   const m = run.meta;
     30   return (
     31     <span style={{ display: "inline-flex", gap: "4px", alignItems: "center", flexWrap: "wrap" }}>
     32       <span className="badge badge-neutral" style={{ fontSize: "0.7rem" }}>{m.task}</span>
     33       <span style={{ color: "var(--text-muted)", fontSize: "0.7rem" }}>
     34         {m.actual_model || m.model} {m.prompt_style} {m.language}
     35       </span>
     36     </span>
     37   );
     38 }
     39 
     40 function formatTime(seconds: number | null | undefined): string {
     41   if (seconds === null || seconds === undefined) return "-";
     42   if (seconds < 60) return seconds + "s";
     43   return Math.floor(seconds / 60) + "m " + (seconds % 60) + "s";
     44 }
     45 
     46 type SortKey = "task" | "model" | "effort" | "prompt" | "lang" | "score" | "cost" | "time" | "turns";
     47 
     48 function getSortValue(run: Run, key: SortKey): string | number {
     49   switch (key) {
     50     case "task": return run.meta.task;
     51     case "model": return run.meta.actual_model || run.meta.model;
     52     case "effort": return run.meta.effort;
     53     case "prompt": return run.meta.prompt_style;
     54     case "lang": return run.meta.language;
     55     case "score": return run.eval_results?.score ?? -1;
     56     case "cost": return run.claude_output?.total_cost_usd ?? -1;
     57     case "time": return run.meta.wall_time_seconds ?? -1;
     58     case "turns": return run.claude_output?.num_turns ?? -1;
     59   }
     60 }
     61 
     62 interface CellGroup {
     63   cell_id: string;
     64   runs: Run[];
     65   meta: Run["meta"];
     66   scores: number[];
     67   costs: number[];
     68   times: number[];
     69   turns: number[];
     70   avg: { score: number; cost: number; time: number; turns: number };
     71   min: { score: number; cost: number };
     72   max: { score: number; cost: number };
     73 }
     74 
     75 function groupByCellId(runs: Run[]): CellGroup[] {
     76   const groups: Record<string, Run[]> = {};
     77   for (const run of runs) {
     78     const cellId = run.meta.cell_id;
     79     (groups[cellId] ??= []).push(run);
     80   }
     81   return Object.entries(groups).map(([cell_id, cellRuns]) => {
     82     const scores = cellRuns.map(r => r.eval_results?.score).filter((s): s is number => s != null);
     83     const costs = cellRuns.map(r => r.claude_output?.total_cost_usd).filter((c): c is number => c != null);
     84     const times = cellRuns.map(r => r.meta.wall_time_seconds).filter((t): t is number => t != null);
     85     const turnsList = cellRuns.map(r => r.claude_output?.num_turns).filter((t): t is number => t != null);
     86     const avg = (arr: number[]) => arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : 0;
     87     return {
     88       cell_id,
     89       runs: cellRuns,
     90       meta: cellRuns[0].meta,
     91       scores, costs, times, turns: turnsList,
     92       avg: { score: avg(scores), cost: avg(costs), time: avg(times), turns: avg(turnsList) },
     93       min: { score: scores.length > 0 ? Math.min(...scores) : 0, cost: costs.length > 0 ? Math.min(...costs) : 0 },
     94       max: { score: scores.length > 0 ? Math.max(...scores) : 0, cost: costs.length > 0 ? Math.max(...costs) : 0 },
     95     };
     96   });
     97 }
     98 
     99 function RangeCell({ min, max, avg, format }: { min: number; max: number; avg: number; format: (v: number) => string }) {
    100   const spread = max - min;
    101   const isWide = format === formatScore ? spread > 0.1 : spread > avg * 0.3;
    102   return (
    103     <span style={{ fontFamily: "var(--font-mono)" }}>
    104       <span style={{ fontWeight: 600 }}>{format(avg)}</span>
    105       {min !== max && (
    106         <span style={{ fontSize: "0.65rem", color: isWide ? "var(--yellow)" : "var(--text-muted)", marginLeft: "4px" }}>
    107           {format(min)}-{format(max)}
    108         </span>
    109       )}
    110     </span>
    111   );
    112 }
    113 
    114 export default function Grid({ runs, axisValues, tasks }: GridProps) {
    115   const [filters, setFilters] = useState<Record<string, string>>({});
    116   const [sortKey, setSortKey] = useState<SortKey>("score");
    117   const [sortAsc, setSortAsc] = useState(false);
    118   const [grouped, setGrouped] = useState(true);
    119 
    120   const handleSort = (key: SortKey) => {
    121     if (sortKey === key) {
    122       setSortAsc(!sortAsc);
    123     } else {
    124       setSortKey(key);
    125       setSortAsc(false);
    126     }
    127   };
    128 
    129   const filteredRuns = useMemo(() => {
    130     const filtered = runs.filter((run) => {
    131       for (const [key, value] of Object.entries(filters)) {
    132         if (key === "task") {
    133           if (run.meta.task !== value) return false;
    134         } else {
    135           if (String(run.meta[key as keyof typeof run.meta]) !== value)
    136             return false;
    137         }
    138       }
    139       return true;
    140     });
    141 
    142     return filtered.sort((a, b) => {
    143       const va = getSortValue(a, sortKey);
    144       const vb = getSortValue(b, sortKey);
    145       const cmp = va < vb ? -1 : va > vb ? 1 : 0;
    146       return sortAsc ? cmp : -cmp;
    147     });
    148   }, [runs, filters, sortKey, sortAsc]);
    149 
    150   const cellGroups = useMemo(() => {
    151     const groups = groupByCellId(filteredRuns);
    152     return groups.sort((a, b) => {
    153       const va = sortKey === "score" ? a.avg.score : sortKey === "cost" ? a.avg.cost : sortKey === "time" ? a.avg.time : sortKey === "turns" ? a.avg.turns : 0;
    154       const vb = sortKey === "score" ? b.avg.score : sortKey === "cost" ? b.avg.cost : sortKey === "time" ? b.avg.time : sortKey === "turns" ? b.avg.turns : 0;
    155       const cmp = va < vb ? -1 : va > vb ? 1 : 0;
    156       return sortAsc ? cmp : -cmp;
    157     });
    158   }, [filteredRuns, sortKey, sortAsc]);
    159 
    160   return (
    161     <div>
    162       <div style={{ display: "flex", justifyContent: "space-between", alignItems: "center", marginBottom: "12px" }}>
    163         <Filters
    164           axisValues={axisValues}
    165           tasks={tasks}
    166           onFilterChange={setFilters}
    167         />
    168         <button
    169           onClick={() => setGrouped(!grouped)}
    170           style={{
    171             padding: "4px 10px", fontSize: "11px", fontFamily: "var(--font-mono)",
    172             background: "transparent", border: "1px solid hsl(var(--border))",
    173             color: "hsl(var(--foreground))", cursor: "pointer", textTransform: "uppercase",
    174             letterSpacing: "0.5px", flexShrink: 0,
    175           }}
    176         >
    177           {grouped ? "Show individual runs" : "Group by config"}
    178         </button>
    179       </div>
    180 
    181       <div className="card" style={{ overflowX: "auto" }}>
    182         <table>
    183           <thead>
    184             <tr>
    185               <th>Run ID</th>
    186               {(["task", "model", "effort", "prompt", "lang", "score"] as SortKey[]).map((key) => {
    187                 const labels: Record<SortKey, string> = {
    188                   task: "Task", model: "Model", effort: "Effort", prompt: "Prompt",
    189                   lang: "Lang", score: "Score", cost: "Cost", time: "Time", turns: "Turns",
    190                 };
    191                 return (
    192                   <th
    193                     key={key}
    194                     onClick={() => handleSort(key)}
    195                     style={{ cursor: "pointer", userSelect: "none" }}
    196                   >
    197                     {labels[key]} {sortKey === key ? (sortAsc ? "\u25B2" : "\u25BC") : ""}
    198                   </th>
    199                 );
    200               })}
    201               <th>Pass</th>
    202               {(["cost", "time", "turns"] as SortKey[]).map((key) => {
    203                 const labels: Record<SortKey, string> = {
    204                   task: "Task", model: "Model", effort: "Effort", prompt: "Prompt",
    205                   lang: "Lang", score: "Score", cost: "Cost", time: "Time", turns: "Turns",
    206                 };
    207                 return (
    208                   <th
    209                     key={key}
    210                     onClick={() => handleSort(key)}
    211                     style={{ cursor: "pointer", userSelect: "none" }}
    212                   >
    213                     {labels[key]} {sortKey === key ? (sortAsc ? "\u25B2" : "\u25BC") : ""}
    214                   </th>
    215                 );
    216               })}
    217             </tr>
    218           </thead>
    219           <tbody>
    220             {grouped ? (
    221               cellGroups.length === 0 ? (
    222                 <tr>
    223                   <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
    224                     {runs.length === 0 ? "No benchmark results yet." : "No results match the current filters."}
    225                   </td>
    226                 </tr>
    227               ) : (
    228                 cellGroups.map((g) => (
    229                   <tr key={g.cell_id}>
    230                     <td>
    231                       <div style={{ fontSize: "0.75rem", display: "flex", alignItems: "center", gap: "6px" }}>
    232                         <a href={`/c/${g.runs[0]?.meta.short_cell_id || g.cell_id}`} style={{ color: "var(--accent)", fontSize: "0.65rem", textTransform: "uppercase", letterSpacing: "0.5px", opacity: 0.7 }} title="View cell detail">cell</a>
    233                         <span style={{ color: "var(--text-muted)", fontSize: "0.6rem", fontFamily: "var(--font-mono)", opacity: g.runs.length < 3 ? 0.5 : 0.8 }}>n={g.runs.length}</span>
    234                         {g.runs.map((r, i) => (
    235                           <span key={r.meta.run_id}>
    236                             {i > 0 && " "}
    237                             <a href={`/r/${r.meta.short_id || r.meta.run_id}`} style={{ color: "var(--accent)" }}>#{r.meta.run_number}</a>
    238                           </span>
    239                         ))}
    240                       </div>
    241                     </td>
    242                     <td>
    243                       {g.meta.task}
    244                       {g.runs.some(r => {
    245                         const cost = r.claude_output?.total_cost_usd ?? 0;
    246                         const budget = r.meta.max_budget_usd ?? 0;
    247                         return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124;
    248                       }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>}
    249                     </td>
    250                     <td><span className="badge badge-neutral">{g.meta.actual_model || g.meta.model}</span></td>
    251                     <td>{g.meta.effort}</td>
    252                     <td>{g.meta.prompt_style}</td>
    253                     <td>{g.meta.language}</td>
    254                     <td className={`score-cell ${scoreClass(g.avg.score)}`}>
    255                       <RangeCell min={g.min.score} max={g.max.score} avg={g.avg.score} format={formatScore} />
    256                     </td>
    257                     <td>
    258                       {(() => {
    259                         const passes = g.runs.filter(r => r.eval_results?.functional?.pass === true).length;
    260                         const fails = g.runs.filter(r => r.eval_results?.functional?.pass === false).length;
    261                         const total = g.runs.length;
    262                         if (passes === total) return <span className="badge badge-pass">{passes}/{total}</span>;
    263                         if (fails === total) return <span className="badge badge-fail">0/{total}</span>;
    264                         return <span className="badge badge-neutral">{passes}/{total}</span>;
    265                       })()}
    266                     </td>
    267                     <td>
    268                       <RangeCell min={g.min.cost} max={g.max.cost} avg={g.avg.cost} format={formatCost} />
    269                     </td>
    270                     <td>{formatTime(Math.round(g.avg.time))}</td>
    271                     <td>{Math.round(g.avg.turns)}</td>
    272                   </tr>
    273                 ))
    274               )
    275             ) : (
    276               filteredRuns.length === 0 ? (
    277                 <tr>
    278                   <td colSpan={11} style={{ textAlign: "center", color: "var(--text-muted)", padding: "40px" }}>
    279                     {runs.length === 0 ? "No benchmark results yet." : "No results match the current filters."}
    280                   </td>
    281                 </tr>
    282               ) : (
    283                 filteredRuns.map((run) => (
    284                   <tr key={run.meta.run_id}>
    285                     <td>
    286                       <a href={`/r/${run.meta.short_id || run.meta.run_id}`} style={{ fontSize: "0.75rem" }}>
    287                         {formatRunId(run)}
    288                       </a>
    289                     </td>
    290                     <td>{run.meta.task}</td>
    291                     <td><span className="badge badge-neutral">{run.meta.actual_model || run.meta.model}</span></td>
    292                     <td>{run.meta.effort}</td>
    293                     <td>{run.meta.prompt_style}</td>
    294                     <td>{run.meta.language}</td>
    295                     <td className={`score-cell ${scoreClass(run.eval_results?.score)}`}>
    296                       {formatScore(run.eval_results?.score)}
    297                     </td>
    298                     <td>
    299                       {run.eval_results?.functional?.pass === true ? (
    300                         <span className="badge badge-pass">PASS</span>
    301                       ) : run.eval_results?.functional?.pass === false ? (
    302                         <span className="badge badge-fail">FAIL</span>
    303                       ) : (
    304                         <span className="badge badge-neutral">-</span>
    305                       )}
    306                     </td>
    307                     <td>{formatCost(run.claude_output?.total_cost_usd)}</td>
    308                     <td>{formatTime(run.meta.wall_time_seconds)}</td>
    309                     <td>{run.claude_output?.num_turns ?? "-"}</td>
    310                   </tr>
    311                 ))
    312               )
    313             )}
    314           </tbody>
    315         </table>
    316         <div style={{ padding: "12px", color: "var(--text-muted)", fontSize: "0.75rem" }}>
    317           {grouped
    318             ? `${cellGroups.length} configs (${filteredRuns.length} runs)`
    319             : `${filteredRuns.length} of ${runs.length} runs`}
    320         </div>
    321       </div>
    322     </div>
    323   );
    324 }

Impressum · Datenschutz