loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 5e358b275032b8351588c74c53ef7c5853c1b8b4
parent 42135ccf8f0b74d916836155da84957a9875e4f3
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 07:50:46 +0200

Cell-based analytics across all dashboard views

Every dashboard view now aggregates by cell (a unique configuration, identified by cell_id) rather than by individual run.
Repeat runs within a cell show the variance/reliability of that config.

Analysis lib:
- Cell interface with avg/min/max for all metrics
- groupIntoCells() aggregation function
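- computeMainEffects now uses cell averages with variance tracking ("variance" here is the mean within-cell max-min range of the metric, not a statistical variance)
- computeInteraction includes the same range-based variance per cell combo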

Bar charts:
- Error bars showing min-max range of cell scores per model/task
- Labels show cell count (e.g., "haiku (n=19 cells)")

Tornado chart:
- Shaded variance bands behind effect bars
- Shows the ±variance percentage alongside the effect
- Count label reads "N cells" instead of "(n=N)"

Compare page:
- Cell count and run count columns
- Score and cost ranges (min-max across cells)
- Cell-first aggregation (average per cell, then across cells) prevents configs with more repeats from dominating the averages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Charts.tsx | 204 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M dashboard/src/components/TornadoChart.tsx | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/lib/analysis.ts | 147 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M dashboard/src/pages/compare.astro | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
4 files changed, 443 insertions(+), 105 deletions(-)

diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -8,6 +8,7 @@ import { ResponsiveContainer, Legend, Cell, + ErrorBar, } from "recharts"; import type { Run } from "../lib/types"; @@ -18,8 +19,24 @@ interface ChartsProps { interface ModelScore { model: string; avg_score: number; + min_score: number; + max_score: number; + errorRange: [number, number]; avg_cost: number; - count: number; + cellCount: number; +} + +interface TaskScore { + task: string; + avg_score: number; + min_score: number; + max_score: number; + scoreErrorRange: [number, number]; + pass_rate: number; + min_pass_rate: number; + max_pass_rate: number; + passRateErrorRange: [number, number]; + cellCount: number; } const SMUI = { @@ -53,70 +70,139 @@ const TOOLTIP_STYLE = { padding: "8px 12px", }; -function aggregateByModel(runs: Run[]): ModelScore[] { - const byModel: Record<string, { scores: number[]; costs: number[] }> = {}; +interface CellAggregate { + cell_id: string; + model: string; + task: string; + avgScore: number; + avgCost: number; + passRate: number; + runCount: number; +} + +function aggregateCells(runs: Run[]): CellAggregate[] { + const byCell: Record<string, { + model: string; + task: string; + scores: number[]; + costs: number[]; + passes: number; + total: number; + }> = {}; for (const run of runs) { - const model = run.meta.model; - if (!byModel[model]) byModel[model] = { scores: [], costs: [] }; + const cellId = run.meta.cell_id; + if (!byCell[cellId]) { + byCell[cellId] = { + model: run.meta.model, + task: run.meta.task, + scores: [], + costs: [], + passes: 0, + total: 0, + }; + } + byCell[cellId].total++; if (run.eval_results?.score != null) { - byModel[model].scores.push(run.eval_results.score); + byCell[cellId].scores.push(run.eval_results.score); } if (run.claude_output?.total_cost_usd != null) { - byModel[model].costs.push(run.claude_output.total_cost_usd); + byCell[cellId].costs.push(run.claude_output.total_cost_usd); + } + if 
(run.eval_results?.functional?.pass) { + byCell[cellId].passes++; } } - return Object.entries(byModel).map(([model, data]) => ({ - model: `${model} (n=${data.scores.length})`, - avg_score: data.scores.length > 0 - ? Math.round( - (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100 - ) + return Object.entries(byCell).map(([cell_id, data]) => ({ + cell_id, + model: data.model, + task: data.task, + avgScore: data.scores.length > 0 + ? data.scores.reduce((a, b) => a + b, 0) / data.scores.length : 0, - avg_cost: data.costs.length > 0 - ? Math.round( - (data.costs.reduce((a, b) => a + b, 0) / data.costs.length) * 100 - ) / 100 + avgCost: data.costs.length > 0 + ? data.costs.reduce((a, b) => a + b, 0) / data.costs.length : 0, - count: data.scores.length, + passRate: data.total > 0 + ? data.passes / data.total + : 0, + runCount: data.total, })); } -interface TaskScore { - task: string; - avg_score: number; - pass_rate: number; +function aggregateByModel(runs: Run[]): ModelScore[] { + const cells = aggregateCells(runs); + const byModel: Record<string, CellAggregate[]> = {}; + + for (const cell of cells) { + if (!byModel[cell.model]) byModel[cell.model] = []; + byModel[cell.model].push(cell); + } + + return Object.entries(byModel).map(([model, modelCells]) => { + const scores = modelCells.map((c) => Math.round(c.avgScore * 100)); + const costs = modelCells.map((c) => c.avgCost); + const avgScore = scores.length > 0 + ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length) + : 0; + const minScore = scores.length > 0 ? Math.min(...scores) : 0; + const maxScore = scores.length > 0 ? Math.max(...scores) : 0; + const avgCost = costs.length > 0 + ? 
Math.round((costs.reduce((a, b) => a + b, 0) / costs.length) * 100) / 100 + : 0; + + return { + model: `${model} (n=${modelCells.length} cells)`, + avg_score: avgScore, + min_score: minScore, + max_score: maxScore, + errorRange: [avgScore - minScore, maxScore - avgScore] as [number, number], + avg_cost: avgCost, + cellCount: modelCells.length, + }; + }); } function aggregateByTask(runs: Run[]): TaskScore[] { - const byTask: Record<string, { scores: number[]; passes: number; total: number }> = {}; + const cells = aggregateCells(runs); + const byTask: Record<string, CellAggregate[]> = {}; - for (const run of runs) { - const task = run.meta.task; - if (!byTask[task]) byTask[task] = { scores: [], passes: 0, total: 0 }; - - byTask[task].total++; - if (run.eval_results?.score != null) { - byTask[task].scores.push(run.eval_results.score); - } - if (run.eval_results?.functional?.pass) { - byTask[task].passes++; - } + for (const cell of cells) { + if (!byTask[cell.task]) byTask[cell.task] = []; + byTask[cell.task].push(cell); } - return Object.entries(byTask).map(([task, data]) => ({ - task: `${task} (n=${data.total})`, - avg_score: data.scores.length > 0 - ? Math.round( - (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100 - ) - : 0, - pass_rate: data.total > 0 - ? Math.round((data.passes / data.total) * 100) - : 0, - })); + return Object.entries(byTask).map(([task, taskCells]) => { + const scores = taskCells.map((c) => Math.round(c.avgScore * 100)); + const passRates = taskCells.map((c) => Math.round(c.passRate * 100)); + + const avgScore = scores.length > 0 + ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length) + : 0; + const minScore = scores.length > 0 ? Math.min(...scores) : 0; + const maxScore = scores.length > 0 ? Math.max(...scores) : 0; + + const avgPassRate = passRates.length > 0 + ? Math.round(passRates.reduce((a, b) => a + b, 0) / passRates.length) + : 0; + const minPassRate = passRates.length > 0 ? 
Math.min(...passRates) : 0; + const maxPassRate = passRates.length > 0 ? Math.max(...passRates) : 0; + + return { + task: `${task} (n=${taskCells.length} cells)`, + avg_score: avgScore, + min_score: minScore, + max_score: maxScore, + scoreErrorRange: [avgScore - minScore, maxScore - avgScore] as [number, number], + pass_rate: avgPassRate, + min_pass_rate: minPassRate, + max_pass_rate: maxPassRate, + passRateErrorRange: [avgPassRate - minPassRate, maxPassRate - avgPassRate] as [number, number], + cellCount: taskCells.length, + }; + }); } export default function Charts({ runs }: ChartsProps) { @@ -154,8 +240,17 @@ export default function Charts({ runs }: ChartsProps) { tickLine={false} axisLine={false} /> - <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} /> + <Tooltip + contentStyle={TOOLTIP_STYLE} + cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} + formatter={(value: number, name: string) => { + if (name === "Avg Score %") return [`${value}%`, name]; + return [value, name]; + }} + labelFormatter={(label: string) => label} + /> <Bar dataKey="avg_score" name="Avg Score %" radius={0}> + <ErrorBar dataKey="errorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} /> {modelData.map((entry) => { const baseModel = entry.model.split(" ")[0]; return <Cell key={entry.model} fill={MODEL_COLORS[baseModel] || SMUI.frost2} />; @@ -186,7 +281,14 @@ export default function Charts({ runs }: ChartsProps) { tickLine={false} axisLine={false} /> - <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} /> + <Tooltip + contentStyle={TOOLTIP_STYLE} + cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} + formatter={(value: number, name: string) => { + return [`${value}%`, name]; + }} + labelFormatter={(label: string) => label} + /> <Legend wrapperStyle={{ fontFamily: "'JetBrains Mono', monospace", @@ -195,8 +297,12 @@ export default function Charts({ runs }: ChartsProps) { letterSpacing: "0.5px", }} /> - <Bar dataKey="avg_score" 
fill={SMUI.frost2} name="Avg Score %" radius={0} /> - <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0} /> + <Bar dataKey="avg_score" fill={SMUI.frost2} name="Avg Score %" radius={0}> + <ErrorBar dataKey="scoreErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} /> + </Bar> + <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0}> + <ErrorBar dataKey="passRateErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} /> + </Bar> </BarChart> </ResponsiveContainer> </div> diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx @@ -41,8 +41,13 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) { ); } - const maxSpread = Math.max(...effects.map((e) => e.spread)); - const scale = maxSpread > 0 ? 200 / maxSpread : 1; // max bar width = 200px + // Scale must account for variance bands extending beyond effect bars + const maxExtent = Math.max( + ...effects.flatMap((e) => + e.values.map((v) => Math.abs(v.effect) + v.variance) + ) + ); + const scale = maxExtent > 0 ? 200 / maxExtent : 1; return ( <div className="card"> @@ -54,7 +59,8 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) { marginBottom: "16px", }} > - Sorted by effect size. Wider bars = bigger impact on outcomes. + Sorted by effect size. Solid bars show effect (deviation from grand + mean). Shaded bands show within-cell variance. 
</p> {effects.map((effect) => ( @@ -89,7 +95,9 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) { }} > {effect.values.map((entry) => { - const width = Math.abs(entry.effect) * scale; + const effectWidth = Math.abs(entry.effect) * scale; + const varianceBandWidth = + (Math.abs(entry.effect) + entry.variance) * scale; const isPositive = entry.effect >= 0; return ( <div @@ -114,34 +122,73 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) { </div> <div style={{ + position: "relative", height: "16px", - width: `${Math.max(width, 2)}px`, - background: isPositive - ? "var(--green)" - : "var(--red)", - borderRadius: "2px", - opacity: 0.8, + width: `${Math.max(varianceBandWidth, effectWidth, 2)}px`, }} - /> + > + {/* Variance band (behind, wider, semi-transparent) */} + {entry.variance > 0 && ( + <div + style={{ + position: "absolute", + top: "1px", + left: 0, + height: "14px", + width: `${Math.max(varianceBandWidth, 2)}px`, + background: isPositive + ? "var(--green)" + : "var(--red)", + opacity: 0.15, + borderRadius: "2px", + }} + /> + )} + {/* Effect bar (foreground, solid) */} + <div + style={{ + position: "absolute", + top: 0, + left: 0, + height: "16px", + width: `${Math.max(effectWidth, 2)}px`, + background: isPositive + ? "var(--green)" + : "var(--red)", + borderRadius: "2px", + opacity: 0.8, + }} + /> + </div> <div style={{ fontSize: "0.7rem", fontFamily: "var(--font-mono)", - color: isPositive - ? "var(--green)" - : "var(--red)", + color: isPositive ? "var(--green)" : "var(--red)", + whiteSpace: "nowrap", }} > {entry.effect >= 0 ? 
"+" : ""} {(entry.effect * 100).toFixed(1)}% + {entry.variance > 0 && ( + <span + style={{ + color: "var(--text-muted)", + marginLeft: "4px", + }} + > + ±{(entry.variance * 100).toFixed(1)}% + </span> + )} </div> <div style={{ fontSize: "0.65rem", color: "var(--text-muted)", + whiteSpace: "nowrap", }} > - (n={entry.n}) + {entry.n} cells </div> </div> ); diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts @@ -1,10 +1,24 @@ import type { Run, AxisName } from "./types"; import { AXIS_NAMES } from "./types"; +export interface Cell { + cell_id: string; + runs: Run[]; + meta: Run["meta"]; // from first run + n: number; + score: { avg: number; min: number; max: number; range: number }; + cost: { avg: number; min: number; max: number }; + turns: { avg: number; min: number; max: number }; + wall_time: { avg: number; min: number; max: number }; + gameplay: { avg: number; min: number; max: number }; + code_quality: { avg: number; min: number; max: number }; +} + export interface EffectEntry { value: string; mean: number; effect: number; + variance: number; n: number; } @@ -16,6 +30,7 @@ export interface AxisEffect { export interface InteractionCell { mean: number; + variance: number; n: number; } @@ -55,6 +70,56 @@ const METRICS: Record<string, MetricExtractor> = { transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? 
null, }; +function agg(values: number[]): { avg: number; min: number; max: number } { + if (values.length === 0) return { avg: 0, min: 0, max: 0 }; + const sum = values.reduce((a, b) => a + b, 0); + return { + avg: sum / values.length, + min: Math.min(...values), + max: Math.max(...values), + }; +} + +export function groupIntoCells(runs: Run[]): Cell[] { + const byCell = new Map<string, Run[]>(); + for (const run of runs) { + const id = run.meta.cell_id; + const list = byCell.get(id); + if (list) list.push(run); + else byCell.set(id, [run]); + } + + const cells: Cell[] = []; + for (const [cell_id, cellRuns] of byCell) { + const extractVals = (extractor: MetricExtractor): number[] => { + const vals: number[] = []; + for (const r of cellRuns) { + const v = extractor(r); + if (v !== null) vals.push(v); + } + return vals; + }; + + const scoreVals = extractVals(METRICS.score); + const scoreAgg = agg(scoreVals); + + cells.push({ + cell_id, + runs: cellRuns, + meta: cellRuns[0].meta, + n: cellRuns.length, + score: { ...scoreAgg, range: scoreAgg.max - scoreAgg.min }, + cost: agg(extractVals(METRICS.cost)), + turns: agg(extractVals(METRICS.turns)), + wall_time: agg(extractVals(METRICS.wall_time)), + gameplay: agg(extractVals(METRICS.gameplay)), + code_quality: agg(extractVals(METRICS.code_quality)), + }); + } + + return cells; +} + export function computeMainEffects( runs: Run[], metric: string = "score" @@ -62,14 +127,24 @@ export function computeMainEffects( const extract = METRICS[metric]; if (!extract) return []; - const scored: Array<{ meta: Run["meta"]; value: number }> = []; - for (const run of runs) { - const val = extract(run); - if (val !== null) scored.push({ meta: run.meta, value: val }); + const cells = groupIntoCells(runs); + + // Compute per-cell metric averages and ranges + const scored: Array<{ meta: Run["meta"]; avg: number; range: number }> = []; + for (const cell of cells) { + const vals: number[] = []; + for (const run of cell.runs) { + const v = 
extract(run); + if (v !== null) vals.push(v); + } + if (vals.length === 0) continue; + const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length; + const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0; + scored.push({ meta: cell.meta, avg: cellAvg, range: cellRange }); } if (scored.length === 0) return []; - const grandMean = scored.reduce((s, r) => s + r.value, 0) / scored.length; + const grandMean = scored.reduce((s, c) => s + c.avg, 0) / scored.length; // Find axis keys from meta const axisKeys = Object.keys(scored[0].meta).filter( @@ -79,22 +154,26 @@ export function computeMainEffects( const effects: AxisEffect[] = []; for (const axis of axisKeys) { - const groups: Record<string, number[]> = {}; - for (const { meta, value } of scored) { + const groups: Record<string, { avgs: number[]; ranges: number[] }> = {}; + for (const { meta, avg, range } of scored) { const key = String((meta as Record<string, unknown>)[axis] ?? "unknown"); - (groups[key] ??= []).push(value); + const g = groups[key] ??= { avgs: [], ranges: [] }; + g.avgs.push(avg); + g.ranges.push(range); } if (Object.keys(groups).length < 2) continue; const values: EffectEntry[] = []; - for (const [val, vals] of Object.entries(groups)) { - const mean = vals.reduce((a, b) => a + b, 0) / vals.length; + for (const [val, { avgs, ranges }] of Object.entries(groups)) { + const mean = avgs.reduce((a, b) => a + b, 0) / avgs.length; + const variance = ranges.reduce((a, b) => a + b, 0) / ranges.length; values.push({ value: val, mean: Math.round(mean * 10000) / 10000, effect: Math.round((mean - grandMean) * 10000) / 10000, - n: vals.length, + variance: Math.round(variance * 10000) / 10000, + n: avgs.length, }); } @@ -121,31 +200,49 @@ export function computeInteraction( if (!extract) return { axisA, axisB, table: {}, maxInteraction: 0 }; - const groups: Record<string, Record<string, number[]>> = {}; + const cells = groupIntoCells(runs); - for (const run of runs) { - const val = 
extract(run); - if (val === null) continue; - const a = String((run.meta as Record<string, unknown>)[axisA] ?? "?"); - const b = String((run.meta as Record<string, unknown>)[axisB] ?? "?"); - ((groups[a] ??= {})[b] ??= []).push(val); + // Group cells by (axisA, axisB) combination + const groups: Record<string, Record<string, { avgs: number[]; ranges: number[] }>> = {}; + + for (const cell of cells) { + const vals: number[] = []; + for (const run of cell.runs) { + const v = extract(run); + if (v !== null) vals.push(v); + } + if (vals.length === 0) continue; + + const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length; + const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0; + + const a = String((cell.meta as Record<string, unknown>)[axisA] ?? "?"); + const b = String((cell.meta as Record<string, unknown>)[axisB] ?? "?"); + const g = ((groups[a] ??= {})[b] ??= { avgs: [], ranges: [] }); + g.avgs.push(cellAvg); + g.ranges.push(cellRange); } const table: Record<string, Record<string, InteractionCell>> = {}; - const allVals: number[] = []; + const allMeans: number[] = []; for (const [a, bGroups] of Object.entries(groups)) { table[a] = {}; - for (const [b, vals] of Object.entries(bGroups)) { - const mean = vals.reduce((s, v) => s + v, 0) / vals.length; - table[a][b] = { mean: Math.round(mean * 10000) / 10000, n: vals.length }; - allVals.push(mean); + for (const [b, { avgs, ranges }] of Object.entries(bGroups)) { + const mean = avgs.reduce((s, v) => s + v, 0) / avgs.length; + const variance = ranges.reduce((s, v) => s + v, 0) / ranges.length; + table[a][b] = { + mean: Math.round(mean * 10000) / 10000, + variance: Math.round(variance * 10000) / 10000, + n: avgs.length, + }; + allMeans.push(mean); } } const grandMean = - allVals.length > 0 - ? allVals.reduce((a, b) => a + b, 0) / allVals.length + allMeans.length > 0 + ? 
allMeans.reduce((a, b) => a + b, 0) / allMeans.length : 0; // Row and column means diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro @@ -1,23 +1,80 @@ --- import Base from "../layouts/Base.astro"; -import { loadAllRuns, getAxisValues, getTaskNames, aggregateRuns, AXIS_NAMES } from "../lib/data"; +import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES } from "../lib/data"; import type { Run, AxisName } from "../lib/data"; const runs = loadAllRuns(); const axisValues = getAxisValues(runs); const tasks = getTaskNames(runs); -// Build comparison data: for each axis, show how different values perform +// Build comparison data using cell-based aggregation. +// A "cell" is a unique configuration (cell_id). Multiple runs share a cell_id +// when they are repeat trials of the same config. Averaging per-cell first, +// then aggregating across cells, prevents configs with more repeats from +// dominating the average. + interface ComparisonRow { axis: string; value: string; - count: number; + cells: number; // number of unique configs + runs: number; // total runs avg_score: string; - pass_rate: string; + score_range: string; // "68%-80%" avg_cost: string; + cost_range: string; // "$0.15-$0.22" avg_time: string; } +interface CellStats { + avg_score: number | null; + avg_cost: number | null; + avg_time: number | null; + run_count: number; +} + +/** Compute per-cell averages from a list of runs. 
*/ +function getCellStats(runs: Run[]): Map<string, CellStats> { + const cells = new Map<string, Run[]>(); + for (const run of runs) { + const id = run.meta.cell_id; + if (!cells.has(id)) cells.set(id, []); + cells.get(id)!.push(run); + } + + const result = new Map<string, CellStats>(); + for (const [cellId, cellRuns] of cells) { + const scores = cellRuns + .map((r) => r.eval_results?.score) + .filter((s): s is number => s != null); + const costs = cellRuns + .map((r) => r.claude_output?.total_cost_usd) + .filter((c): c is number => c != null); + const times = cellRuns + .map((r) => r.meta.wall_time_seconds) + .filter((t): t is number => t != null); + + const avg = (arr: number[]) => + arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null; + + result.set(cellId, { + avg_score: avg(scores), + avg_cost: avg(costs), + avg_time: avg(times), + run_count: cellRuns.length, + }); + } + return result; +} + +function formatRange(values: number[], formatter: (v: number) => string): string { + if (values.length === 0) return "-"; + if (values.length === 1) return formatter(values[0]); + const min = Math.min(...values); + const max = Math.max(...values); + if (min === max) return formatter(min); + return formatter(min) + "-" + formatter(max); +} + const comparisons: ComparisonRow[] = []; const AXIS_LABELS: Record<AxisName, string> = { @@ -39,23 +96,50 @@ const AXIS_LABELS: Record<AxisName, string> = { max_budget: "Budget", }; +// Pre-compute all cell stats once +const allCellStats = getCellStats(runs); + for (const axis of AXIS_NAMES) { for (const value of axisValues[axis]) { const filtered = runs.filter( (r: Run) => String(r.meta[axis as keyof typeof r.meta]) === value ); - const stats = aggregateRuns(filtered); + + // Find the unique cell_ids in these runs and gather their stats + const cellIds = new Set(filtered.map((r) => r.meta.cell_id)); + const matchingCells: CellStats[] = []; + for (const id of cellIds) { + const cs = allCellStats.get(id); + if (cs) 
matchingCells.push(cs); + } + + const cellScores = matchingCells + .map((c) => c.avg_score) + .filter((s): s is number => s != null); + const cellCosts = matchingCells + .map((c) => c.avg_cost) + .filter((c): c is number => c != null); + const cellTimes = matchingCells + .map((c) => c.avg_time) + .filter((t): t is number => t != null); + + const avg = (arr: number[]) => + arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null; + + const avgScore = avg(cellScores); + const avgCost = avg(cellCosts); + const avgTime = avg(cellTimes); + comparisons.push({ axis: AXIS_LABELS[axis], value, - count: stats.count, - avg_score: - stats.avg_score != null ? (stats.avg_score * 100).toFixed(0) + "%" : "-", - pass_rate: - stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-", - avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-", - avg_time: - stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-", + cells: cellIds.size, + runs: filtered.length, + avg_score: avgScore != null ? (avgScore * 100).toFixed(0) + "%" : "-", + score_range: formatRange(cellScores, (v) => (v * 100).toFixed(0) + "%"), + avg_cost: avgCost != null ? "$" + avgCost.toFixed(2) : "-", + cost_range: formatRange(cellCosts, (v) => "$" + v.toFixed(2)), + avg_time: avgTime != null ? 
Math.round(avgTime) + "s" : "-", }); } } @@ -78,10 +162,12 @@ for (const axis of AXIS_NAMES) { <tr> <th>Axis</th> <th>Value</th> + <th>Cells</th> <th>Runs</th> <th>Avg Score</th> - <th>Pass Rate</th> + <th>Score Range</th> <th>Avg Cost</th> + <th>Cost Range</th> <th>Avg Time</th> </tr> </thead> @@ -92,10 +178,12 @@ for (const axis of AXIS_NAMES) { <td> <span class="badge badge-neutral">{row.value}</span> </td> - <td>{row.count}</td> + <td>{row.cells}</td> + <td>{row.runs}</td> <td class="score-cell">{row.avg_score}</td> - <td class="score-cell">{row.pass_rate}</td> + <td style="color: var(--text-muted); font-size: 0.85rem;">{row.score_range}</td> <td>{row.avg_cost}</td> + <td style="color: var(--text-muted); font-size: 0.85rem;">{row.cost_range}</td> <td>{row.avg_time}</td> </tr> ))}

Impressum · Datenschutz