commit 5e358b275032b8351588c74c53ef7c5853c1b8b4
parent 42135ccf8f0b74d916836155da84957a9875e4f3
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 07:50:46 +0200
Cell-based analytics across all dashboard views
Everything now aggregates by cell (config) not by individual run.
Runs within a cell show variance/reliability of that config.
Analysis lib:
- Cell interface with avg/min/max for all metrics
- groupIntoCells() aggregation function
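- computeMainEffects now averages per cell first and tracks "variance", i.e. the mean within-cell min-max range of the metric (not a statistical variance)
- computeInteraction reports the same range-based variance per cell combo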
Bar charts:
- Error bars showing min-max range of cell scores per model/task
- Labels show cell count (e.g., "haiku (n=19 cells)")
Tornado chart:
- Shaded variance bands behind effect bars
- Shows +/-variance percentage alongside effect
- "N cells" instead of "n=N"
Compare page:
- Cell count and run count columns
- Score and cost ranges (min-max across cells); the Score Range column replaces the former Pass Rate column
- Cell-first aggregation prevents configs with more repeats from dominating
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
4 files changed, 443 insertions(+), 105 deletions(-)
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -8,6 +8,7 @@ import {
ResponsiveContainer,
Legend,
Cell,
+ ErrorBar,
} from "recharts";
import type { Run } from "../lib/types";
@@ -18,8 +19,24 @@ interface ChartsProps {
interface ModelScore {
model: string;
avg_score: number;
+ min_score: number;
+ max_score: number;
+ errorRange: [number, number];
avg_cost: number;
- count: number;
+ cellCount: number;
+}
+
+interface TaskScore {
+ task: string;
+ avg_score: number;
+ min_score: number;
+ max_score: number;
+ scoreErrorRange: [number, number];
+ pass_rate: number;
+ min_pass_rate: number;
+ max_pass_rate: number;
+ passRateErrorRange: [number, number];
+ cellCount: number;
}
const SMUI = {
@@ -53,70 +70,139 @@ const TOOLTIP_STYLE = {
padding: "8px 12px",
};
-function aggregateByModel(runs: Run[]): ModelScore[] {
- const byModel: Record<string, { scores: number[]; costs: number[] }> = {};
+interface CellAggregate {
+ cell_id: string;
+ model: string;
+ task: string;
+ avgScore: number;
+ avgCost: number;
+ passRate: number;
+ runCount: number;
+}
+
+function aggregateCells(runs: Run[]): CellAggregate[] {
+ const byCell: Record<string, {
+ model: string;
+ task: string;
+ scores: number[];
+ costs: number[];
+ passes: number;
+ total: number;
+ }> = {};
for (const run of runs) {
- const model = run.meta.model;
- if (!byModel[model]) byModel[model] = { scores: [], costs: [] };
+ const cellId = run.meta.cell_id;
+ if (!byCell[cellId]) {
+ byCell[cellId] = {
+ model: run.meta.model,
+ task: run.meta.task,
+ scores: [],
+ costs: [],
+ passes: 0,
+ total: 0,
+ };
+ }
+ byCell[cellId].total++;
if (run.eval_results?.score != null) {
- byModel[model].scores.push(run.eval_results.score);
+ byCell[cellId].scores.push(run.eval_results.score);
}
if (run.claude_output?.total_cost_usd != null) {
- byModel[model].costs.push(run.claude_output.total_cost_usd);
+ byCell[cellId].costs.push(run.claude_output.total_cost_usd);
+ }
+ if (run.eval_results?.functional?.pass) {
+ byCell[cellId].passes++;
}
}
- return Object.entries(byModel).map(([model, data]) => ({
- model: `${model} (n=${data.scores.length})`,
- avg_score: data.scores.length > 0
- ? Math.round(
- (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
- )
+ return Object.entries(byCell).map(([cell_id, data]) => ({
+ cell_id,
+ model: data.model,
+ task: data.task,
+ avgScore: data.scores.length > 0
+ ? data.scores.reduce((a, b) => a + b, 0) / data.scores.length
: 0,
- avg_cost: data.costs.length > 0
- ? Math.round(
- (data.costs.reduce((a, b) => a + b, 0) / data.costs.length) * 100
- ) / 100
+ avgCost: data.costs.length > 0
+ ? data.costs.reduce((a, b) => a + b, 0) / data.costs.length
: 0,
- count: data.scores.length,
+ passRate: data.total > 0
+ ? data.passes / data.total
+ : 0,
+ runCount: data.total,
}));
}
-interface TaskScore {
- task: string;
- avg_score: number;
- pass_rate: number;
+function aggregateByModel(runs: Run[]): ModelScore[] {
+ const cells = aggregateCells(runs);
+ const byModel: Record<string, CellAggregate[]> = {};
+
+ for (const cell of cells) {
+ if (!byModel[cell.model]) byModel[cell.model] = [];
+ byModel[cell.model].push(cell);
+ }
+
+ return Object.entries(byModel).map(([model, modelCells]) => {
+ const scores = modelCells.map((c) => Math.round(c.avgScore * 100));
+ const costs = modelCells.map((c) => c.avgCost);
+ const avgScore = scores.length > 0
+ ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length)
+ : 0;
+ const minScore = scores.length > 0 ? Math.min(...scores) : 0;
+ const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
+ const avgCost = costs.length > 0
+ ? Math.round((costs.reduce((a, b) => a + b, 0) / costs.length) * 100) / 100
+ : 0;
+
+ return {
+ model: `${model} (n=${modelCells.length} cells)`,
+ avg_score: avgScore,
+ min_score: minScore,
+ max_score: maxScore,
+ errorRange: [avgScore - minScore, maxScore - avgScore] as [number, number],
+ avg_cost: avgCost,
+ cellCount: modelCells.length,
+ };
+ });
}
function aggregateByTask(runs: Run[]): TaskScore[] {
- const byTask: Record<string, { scores: number[]; passes: number; total: number }> = {};
+ const cells = aggregateCells(runs);
+ const byTask: Record<string, CellAggregate[]> = {};
- for (const run of runs) {
- const task = run.meta.task;
- if (!byTask[task]) byTask[task] = { scores: [], passes: 0, total: 0 };
-
- byTask[task].total++;
- if (run.eval_results?.score != null) {
- byTask[task].scores.push(run.eval_results.score);
- }
- if (run.eval_results?.functional?.pass) {
- byTask[task].passes++;
- }
+ for (const cell of cells) {
+ if (!byTask[cell.task]) byTask[cell.task] = [];
+ byTask[cell.task].push(cell);
}
- return Object.entries(byTask).map(([task, data]) => ({
- task: `${task} (n=${data.total})`,
- avg_score: data.scores.length > 0
- ? Math.round(
- (data.scores.reduce((a, b) => a + b, 0) / data.scores.length) * 100
- )
- : 0,
- pass_rate: data.total > 0
- ? Math.round((data.passes / data.total) * 100)
- : 0,
- }));
+ return Object.entries(byTask).map(([task, taskCells]) => {
+ const scores = taskCells.map((c) => Math.round(c.avgScore * 100));
+ const passRates = taskCells.map((c) => Math.round(c.passRate * 100));
+
+ const avgScore = scores.length > 0
+ ? Math.round(scores.reduce((a, b) => a + b, 0) / scores.length)
+ : 0;
+ const minScore = scores.length > 0 ? Math.min(...scores) : 0;
+ const maxScore = scores.length > 0 ? Math.max(...scores) : 0;
+
+ const avgPassRate = passRates.length > 0
+ ? Math.round(passRates.reduce((a, b) => a + b, 0) / passRates.length)
+ : 0;
+ const minPassRate = passRates.length > 0 ? Math.min(...passRates) : 0;
+ const maxPassRate = passRates.length > 0 ? Math.max(...passRates) : 0;
+
+ return {
+ task: `${task} (n=${taskCells.length} cells)`,
+ avg_score: avgScore,
+ min_score: minScore,
+ max_score: maxScore,
+ scoreErrorRange: [avgScore - minScore, maxScore - avgScore] as [number, number],
+ pass_rate: avgPassRate,
+ min_pass_rate: minPassRate,
+ max_pass_rate: maxPassRate,
+ passRateErrorRange: [avgPassRate - minPassRate, maxPassRate - avgPassRate] as [number, number],
+ cellCount: taskCells.length,
+ };
+ });
}
export default function Charts({ runs }: ChartsProps) {
@@ -154,8 +240,17 @@ export default function Charts({ runs }: ChartsProps) {
tickLine={false}
axisLine={false}
/>
- <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
+ <Tooltip
+ contentStyle={TOOLTIP_STYLE}
+ cursor={{ fill: "hsl(217 17% 28% / 0.3)" }}
+ formatter={(value: number, name: string) => {
+ if (name === "Avg Score %") return [`${value}%`, name];
+ return [value, name];
+ }}
+ labelFormatter={(label: string) => label}
+ />
<Bar dataKey="avg_score" name="Avg Score %" radius={0}>
+ <ErrorBar dataKey="errorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
{modelData.map((entry) => {
const baseModel = entry.model.split(" ")[0];
return <Cell key={entry.model} fill={MODEL_COLORS[baseModel] || SMUI.frost2} />;
@@ -186,7 +281,14 @@ export default function Charts({ runs }: ChartsProps) {
tickLine={false}
axisLine={false}
/>
- <Tooltip contentStyle={TOOLTIP_STYLE} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
+ <Tooltip
+ contentStyle={TOOLTIP_STYLE}
+ cursor={{ fill: "hsl(217 17% 28% / 0.3)" }}
+ formatter={(value: number, name: string) => {
+ return [`${value}%`, name];
+ }}
+ labelFormatter={(label: string) => label}
+ />
<Legend
wrapperStyle={{
fontFamily: "'JetBrains Mono', monospace",
@@ -195,8 +297,12 @@ export default function Charts({ runs }: ChartsProps) {
letterSpacing: "0.5px",
}}
/>
- <Bar dataKey="avg_score" fill={SMUI.frost2} name="Avg Score %" radius={0} />
- <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0} />
+ <Bar dataKey="avg_score" fill={SMUI.frost2} name="Avg Score %" radius={0}>
+ <ErrorBar dataKey="scoreErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
+ </Bar>
+ <Bar dataKey="pass_rate" fill={SMUI.green} name="Pass Rate %" radius={0}>
+ <ErrorBar dataKey="passRateErrorRange" stroke={SMUI.muted} strokeWidth={1.5} width={6} />
+ </Bar>
</BarChart>
</ResponsiveContainer>
</div>
diff --git a/dashboard/src/components/TornadoChart.tsx b/dashboard/src/components/TornadoChart.tsx
@@ -41,8 +41,13 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
);
}
- const maxSpread = Math.max(...effects.map((e) => e.spread));
- const scale = maxSpread > 0 ? 200 / maxSpread : 1; // max bar width = 200px
+ // Scale must account for variance bands extending beyond effect bars
+ const maxExtent = Math.max(
+ ...effects.flatMap((e) =>
+ e.values.map((v) => Math.abs(v.effect) + v.variance)
+ )
+ );
+ const scale = maxExtent > 0 ? 200 / maxExtent : 1;
return (
<div className="card">
@@ -54,7 +59,8 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
marginBottom: "16px",
}}
>
- Sorted by effect size. Wider bars = bigger impact on outcomes.
+ Sorted by effect size. Solid bars show effect (deviation from grand
+ mean). Shaded bands show within-cell variance.
</p>
{effects.map((effect) => (
@@ -89,7 +95,9 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
}}
>
{effect.values.map((entry) => {
- const width = Math.abs(entry.effect) * scale;
+ const effectWidth = Math.abs(entry.effect) * scale;
+ const varianceBandWidth =
+ (Math.abs(entry.effect) + entry.variance) * scale;
const isPositive = entry.effect >= 0;
return (
<div
@@ -114,34 +122,73 @@ export default function TornadoChart({ effects, metric }: TornadoChartProps) {
</div>
<div
style={{
+ position: "relative",
height: "16px",
- width: `${Math.max(width, 2)}px`,
- background: isPositive
- ? "var(--green)"
- : "var(--red)",
- borderRadius: "2px",
- opacity: 0.8,
+ width: `${Math.max(varianceBandWidth, effectWidth, 2)}px`,
}}
- />
+ >
+ {/* Variance band (behind, wider, semi-transparent) */}
+ {entry.variance > 0 && (
+ <div
+ style={{
+ position: "absolute",
+ top: "1px",
+ left: 0,
+ height: "14px",
+ width: `${Math.max(varianceBandWidth, 2)}px`,
+ background: isPositive
+ ? "var(--green)"
+ : "var(--red)",
+ opacity: 0.15,
+ borderRadius: "2px",
+ }}
+ />
+ )}
+ {/* Effect bar (foreground, solid) */}
+ <div
+ style={{
+ position: "absolute",
+ top: 0,
+ left: 0,
+ height: "16px",
+ width: `${Math.max(effectWidth, 2)}px`,
+ background: isPositive
+ ? "var(--green)"
+ : "var(--red)",
+ borderRadius: "2px",
+ opacity: 0.8,
+ }}
+ />
+ </div>
<div
style={{
fontSize: "0.7rem",
fontFamily: "var(--font-mono)",
- color: isPositive
- ? "var(--green)"
- : "var(--red)",
+ color: isPositive ? "var(--green)" : "var(--red)",
+ whiteSpace: "nowrap",
}}
>
{entry.effect >= 0 ? "+" : ""}
{(entry.effect * 100).toFixed(1)}%
+ {entry.variance > 0 && (
+ <span
+ style={{
+ color: "var(--text-muted)",
+ marginLeft: "4px",
+ }}
+ >
+ ±{(entry.variance * 100).toFixed(1)}%
+ </span>
+ )}
</div>
<div
style={{
fontSize: "0.65rem",
color: "var(--text-muted)",
+ whiteSpace: "nowrap",
}}
>
- (n={entry.n})
+ {entry.n} cells
</div>
</div>
);
diff --git a/dashboard/src/lib/analysis.ts b/dashboard/src/lib/analysis.ts
@@ -1,10 +1,24 @@
import type { Run, AxisName } from "./types";
import { AXIS_NAMES } from "./types";
+export interface Cell {
+ cell_id: string;
+ runs: Run[];
+ meta: Run["meta"]; // from first run
+ n: number;
+ score: { avg: number; min: number; max: number; range: number };
+ cost: { avg: number; min: number; max: number };
+ turns: { avg: number; min: number; max: number };
+ wall_time: { avg: number; min: number; max: number };
+ gameplay: { avg: number; min: number; max: number };
+ code_quality: { avg: number; min: number; max: number };
+}
+
export interface EffectEntry {
value: string;
mean: number;
effect: number;
+ variance: number;
n: number;
}
@@ -16,6 +30,7 @@ export interface AxisEffect {
export interface InteractionCell {
mean: number;
+ variance: number;
n: number;
}
@@ -55,6 +70,56 @@ const METRICS: Record<string, MetricExtractor> = {
transcript: (r) => (r.eval_results as Record<string, any>)?.transcript_analysis?.score ?? null,
};
+function agg(values: number[]): { avg: number; min: number; max: number } {
+ if (values.length === 0) return { avg: 0, min: 0, max: 0 };
+ const sum = values.reduce((a, b) => a + b, 0);
+ return {
+ avg: sum / values.length,
+ min: Math.min(...values),
+ max: Math.max(...values),
+ };
+}
+
+export function groupIntoCells(runs: Run[]): Cell[] {
+ const byCell = new Map<string, Run[]>();
+ for (const run of runs) {
+ const id = run.meta.cell_id;
+ const list = byCell.get(id);
+ if (list) list.push(run);
+ else byCell.set(id, [run]);
+ }
+
+ const cells: Cell[] = [];
+ for (const [cell_id, cellRuns] of byCell) {
+ const extractVals = (extractor: MetricExtractor): number[] => {
+ const vals: number[] = [];
+ for (const r of cellRuns) {
+ const v = extractor(r);
+ if (v !== null) vals.push(v);
+ }
+ return vals;
+ };
+
+ const scoreVals = extractVals(METRICS.score);
+ const scoreAgg = agg(scoreVals);
+
+ cells.push({
+ cell_id,
+ runs: cellRuns,
+ meta: cellRuns[0].meta,
+ n: cellRuns.length,
+ score: { ...scoreAgg, range: scoreAgg.max - scoreAgg.min },
+ cost: agg(extractVals(METRICS.cost)),
+ turns: agg(extractVals(METRICS.turns)),
+ wall_time: agg(extractVals(METRICS.wall_time)),
+ gameplay: agg(extractVals(METRICS.gameplay)),
+ code_quality: agg(extractVals(METRICS.code_quality)),
+ });
+ }
+
+ return cells;
+}
+
export function computeMainEffects(
runs: Run[],
metric: string = "score"
@@ -62,14 +127,24 @@ export function computeMainEffects(
const extract = METRICS[metric];
if (!extract) return [];
- const scored: Array<{ meta: Run["meta"]; value: number }> = [];
- for (const run of runs) {
- const val = extract(run);
- if (val !== null) scored.push({ meta: run.meta, value: val });
+ const cells = groupIntoCells(runs);
+
+ // Compute per-cell metric averages and ranges
+ const scored: Array<{ meta: Run["meta"]; avg: number; range: number }> = [];
+ for (const cell of cells) {
+ const vals: number[] = [];
+ for (const run of cell.runs) {
+ const v = extract(run);
+ if (v !== null) vals.push(v);
+ }
+ if (vals.length === 0) continue;
+ const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+ const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
+ scored.push({ meta: cell.meta, avg: cellAvg, range: cellRange });
}
if (scored.length === 0) return [];
- const grandMean = scored.reduce((s, r) => s + r.value, 0) / scored.length;
+ const grandMean = scored.reduce((s, c) => s + c.avg, 0) / scored.length;
// Find axis keys from meta
const axisKeys = Object.keys(scored[0].meta).filter(
@@ -79,22 +154,26 @@ export function computeMainEffects(
const effects: AxisEffect[] = [];
for (const axis of axisKeys) {
- const groups: Record<string, number[]> = {};
- for (const { meta, value } of scored) {
+ const groups: Record<string, { avgs: number[]; ranges: number[] }> = {};
+ for (const { meta, avg, range } of scored) {
const key = String((meta as Record<string, unknown>)[axis] ?? "unknown");
- (groups[key] ??= []).push(value);
+ const g = groups[key] ??= { avgs: [], ranges: [] };
+ g.avgs.push(avg);
+ g.ranges.push(range);
}
if (Object.keys(groups).length < 2) continue;
const values: EffectEntry[] = [];
- for (const [val, vals] of Object.entries(groups)) {
- const mean = vals.reduce((a, b) => a + b, 0) / vals.length;
+ for (const [val, { avgs, ranges }] of Object.entries(groups)) {
+ const mean = avgs.reduce((a, b) => a + b, 0) / avgs.length;
+ const variance = ranges.reduce((a, b) => a + b, 0) / ranges.length;
values.push({
value: val,
mean: Math.round(mean * 10000) / 10000,
effect: Math.round((mean - grandMean) * 10000) / 10000,
- n: vals.length,
+ variance: Math.round(variance * 10000) / 10000,
+ n: avgs.length,
});
}
@@ -121,31 +200,49 @@ export function computeInteraction(
if (!extract)
return { axisA, axisB, table: {}, maxInteraction: 0 };
- const groups: Record<string, Record<string, number[]>> = {};
+ const cells = groupIntoCells(runs);
- for (const run of runs) {
- const val = extract(run);
- if (val === null) continue;
- const a = String((run.meta as Record<string, unknown>)[axisA] ?? "?");
- const b = String((run.meta as Record<string, unknown>)[axisB] ?? "?");
- ((groups[a] ??= {})[b] ??= []).push(val);
+ // Group cells by (axisA, axisB) combination
+ const groups: Record<string, Record<string, { avgs: number[]; ranges: number[] }>> = {};
+
+ for (const cell of cells) {
+ const vals: number[] = [];
+ for (const run of cell.runs) {
+ const v = extract(run);
+ if (v !== null) vals.push(v);
+ }
+ if (vals.length === 0) continue;
+
+ const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+ const cellRange = vals.length > 1 ? Math.max(...vals) - Math.min(...vals) : 0;
+
+ const a = String((cell.meta as Record<string, unknown>)[axisA] ?? "?");
+ const b = String((cell.meta as Record<string, unknown>)[axisB] ?? "?");
+ const g = ((groups[a] ??= {})[b] ??= { avgs: [], ranges: [] });
+ g.avgs.push(cellAvg);
+ g.ranges.push(cellRange);
}
const table: Record<string, Record<string, InteractionCell>> = {};
- const allVals: number[] = [];
+ const allMeans: number[] = [];
for (const [a, bGroups] of Object.entries(groups)) {
table[a] = {};
- for (const [b, vals] of Object.entries(bGroups)) {
- const mean = vals.reduce((s, v) => s + v, 0) / vals.length;
- table[a][b] = { mean: Math.round(mean * 10000) / 10000, n: vals.length };
- allVals.push(mean);
+ for (const [b, { avgs, ranges }] of Object.entries(bGroups)) {
+ const mean = avgs.reduce((s, v) => s + v, 0) / avgs.length;
+ const variance = ranges.reduce((s, v) => s + v, 0) / ranges.length;
+ table[a][b] = {
+ mean: Math.round(mean * 10000) / 10000,
+ variance: Math.round(variance * 10000) / 10000,
+ n: avgs.length,
+ };
+ allMeans.push(mean);
}
}
const grandMean =
- allVals.length > 0
- ? allVals.reduce((a, b) => a + b, 0) / allVals.length
+ allMeans.length > 0
+ ? allMeans.reduce((a, b) => a + b, 0) / allMeans.length
: 0;
// Row and column means
diff --git a/dashboard/src/pages/compare.astro b/dashboard/src/pages/compare.astro
@@ -1,23 +1,80 @@
---
import Base from "../layouts/Base.astro";
-import { loadAllRuns, getAxisValues, getTaskNames, aggregateRuns, AXIS_NAMES } from "../lib/data";
+import { loadAllRuns, getAxisValues, getTaskNames, AXIS_NAMES } from "../lib/data";
import type { Run, AxisName } from "../lib/data";
const runs = loadAllRuns();
const axisValues = getAxisValues(runs);
const tasks = getTaskNames(runs);
-// Build comparison data: for each axis, show how different values perform
+// Build comparison data using cell-based aggregation.
+// A "cell" is a unique configuration (cell_id). Multiple runs share a cell_id
+// when they are repeat trials of the same config. Averaging per-cell first,
+// then aggregating across cells, prevents configs with more repeats from
+// dominating the average.
+
interface ComparisonRow {
axis: string;
value: string;
- count: number;
+ cells: number; // number of unique configs
+ runs: number; // total runs
avg_score: string;
- pass_rate: string;
+ score_range: string; // "68%-80%"
avg_cost: string;
+ cost_range: string; // "$0.15-$0.22"
avg_time: string;
}
+interface CellStats {
+ avg_score: number | null;
+ avg_cost: number | null;
+ avg_time: number | null;
+ run_count: number;
+}
+
+/** Compute per-cell averages from a list of runs. */
+function getCellStats(runs: Run[]): Map<string, CellStats> {
+ const cells = new Map<string, Run[]>();
+ for (const run of runs) {
+ const id = run.meta.cell_id;
+ if (!cells.has(id)) cells.set(id, []);
+ cells.get(id)!.push(run);
+ }
+
+ const result = new Map<string, CellStats>();
+ for (const [cellId, cellRuns] of cells) {
+ const scores = cellRuns
+ .map((r) => r.eval_results?.score)
+ .filter((s): s is number => s != null);
+ const costs = cellRuns
+ .map((r) => r.claude_output?.total_cost_usd)
+ .filter((c): c is number => c != null);
+ const times = cellRuns
+ .map((r) => r.meta.wall_time_seconds)
+ .filter((t): t is number => t != null);
+
+ const avg = (arr: number[]) =>
+ arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
+
+ result.set(cellId, {
+ avg_score: avg(scores),
+ avg_cost: avg(costs),
+ avg_time: avg(times),
+ run_count: cellRuns.length,
+ });
+ }
+ return result;
+}
+
+function formatRange(values: number[], formatter: (v: number) => string): string {
+ if (values.length === 0) return "-";
+ if (values.length === 1) return formatter(values[0]);
+ const min = Math.min(...values);
+ const max = Math.max(...values);
+ if (min === max) return formatter(min);
+ return formatter(min) + "-" + formatter(max);
+}
+
const comparisons: ComparisonRow[] = [];
const AXIS_LABELS: Record<AxisName, string> = {
@@ -39,23 +96,50 @@ const AXIS_LABELS: Record<AxisName, string> = {
max_budget: "Budget",
};
+// Pre-compute all cell stats once
+const allCellStats = getCellStats(runs);
+
for (const axis of AXIS_NAMES) {
for (const value of axisValues[axis]) {
const filtered = runs.filter(
(r: Run) => String(r.meta[axis as keyof typeof r.meta]) === value
);
- const stats = aggregateRuns(filtered);
+
+ // Find the unique cell_ids in these runs and gather their stats
+ const cellIds = new Set(filtered.map((r) => r.meta.cell_id));
+ const matchingCells: CellStats[] = [];
+ for (const id of cellIds) {
+ const cs = allCellStats.get(id);
+ if (cs) matchingCells.push(cs);
+ }
+
+ const cellScores = matchingCells
+ .map((c) => c.avg_score)
+ .filter((s): s is number => s != null);
+ const cellCosts = matchingCells
+ .map((c) => c.avg_cost)
+ .filter((c): c is number => c != null);
+ const cellTimes = matchingCells
+ .map((c) => c.avg_time)
+ .filter((t): t is number => t != null);
+
+ const avg = (arr: number[]) =>
+ arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;
+
+ const avgScore = avg(cellScores);
+ const avgCost = avg(cellCosts);
+ const avgTime = avg(cellTimes);
+
comparisons.push({
axis: AXIS_LABELS[axis],
value,
- count: stats.count,
- avg_score:
- stats.avg_score != null ? (stats.avg_score * 100).toFixed(0) + "%" : "-",
- pass_rate:
- stats.pass_rate != null ? (stats.pass_rate * 100).toFixed(0) + "%" : "-",
- avg_cost: stats.avg_cost != null ? "$" + stats.avg_cost.toFixed(2) : "-",
- avg_time:
- stats.avg_wall_time != null ? Math.round(stats.avg_wall_time) + "s" : "-",
+ cells: cellIds.size,
+ runs: filtered.length,
+ avg_score: avgScore != null ? (avgScore * 100).toFixed(0) + "%" : "-",
+ score_range: formatRange(cellScores, (v) => (v * 100).toFixed(0) + "%"),
+ avg_cost: avgCost != null ? "$" + avgCost.toFixed(2) : "-",
+ cost_range: formatRange(cellCosts, (v) => "$" + v.toFixed(2)),
+ avg_time: avgTime != null ? Math.round(avgTime) + "s" : "-",
});
}
}
@@ -78,10 +162,12 @@ for (const axis of AXIS_NAMES) {
<tr>
<th>Axis</th>
<th>Value</th>
+ <th>Cells</th>
<th>Runs</th>
<th>Avg Score</th>
- <th>Pass Rate</th>
+ <th>Score Range</th>
<th>Avg Cost</th>
+ <th>Cost Range</th>
<th>Avg Time</th>
</tr>
</thead>
@@ -92,10 +178,12 @@ for (const axis of AXIS_NAMES) {
<td>
<span class="badge badge-neutral">{row.value}</span>
</td>
- <td>{row.count}</td>
+ <td>{row.cells}</td>
+ <td>{row.runs}</td>
<td class="score-cell">{row.avg_score}</td>
- <td class="score-cell">{row.pass_rate}</td>
+ <td style="color: var(--text-muted); font-size: 0.85rem;">{row.score_range}</td>
<td>{row.avg_cost}</td>
+ <td style="color: var(--text-muted); font-size: 0.85rem;">{row.cost_range}</td>
<td>{row.avg_time}</td>
</tr>
))}