commit f213f5a1831e271fcf572e9de9073fe5c85985ba
parent 364e1e4595a31324e0d96750c5bff342c7bbaf76
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sun, 5 Apr 2026 21:22:35 +0200
Convert all charts to cell-based: every visualization now shows cells, not runs
ScatterPlot: dots are cells with error bar crosshairs showing run ranges
CorrelationMatrix: spreads computed from cell averages per axis value
HeatmapMatrix: values are averages of cell averages, labels show "N cells"
BumpChart: model rankings from cell averages per condition
RadarComparison: dimension scores from cell averages
ConfigTreemap: size = cell count, color = avg cell score
EfficiencyFrontier: simplified to use groupIntoCells(), labels updated
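All charts now consistently represent cells (unique configs). ScatterPlot
shows variance from repeat runs as ranges/error bars; the other charts use
cell averages without showing run-level variance.
Note: BumpChart, ConfigTreemap and EfficiencyFrontier now treat a cell whose
average score is 0 as unscored (filter is `score.avg > 0`), and ScatterPlot
skips cells whose avg/min/max for a plotted metric are all 0.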
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
7 files changed, 263 insertions(+), 169 deletions(-)
diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx
@@ -11,6 +11,7 @@ import {
} from "recharts";
import type { Run } from "../lib/types";
import { AXIS_NAMES, type AxisName } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
interface BumpChartProps {
runs: Run[];
@@ -65,15 +66,19 @@ function computeRankings(
runs: Run[],
axis: AxisName
): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } {
+ // Group runs into cells, then work with cell averages
+ const cells = groupIntoCells(runs);
+ const scoredCells = cells.filter((c) => c.score.avg > 0);
+
// Get unique condition values for the selected axis
const conditionValues = Array.from(
- new Set(runs.map((r) => String(r.meta[axis])))
+ new Set(scoredCells.map((c) => String(c.meta[axis])))
).sort();
// Get unique models
- const models = Array.from(new Set(runs.map((r) => r.meta.model))).sort();
+ const models = Array.from(new Set(scoredCells.map((c) => c.meta.model))).sort();
- // For each condition value, compute average score per model, then rank
+ // For each condition value, compute average of cell averages per model, then rank
const ranked: Record<string, RankedPoint[]> = {};
for (const model of models) {
ranked[model] = [];
@@ -85,27 +90,24 @@ function computeRankings(
for (let ci = 0; ci < conditionValues.length; ci++) {
const cv = conditionValues[ci];
- const runsForCondition = runs.filter(
- (r) => String(r.meta[axis]) === cv
+ const cellsForCondition = scoredCells.filter(
+ (c) => String(c.meta[axis]) === cv
);
- // Compute average score per model for this condition
+ // Compute average of cell averages per model for this condition
const modelScores: Array<{
model: string;
avgScore: number;
n: number;
}> = [];
for (const model of models) {
- const modelRuns = runsForCondition.filter(
- (r) => r.meta.model === model
+ const modelCells = cellsForCondition.filter(
+ (c) => c.meta.model === model
);
- const scores = modelRuns
- .map((r) => r.eval_results?.score)
- .filter((s): s is number => s !== null && s !== undefined);
- if (scores.length > 0) {
- const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
- modelScores.push({ model, avgScore: avg, n: scores.length });
+ if (modelCells.length > 0) {
+ const avg = modelCells.reduce((s, c) => s + c.score.avg, 0) / modelCells.length;
+ modelScores.push({ model, avgScore: avg, n: modelCells.length });
}
}
@@ -280,8 +282,7 @@ function CustomTooltipContent({
{entry.point!.model}
</span>
<span style={{ color: "var(--text-muted)", marginLeft: "auto" }}>
- {(entry.point!.avgScore * 100).toFixed(1)}% (n=
- {entry.point!.n})
+ {(entry.point!.avgScore * 100).toFixed(1)}% ({entry.point!.n} cells)
</span>
</div>
))}
@@ -292,18 +293,17 @@ function CustomTooltipContent({
export default function BumpChart({ runs }: BumpChartProps) {
// Pre-compute which axes are useful: need 2+ condition values AND 2+ models with scores
const validAxes = useMemo(() => {
- const scoredRuns = runs.filter(
- (r) => r.eval_results?.score !== null && r.eval_results?.score !== undefined
- );
+ const cells = groupIntoCells(runs);
+ const scoredCells = cells.filter((c) => c.score.avg > 0);
return CONDITION_AXES.filter((axis) => {
const conditionValues = Array.from(
- new Set(scoredRuns.map((r) => String(r.meta[axis])))
+ new Set(scoredCells.map((c) => String(c.meta[axis])))
);
if (conditionValues.length < 2) return false;
// Check that at least one condition value has 2+ models with scores
for (const cv of conditionValues) {
const modelsWithScores = new Set(
- scoredRuns.filter((r) => String(r.meta[axis]) === cv).map((r) => r.meta.model)
+ scoredCells.filter((c) => String(c.meta[axis]) === cv).map((c) => c.meta.model)
);
if (modelsWithScores.size >= 2) return true;
}
@@ -354,12 +354,9 @@ export default function BumpChart({ runs }: BumpChartProps) {
const maxRank = models.length;
- const scoredRuns = runs.filter(
- (r) =>
- r.eval_results?.score !== null && r.eval_results?.score !== undefined
- );
+ const scoredCells = groupIntoCells(runs).filter((c) => c.score.avg > 0);
- if (scoredRuns.length === 0) {
+ if (scoredCells.length === 0) {
return (
<div
className="card"
@@ -369,7 +366,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
color: "var(--text-muted)",
}}
>
- No scored runs available for ranking.
+ No scored cells available for ranking.
</div>
);
}
@@ -387,7 +384,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
}}
>
Not enough data to compare models. Rankings need at least 2 condition
- values where 2 or more models have scored runs.
+ values where 2 or more models have scored cells.
</div>
</div>
);
@@ -414,7 +411,7 @@ export default function BumpChart({ runs }: BumpChartProps) {
margin: "4px 0 0",
}}
>
- Rank 1 = best average score. Crossings indicate rank swaps.
+ Rank 1 = best average cell score. Crossings indicate rank swaps.
</p>
</div>
<div style={{ display: "flex", alignItems: "center", gap: "8px" }}>
diff --git a/dashboard/src/components/ConfigTreemap.tsx b/dashboard/src/components/ConfigTreemap.tsx
@@ -2,6 +2,7 @@ import React, { useState, useCallback } from "react";
import { Treemap, ResponsiveContainer, Tooltip } from "recharts";
import type { TreemapNode } from "recharts/types/chart/Treemap";
import type { Run, AxisName } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
interface ConfigTreemapProps {
runs: Run[];
@@ -47,14 +48,15 @@ interface GroupData {
}
function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] {
- const byModel: Record<string, Record<string, Run[]>> = {};
+ const cells = groupIntoCells(runs);
+ const byModel: Record<string, Record<string, Cell[]>> = {};
- for (const run of runs) {
- const model = run.meta.model;
- const secondary = String(run.meta[secondaryAxis]);
+ for (const cell of cells) {
+ const model = cell.meta.model;
+ const secondary = String(cell.meta[secondaryAxis]);
if (!byModel[model]) byModel[model] = {};
if (!byModel[model][secondary]) byModel[model][secondary] = [];
- byModel[model][secondary].push(run);
+ byModel[model][secondary].push(cell);
}
return Object.entries(byModel)
@@ -63,19 +65,17 @@ function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] {
name: model,
children: Object.entries(configs)
.sort(([a], [b]) => a.localeCompare(b))
- .map(([configValue, configRuns]) => {
- const scores = configRuns
- .map((r) => r.eval_results?.score)
- .filter((s): s is number => s !== null && s !== undefined);
+ .map(([configValue, configCells]) => {
+ const scoredCells = configCells.filter((c) => c.score.avg > 0);
const avgScore =
- scores.length > 0
- ? scores.reduce((a, b) => a + b, 0) / scores.length
+ scoredCells.length > 0
+ ? scoredCells.reduce((s, c) => s + c.score.avg, 0) / scoredCells.length
: null;
return {
name: `${model} / ${configValue}`,
displayName: `${model} / ${configValue}`,
- size: configRuns.length,
+ size: configCells.length,
avgScore,
avgScorePct:
avgScore !== null ? `${(avgScore * 100).toFixed(0)}%` : "--",
@@ -198,7 +198,7 @@ function CustomTooltip({
Score: {node.avgScorePct}
</div>
<div style={{ color: "hsl(213 14% 65%)" }}>
- Runs: {node.size}
+ Cells: {node.size}
</div>
</div>
);
diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx
@@ -1,4 +1,5 @@
import type { Run } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
interface CorrelationMatrixProps {
runs: Run[];
@@ -36,13 +37,19 @@ const OUTCOME_METRICS: Array<{ key: string; label: string; extract: MetricExtrac
{ key: "time", label: "Wall Time", extract: (r) => r.meta.wall_time_seconds ?? null },
];
-function computeSpread(runs: Run[], axisKey: string, extract: MetricExtractor): number | null {
+function computeSpread(cells: Cell[], axisKey: string, extract: MetricExtractor): number | null {
+ // Compute per-cell metric averages, then group by axis value
const groups: Record<string, number[]> = {};
- for (const run of runs) {
- const val = extract(run);
- if (val === null) continue;
- const groupKey = String((run.meta as Record<string, unknown>)[axisKey] ?? "unknown");
- (groups[groupKey] ??= []).push(val);
+ for (const cell of cells) {
+ const vals: number[] = [];
+ for (const run of cell.runs) {
+ const v = extract(run);
+ if (v !== null) vals.push(v);
+ }
+ if (vals.length === 0) continue;
+ const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length;
+ const groupKey = String((cell.meta as Record<string, unknown>)[axisKey] ?? "unknown");
+ (groups[groupKey] ??= []).push(cellAvg);
}
const keys = Object.keys(groups);
@@ -72,6 +79,9 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
);
}
+ // Group runs into cells once, then compute spreads from cell averages
+ const cells = groupIntoCells(runs);
+
// Compute the full matrix: rows = config axes, columns = metrics
const matrix: Array<{
key: string;
@@ -82,7 +92,7 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) {
for (const axis of CONFIG_AXES) {
const spreads = OUTCOME_METRICS.map((metric) =>
- computeSpread(runs, axis.key, metric.extract)
+ computeSpread(cells, axis.key, metric.extract)
);
const validSpreads = spreads.filter((s): s is number => s !== null);
const maxSpread = validSpreads.length > 0 ? Math.max(...validSpreads) : 0;
diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx
@@ -9,6 +9,7 @@ import {
ResponsiveContainer,
} from "recharts";
import type { Run } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
interface EfficiencyFrontierProps {
runs: Run[];
@@ -42,55 +43,28 @@ function getModelColor(model: string): string {
}
function aggregateByConfig(runs: Run[]): ConfigPoint[] {
- const groups: Record<
- string,
- {
- scores: number[];
- costs: number[];
- model: string;
- config: Record<string, string>;
- }
- > = {};
-
- for (const run of runs) {
- const id = run.meta.cell_id;
- if (!groups[id]) {
- groups[id] = {
- scores: [],
- costs: [],
- model: run.meta.model,
- config: {
- model: run.meta.model,
- effort: run.meta.effort,
- prompt_style: run.meta.prompt_style,
- language: run.meta.language,
- linter: run.meta.linter,
- playwright: run.meta.playwright,
- context_file: run.meta.context_file,
- sub_agents: run.meta.sub_agents,
- web_search: run.meta.web_search,
- max_budget: run.meta.max_budget,
- },
- };
- }
-
- if (run.eval_results?.score != null) {
- groups[id].scores.push(run.eval_results.score);
- }
- if (run.claude_output?.total_cost_usd != null) {
- groups[id].costs.push(run.claude_output.total_cost_usd);
- }
- }
-
- return Object.entries(groups)
- .filter(([, g]) => g.scores.length > 0 && g.costs.length > 0)
- .map(([cell_id, g]) => ({
- cell_id,
- model: g.model,
- avgCost: g.costs.reduce((a, b) => a + b, 0) / g.costs.length,
- avgScore: g.scores.reduce((a, b) => a + b, 0) / g.scores.length,
- runCount: g.scores.length,
- config: g.config,
+ const cells = groupIntoCells(runs);
+
+ return cells
+ .filter((c) => c.score.avg > 0 && c.cost.avg > 0)
+ .map((c) => ({
+ cell_id: c.cell_id,
+ model: c.meta.model,
+ avgCost: c.cost.avg,
+ avgScore: c.score.avg,
+ runCount: c.n,
+ config: {
+ model: c.meta.model,
+ effort: c.meta.effort,
+ prompt_style: c.meta.prompt_style,
+ language: c.meta.language,
+ linter: c.meta.linter,
+ playwright: c.meta.playwright,
+ context_file: c.meta.context_file,
+ sub_agents: c.meta.sub_agents,
+ web_search: c.meta.web_search,
+ max_budget: c.meta.max_budget,
+ },
isFrontier: false,
label: "",
}));
@@ -203,7 +177,7 @@ function CustomTooltip({
<span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span>
</div>
<div style={{ marginBottom: "8px" }}>
- <span style={{ color: "var(--text-muted)" }}>runs: </span>
+ <span style={{ color: "var(--text-muted)" }}>runs in cell: </span>
<span>{point.runCount}</span>
</div>
{point.isFrontier && (
@@ -334,8 +308,8 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
marginBottom: "16px",
}}
>
- Cost vs score per config. Pareto frontier highlights configs not
- dominated on both axes.
+ Cost vs score per cell (averaged across runs). Pareto frontier
+ highlights cells not dominated on both axes.
</p>
{/* Legend */}
@@ -420,7 +394,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) {
{/* Non-frontier points (dimmed) */}
<Scatter
- name="configs"
+ name="cells"
data={nonFrontierPoints}
shape={nonFrontierShape}
isAnimationActive={false}
diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx
@@ -1,6 +1,7 @@
import { useState, useMemo } from "react";
import type { Run, AxisName } from "../lib/types";
import { AXIS_NAMES } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
interface HeatmapMatrixProps {
runs: Run[];
@@ -61,16 +62,20 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
const [colAxis, setColAxis] = useState<AxisName>("prompt_style");
const { rowValues, colValues, cells } = useMemo(() => {
+ const analysisCells = groupIntoCells(runs);
const cellMap: Record<string, Record<string, CellData>> = {};
const rowSet = new Set<string>();
const colSet = new Set<string>();
- for (const run of runs) {
- const score = run.eval_results?.score;
- if (score === null || score === undefined) continue;
+ for (const cell of analysisCells) {
+ // Skip cells where no run has a score
+ const hasScore = cell.runs.some((r) => r.eval_results?.score != null);
+ if (!hasScore) continue;
+ // Use the cell's average score as a single data point
+ const cellAvg = cell.score.avg;
- const rv = String(run.meta[rowAxis]);
- const cv = String(run.meta[colAxis]);
+ const rv = String(cell.meta[rowAxis]);
+ const cv = String(cell.meta[colAxis]);
rowSet.add(rv);
colSet.add(cv);
@@ -78,7 +83,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
if (!cellMap[rv]) cellMap[rv] = {};
if (!cellMap[rv][cv]) cellMap[rv][cv] = { totalScore: 0, count: 0 };
- cellMap[rv][cv].totalScore += score;
+ cellMap[rv][cv].totalScore += cellAvg;
cellMap[rv][cv].count += 1;
}
@@ -171,7 +176,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
fontFamily: "var(--font-mono)",
}}
>
- No scored runs available for this axis combination.
+ No scored cells available for this axis combination.
</div>
) : (
<div style={{ overflowX: "auto" }}>
@@ -297,7 +302,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) {
lineHeight: 1.3,
}}
>
- n={cell.count}
+ {cell.count} {cell.count === 1 ? "cell" : "cells"}
</div>
</td>
);
diff --git a/dashboard/src/components/RadarComparison.tsx b/dashboard/src/components/RadarComparison.tsx
@@ -9,6 +9,7 @@ import {
Tooltip,
} from "recharts";
import type { Run } from "../lib/types";
+import { groupIntoCells, type Cell } from "../lib/analysis";
interface RadarComparisonProps {
runs: Run[];
@@ -50,31 +51,25 @@ function extractDimensionScore(run: Run, dim: Dimension): number | null {
interface CellConfig {
cell_id: string;
label: string;
- runs: Run[];
+ cell: Cell;
}
function buildCellConfigs(runs: Run[]): CellConfig[] {
- const grouped: Record<string, Run[]> = {};
- for (const run of runs) {
- const id = run.meta.cell_id;
- if (!grouped[id]) grouped[id] = [];
- grouped[id].push(run);
- }
-
- return Object.entries(grouped)
- .map(([cell_id, cellRuns]) => {
- const m = cellRuns[0].meta;
+ const cells = groupIntoCells(runs);
+ return cells
+ .map((cell) => {
+ const m = cell.meta;
const label = `${m.model} / ${m.language} / ${m.prompt_style} / ${m.effort}`;
- return { cell_id, label, runs: cellRuns };
+ return { cell_id: cell.cell_id, label, cell };
})
.sort((a, b) => a.label.localeCompare(b.label));
}
-function averageScores(
- runs: Run[],
+function cellAverageScore(
+ cell: Cell,
dim: Dimension
): number | null {
- const scores = runs
+ const scores = cell.runs
.map((r) => extractDimensionScore(r, dim))
.filter((s): s is number => s !== null);
if (scores.length === 0) return null;
@@ -148,8 +143,8 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
const data: RadarDatum[] = useMemo(() => {
return DIMENSIONS.map((dim) => {
- const scoreA = configA ? averageScores(configA.runs, dim) : null;
- const scoreB = configB ? averageScores(configB.runs, dim) : null;
+ const scoreA = configA ? cellAverageScore(configA.cell, dim) : null;
+ const scoreB = configB ? cellAverageScore(configB.cell, dim) : null;
return {
dimension: DIMENSION_LABELS[dim],
scoreA: scoreA ?? 0,
@@ -203,7 +198,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
>
{cellConfigs.map((c) => (
<option key={c.cell_id} value={c.cell_id}>
- {c.label} (n={c.runs.length})
+ {c.label} ({c.cell.n} runs)
</option>
))}
</select>
@@ -217,7 +212,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) {
>
{cellConfigs.map((c) => (
<option key={c.cell_id} value={c.cell_id}>
- {c.label} (n={c.runs.length})
+ {c.label} ({c.cell.n} runs)
</option>
))}
</select>
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -7,8 +7,10 @@ import {
Tooltip,
ResponsiveContainer,
Legend,
+ ErrorBar,
} from "recharts";
import type { Run } from "../lib/types";
+import { groupIntoCells } from "../lib/analysis";
interface ScatterPlotProps {
runs: Run[];
@@ -16,68 +18,177 @@ interface ScatterPlotProps {
yMetric: string;
}
-const METRIC_CONFIG: Record<string, { label: string; extract: (r: Run) => number | null; format: (v: number) => string }> = {
+type CellMetricKey = "cost" | "score" | "turns" | "wall_time";
+
+const METRIC_CONFIG: Record<
+ string,
+ {
+ label: string;
+ cellKey: CellMetricKey;
+ scale: number; // multiply avg/min/max by this for display
+ format: (v: number) => string;
+ }
+> = {
cost: {
label: "Cost ($)",
- extract: (r) => r.claude_output?.total_cost_usd ?? null,
+ cellKey: "cost",
+ scale: 1,
format: (v) => `$${v.toFixed(2)}`,
},
score: {
label: "Score (%)",
- extract: (r) => r.eval_results?.score != null ? r.eval_results.score * 100 : null,
+ cellKey: "score",
+ scale: 100,
format: (v) => `${v.toFixed(0)}%`,
},
turns: {
label: "Turns",
- extract: (r) => r.claude_output?.num_turns ?? null,
- format: (v) => `${v}`,
+ cellKey: "turns",
+ scale: 1,
+ format: (v) => `${Math.round(v)}`,
},
wall_time: {
label: "Time (s)",
- extract: (r) => r.meta.wall_time_seconds ?? null,
- format: (v) => `${v}s`,
+ cellKey: "wall_time",
+ scale: 1,
+ format: (v) => `${Math.round(v)}s`,
},
};
const MODEL_COLORS: Record<string, string> = {
- haiku: "hsl(193 44% 67%)", // frost cyan
- sonnet: "hsl(40 71% 73%)", // aurora yellow
- opus: "hsl(311 24% 63%)", // aurora purple
+ haiku: "hsl(193 44% 67%)", // frost cyan
+ sonnet: "hsl(40 71% 73%)", // aurora yellow
+ opus: "hsl(311 24% 63%)", // aurora purple
};
-export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) {
+function formatCellId(cellId: string): string {
+ return cellId.replace(/_/g, " ");
+}
+
+interface CellDatum {
+ x: number;
+ y: number;
+ xErrorRange: [number, number];
+ yErrorRange: [number, number];
+ cell_id: string;
+ xLabel: string;
+ yLabel: string;
+ xRange: string;
+ yRange: string;
+ n: number;
+}
+
+function CustomTooltip({ active, payload }: any) {
+ if (!active || !payload?.length) return null;
+ const d: CellDatum = payload[0].payload;
+ return (
+ <div
+ style={{
+ background: "hsl(217 16% 15.5%)",
+ border: "1px solid hsl(217 17% 28%)",
+ borderRadius: "2px",
+ fontFamily: "'JetBrains Mono', monospace",
+ fontSize: "11px",
+ padding: "8px 10px",
+ lineHeight: "1.6",
+ color: "hsl(213 14% 80%)",
+ }}
+ >
+ <div style={{ fontWeight: 600, marginBottom: 4 }}>
+ {formatCellId(d.cell_id)}
+ </div>
+ <div>
+ {d.xLabel}: {d.xRange}
+ </div>
+ <div>
+ {d.yLabel}: {d.yRange}
+ </div>
+ <div style={{ marginTop: 2, color: "hsl(213 14% 55%)" }}>
+ {d.n} run{d.n !== 1 ? "s" : ""} in cell
+ </div>
+ </div>
+ );
+}
+
+export default function ScatterPlot({
+ runs,
+ xMetric,
+ yMetric,
+}: ScatterPlotProps) {
const xConf = METRIC_CONFIG[xMetric];
const yConf = METRIC_CONFIG[yMetric];
if (!xConf || !yConf) return null;
- // Group by model
- const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {};
+ const cells = groupIntoCells(runs);
+
+ // Group cells by model
+ const byModel: Record<string, CellDatum[]> = {};
+
+ for (const cell of cells) {
+ const xAgg = cell[xConf.cellKey];
+ const yAgg = cell[yConf.cellKey];
+ // Skip cells where either metric has no data
+ if (xAgg.avg === 0 && xAgg.min === 0 && xAgg.max === 0) continue;
+ if (yAgg.avg === 0 && yAgg.min === 0 && yAgg.max === 0) continue;
- for (const run of runs) {
- const x = xConf.extract(run);
- const y = yConf.extract(run);
- if (x === null || y === null) continue;
+ const xAvg = xAgg.avg * xConf.scale;
+ const xMin = xAgg.min * xConf.scale;
+ const xMax = xAgg.max * xConf.scale;
+ const yAvg = yAgg.avg * yConf.scale;
+ const yMin = yAgg.min * yConf.scale;
+ const yMax = yAgg.max * yConf.scale;
- const model = run.meta.model;
+ const model = cell.meta.model;
if (!byModel[model]) byModel[model] = [];
+
+ const xRangeStr =
+ cell.n > 1
+ ? `avg ${xConf.format(xAvg)} (${xConf.format(xMin)} - ${xConf.format(xMax)})`
+ : xConf.format(xAvg);
+ const yRangeStr =
+ cell.n > 1
+ ? `avg ${yConf.format(yAvg)} (${yConf.format(yMin)} - ${yConf.format(yMax)})`
+ : yConf.format(yAvg);
+
byModel[model].push({
- x,
- y,
- run_id: run.meta.run_id,
- prompt: run.meta.prompt_style,
+ x: xAvg,
+ y: yAvg,
+ xErrorRange: [xAvg - xMin, xMax - xAvg],
+ yErrorRange: [yAvg - yMin, yMax - yAvg],
+ cell_id: cell.cell_id,
+ xLabel: xConf.label,
+ yLabel: yConf.label,
+ xRange: xRangeStr,
+ yRange: yRangeStr,
+ n: cell.n,
});
}
const models = Object.keys(byModel).sort();
+ const totalCells = models.reduce((sum, m) => sum + byModel[m].length, 0);
return (
<div className="card">
<h3 style={{ marginBottom: "16px" }}>
- {xConf.label} vs {yConf.label}
+ {xConf.label} vs {yConf.label}{" "}
+ <span
+ style={{
+ fontSize: "12px",
+ fontWeight: 400,
+ color: "hsl(213 14% 55%)",
+ }}
+ >
+ ({totalCells} cells)
+ </span>
</h3>
<ResponsiveContainer width="100%" height={350}>
- <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}>
- <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" />
+ <ScatterChart
+ margin={{ top: 10, right: 20, bottom: 10, left: 10 }}
+ >
+ <CartesianGrid
+ strokeDasharray="3 3"
+ stroke="hsl(217 17% 28%)"
+ />
<XAxis
dataKey="x"
name={xConf.label}
@@ -92,20 +203,7 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps
fontSize={11}
tickFormatter={(v) => yConf.format(v)}
/>
- <Tooltip
- contentStyle={{
- background: "hsl(217 16% 15.5%)",
- border: "1px solid hsl(217 17% 28%)",
- borderRadius: "2px",
- fontFamily: "'JetBrains Mono', monospace",
- fontSize: "11px",
- }}
- formatter={(value: number, name: string) => {
- if (name === xConf.label) return [xConf.format(value), name];
- if (name === yConf.label) return [yConf.format(value), name];
- return [value, name];
- }}
- />
+ <Tooltip content={<CustomTooltip />} />
<Legend />
{models.map((model) => (
<Scatter
@@ -113,7 +211,22 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps
name={model}
data={byModel[model]}
fill={MODEL_COLORS[model] || "hsl(213 14% 65%)"}
- />
+ >
+ <ErrorBar
+ dataKey="xErrorRange"
+ direction="x"
+ stroke="hsl(213 14% 45%)"
+ strokeWidth={1}
+ width={4}
+ />
+ <ErrorBar
+ dataKey="yErrorRange"
+ direction="y"
+ stroke="hsl(213 14% 45%)"
+ strokeWidth={1}
+ width={4}
+ />
+ </Scatter>
))}
</ScatterChart>
</ResponsiveContainer>