commit 5fdfb44dee79ae049634182e66638143e9c30d37
parent fb973e79eabeaea219f5e16b068249972f732435
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 7 Apr 2026 17:26:13 +0200
Surprises tab, model selector, shared color palette integration
- Surprises: own page (/surprises) with aggregate stats, type breakdown,
grouped cards. Removed from Insights. Added nav link.
- ModelSelector: shared multi-select toggle component for filtering
models across charts. Integrated into Charts.tsx and ScatterPlot.tsx.
- All components use shared lib/colors.ts palette.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
8 files changed, 618 insertions(+), 50 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -79,9 +79,11 @@ Static Astro site with React islands. SMUI design system (JetBrains Mono, Nord p
Pages:
- **Grid** (`/`): per-task summary, box plots for score distribution, filterable cell/run table with sorting
-- **Insights** (`/insights`): surprise cards, convex hull scatter plots (4 density levels) with model toggles, variability analysis (box plots, reliability ranking, ANOVA decomposition), tornado chart with variance bands, interaction heatmap
+- **Insights** (`/insights`): convex hull scatter plots (4 density levels) with model toggles, variability analysis (box plots, reliability ranking, ANOVA decomposition), tornado chart with variance bands, interaction heatmap
+- **Surprises** (`/surprises`): aggregate surprise stats, breakdown by type (model upsets, prompt upsets, individual outliers), grouped surprise cards with run links
- **Explore** (`/explore`): correlation matrix, efficiency frontier, bump chart, heatmap matrix, radar comparison, treemap
- **Compare** (`/compare`): cell-based aggregate stats with score/cost ranges per axis value
+- **PCA** (`/pca`): principal component analysis scatter plot (PC1/PC2/PC3 selectable axes), model-colored points sized by score, loadings interpretation tables, variance explained bars
- **Run detail** (`/run/{id}` or `/r/{short_id}`): outcome/output score separation, all config pills, SonarQube detail card, 6 detail cards, transcript viewer, artifact iframe, link to cell
- **Cell detail** (`/cell/{id}` or `/c/{short_id}`): run comparison table, artifact gallery, variance stats, agent behavior comparison
- **Methodology** (`/methodology`): scoring framework, DOE design, gameplay bot phases, known limitations
@@ -113,7 +115,7 @@ Short URL IDs: 8-char SHA256 hash for `/r/` and `/c/` routes with redirect pages
## TODO
### Analysis
-- [ ] PCA analysis: add when 100+ runs exist with new scoring. One-hot encode categoricals, identify principal components explaining variance.
+- [x] PCA analysis: `harness/pca-analysis.py` generates `results/analysis/pca.json`, dashboard at `/pca`
- [ ] Pareto frontier analysis: multi-objective optimization (score vs cost, score vs time)
### Eval
diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx
@@ -1,3 +1,4 @@
+import { useState, useMemo } from "react";
import {
ComposedChart,
Bar,
@@ -13,6 +14,7 @@ import {
} from "recharts";
import type { Run } from "../lib/types";
import { getModelColor, modelSortOrder } from "../lib/colors";
+import ModelSelector from "./ModelSelector";
interface ChartsProps {
runs: Run[];
@@ -374,6 +376,17 @@ function TaskBoxTooltipContent({ active, payload, label }: { active?: boolean; p
}
export default function Charts({ runs }: ChartsProps) {
+ // Extract unique models sorted consistently
+ const allModels = useMemo(() => {
+ const models = new Set<string>();
+ for (const run of runs) {
+ models.add(run.meta.actual_model || run.meta.model);
+ }
+ return [...models].sort((a, b) => modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b));
+ }, [runs]);
+
+ const [selectedModels, setSelectedModels] = useState<Set<string>>(() => new Set(allModels));
+
if (runs.length === 0) {
return (
<div className="card" style={{ textAlign: "center", padding: "40px", color: SMUI.muted }}>
@@ -382,8 +395,9 @@ export default function Charts({ runs }: ChartsProps) {
);
}
- const modelData = aggregateByModel(runs);
- const taskData = aggregateByTask(runs);
+ const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model));
+ const modelData = aggregateByModel(filteredRuns);
+ const taskData = aggregateByTask(filteredRuns);
const modelDots = modelScatterData(modelData);
const taskScoreDots = taskScatterData(taskData, "s", SMUI.frost2);
const taskPassDots = taskScatterData(taskData, "p", SMUI.green);
@@ -391,7 +405,14 @@ export default function Charts({ runs }: ChartsProps) {
return (
<div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "16px" }}>
<div className="card">
- <h3 style={{ marginBottom: "16px" }}>Score Distribution by Model</h3>
+ <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}>
+ <h3 style={{ margin: 0 }}>Score Distribution by Model</h3>
+ <ModelSelector
+ allModels={allModels}
+ selectedModels={selectedModels}
+ onChange={setSelectedModels}
+ />
+ </div>
<ResponsiveContainer width="100%" height={270}>
<ComposedChart data={modelData} barCategoryGap="20%">
<CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} />
diff --git a/dashboard/src/components/ModelSelector.tsx b/dashboard/src/components/ModelSelector.tsx
@@ -0,0 +1,89 @@
+import { getModelColor } from "../lib/colors";
+
+interface ModelSelectorProps { // Props for the shared model filter toggle row.
+  allModels: string[]; // Every model to offer; one toggle button is rendered per entry, in this order.
+  selectedModels: Set<string>; // Models currently toggled on; owned by the parent (controlled component).
+  onChange: (models: Set<string>) => void; // Called with a newly built Set after each toggle; the incoming Set is never mutated.
+}
+
+/**
+ * Shared multi-select toggle row for filtering models across charts.
+ *
+ * Renders one bulk "All"/"None" button followed by one toggle per entry of
+ * `allModels`. The component is fully controlled: it never mutates
+ * `selectedModels` and reports every change by passing a new Set to `onChange`.
+ *
+ * Every button declares `type="button"` so it can never submit an enclosing
+ * form, and per-model toggles expose their state through `aria-pressed` so the
+ * selection is not conveyed by colour and opacity alone.
+ */
+export default function ModelSelector({
+  allModels,
+  selectedModels,
+  onChange,
+}: ModelSelectorProps) {
+  const selectedCount = allModels.filter((m) => selectedModels.has(m)).length;
+  const allSelected = allModels.length > 0 && selectedCount === allModels.length;
+  const noneSelected = allModels.length > 0 && selectedCount === 0;
+
+  // Styling shared by the bulk button and the per-model toggles.
+  const baseButtonStyle = {
+    padding: "4px 10px",
+    borderRadius: "0",
+    cursor: "pointer",
+    fontSize: "0.75rem",
+    fontFamily: "var(--font-mono, 'JetBrains Mono', monospace)",
+  } as const;
+
+  const toggleModel = (model: string) => {
+    // Copy first: the parent's Set must stay untouched so React sees a new value.
+    const next = new Set(selectedModels);
+    if (next.has(model)) {
+      next.delete(model);
+    } else {
+      next.add(model);
+    }
+    onChange(next);
+  };
+
+  // Clears the selection when everything is on, otherwise selects everything.
+  const toggleAll = () => {
+    onChange(allSelected ? new Set() : new Set(allModels));
+  };
+
+  return (
+    <div
+      role="group"
+      aria-label="Filter by model"
+      style={{
+        display: "flex",
+        gap: "8px",
+        justifyContent: "center",
+        flexWrap: "wrap",
+      }}
+    >
+      <button
+        type="button"
+        onClick={toggleAll}
+        style={{
+          ...baseButtonStyle,
+          border: `1px solid var(--border, hsl(217 17% 28%))`,
+          background: allSelected
+            ? "rgba(255, 255, 255, 0.08)"
+            : "transparent",
+          color: allSelected
+            ? "var(--text-primary, hsl(213 14% 80%))"
+            : "var(--text-muted, hsl(213 14% 55%))",
+          opacity: noneSelected ? 0.4 : 1,
+        }}
+      >
+        {allSelected ? "None" : "All"}
+      </button>
+      {allModels.map((model) => {
+        const color = getModelColor(model);
+        const active = selectedModels.has(model);
+        return (
+          <button
+            key={model}
+            type="button"
+            aria-pressed={active}
+            onClick={() => toggleModel(model)}
+            style={{
+              ...baseButtonStyle,
+              border: `1px solid ${color}`,
+              // Appends a two-digit alpha suffix; presumably getModelColor returns #rrggbb - confirm.
+              background: active ? `${color}22` : "transparent",
+              color: active ? color : "var(--text-muted, hsl(213 14% 55%))",
+              opacity: active ? 1 : 0.4,
+            }}
+          >
+            {model}
+          </button>
+        );
+      })}
+    </div>
+  );
+}
diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx
@@ -12,6 +12,7 @@ import {
import { useXAxisScale, useYAxisScale } from "recharts";
import type { Run } from "../lib/types";
import { groupIntoCells } from "../lib/analysis";
+import ModelSelector from "./ModelSelector";
interface ScatterPlotProps {
runs: Run[];
@@ -392,17 +393,8 @@ export default function ScatterPlot({
// Initialize visibleModels on first render or when models change
const effectiveVisible = visibleModels ?? new Set(allModels);
- const toggleModel = (model: string) => {
- setVisibleModels((prev) => {
- const current = prev ?? new Set(allModels);
- const next = new Set(current);
- if (next.has(model)) {
- next.delete(model);
- } else {
- next.add(model);
- }
- return next;
- });
+ const handleModelChange = (models: Set<string>) => {
+ setVisibleModels(models);
};
// Compute regions from ALL data (for stable axis domains)
@@ -468,34 +460,12 @@ export default function ScatterPlot({
</div>
{/* Model toggles */}
- <div
- style={{
- display: "flex",
- gap: "8px",
- justifyContent: "center",
- marginBottom: "12px",
- flexWrap: "wrap",
- }}
- >
- {allModels.map((model) => (
- <button
- key={model}
- onClick={() => toggleModel(model)}
- style={{
- padding: "4px 10px",
- borderRadius: "0",
- border: `1px solid ${fallbackColor(model)}`,
- background: effectiveVisible.has(model) ? `${fallbackColor(model)}22` : "transparent",
- color: effectiveVisible.has(model) ? fallbackColor(model) : "var(--text-muted, hsl(213 14% 55%))",
- opacity: effectiveVisible.has(model) ? 1 : 0.4,
- cursor: "pointer",
- fontSize: "0.75rem",
- fontFamily: "var(--font-mono, 'JetBrains Mono', monospace)",
- }}
- >
- {model}
- </button>
- ))}
+ <div style={{ marginBottom: "12px" }}>
+ <ModelSelector
+ allModels={allModels}
+ selectedModels={effectiveVisible}
+ onChange={handleModelChange}
+ />
</div>
{hovered && <CentroidTooltip data={hovered} />}
diff --git a/dashboard/src/components/SurprisesPage.tsx b/dashboard/src/components/SurprisesPage.tsx
@@ -0,0 +1,473 @@
+import { useState, useMemo } from "react";
+import type { Run } from "../lib/types";
+
+interface SurprisesPageProps { // Props for the Surprises page component.
+  runs: Run[]; // All runs to scan; runs without eval_results.score are ignored by findSurprises.
+}
+
+interface RunRef { // Flattened per-run summary attached to a Surprise and listed in the expanded card.
+  run_id: string; // Copied from run.meta.run_id; used as the React key and as the link fallback.
+  short_id?: string; // Copied from run.meta.short_id; preferred over run_id when building the /r/ link.
+  model: string; // Copied from run.meta.model.
+  score: number; // Copied from eval_results.score; rendered as score * 100 with a % sign.
+  cost: number; // claude_output.total_cost_usd, or 0 when that field is missing.
+  config: Record<string, string>; // Every CONFIG_KEYS entry read from run.meta and passed through String().
+}
+
+interface Surprise { // One detected surprise; produced by findSurprises and rendered by SurpriseCard.
+  title: string; // Short headline, e.g. "<model> beat <model>".
+  detail: string; // One-sentence explanation with both scores as percentages.
+  category: "model_upset" | "prompt_upset" | "individual_outlier"; // Which detector produced it; drives grouping, label and colour.
+  weaker: { model: string; config: string; score: number; cost: number }; // The nominally weaker side, i.e. the one that scored higher (for individual_outlier: the haiku average used as baseline).
+  stronger: { model: string; config: string; score: number; cost: number }; // The nominally stronger side, i.e. the one that scored lower.
+  magnitude: number; // Score gap between the two sides in raw score units; the list is sorted by this, largest first.
+  runs: RunRef[]; // Every run that took part in the comparison.
+  configDiffs: string[]; // Human-readable "key: value" lines shown in the expanded card.
+  /** Which config axis is the primary differentiator */
+  primaryAxis: string;
+}
+
+const MODEL_RANK: Record<string, number> = { // Expected capability order (higher = stronger); a lower rank outscoring a higher rank is what findSurprises reports as an upset.
+  haiku: 1,
+  sonnet: 2,
+  opus: 3,
+};
+
+const CONFIG_KEYS = [ // run.meta fields treated as the configuration: used to group runs, to diff two groups, and to build RunRef.config.
+  "prompt_style", "language", "effort", "human_language",
+  "linter", "playwright", "context_file",
+  "web_search", "max_budget", "tool_read", "tool_write",
+  "tool_edit", "tool_glob", "tool_grep",
+  "tests_provided", "strategy", "design_guidance", "architecture",
+  "error_checking", "context_noise", "renderer",
+];
+
+/**
+ * Lists the CONFIG_KEYS on which two groups of runs disagree.
+ *
+ * Only the first run of each group is inspected. Values are compared after
+ * String() conversion and each mismatch is reported as "key: a vs b".
+ *
+ * @returns The mismatch lines in CONFIG_KEYS order; empty when either group
+ *          has no first run or that run has no meta.
+ */
+function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] {
+  const left = runsA[0]?.meta as Record<string, unknown> | undefined;
+  const right = runsB[0]?.meta as Record<string, unknown> | undefined;
+  if (!left || !right) return [];
+
+  return CONFIG_KEYS.flatMap((key) => {
+    const a = String(left[key]);
+    const b = String(right[key]);
+    return a === b ? [] : [`${key}: ${a} vs ${b}`];
+  });
+}
+
+/** Lead, in raw score units, that a lower-ranked model's average needs over a higher-ranked one. */
+const MODEL_UPSET_MARGIN = 0.02;
+/** How far below the haiku average a single higher-ranked run must score to be flagged. */
+const OUTLIER_MARGIN = 0.15;
+/** Lead, in raw score units, that the simple prompt's average needs over the detailed prompt's. */
+const PROMPT_UPSET_MARGIN = 0.05;
+/** Config values left out of an outlier's config list. */
+const UNLISTED_CONFIG_VALUES = new Set(["on", "typescript", "en", "high", "simple"]);
+
+/** Arithmetic mean, or 0 for an empty list. */
+function mean(values: number[]): number {
+  if (values.length === 0) return 0;
+  return values.reduce((sum, v) => sum + v, 0) / values.length;
+}
+
+/** Score of a run; 0 when absent (callers only pass runs that have a score). */
+function scoreOf(run: Run): number {
+  return run.eval_results?.score ?? 0;
+}
+
+/** Cost of a run in USD; 0 when the run carries no cost information. */
+function costOf(run: Run): number {
+  return run.claude_output?.total_cost_usd ?? 0;
+}
+
+/** Reads one config axis from run.meta as a display string. */
+function configValue(run: Run, key: string): string {
+  return String((run.meta as Record<string, unknown>)[key]);
+}
+
+/** Builds the flattened per-run summary stored on a Surprise. */
+function toRunRef(run: Run): RunRef {
+  return {
+    run_id: run.meta.run_id,
+    short_id: run.meta.short_id,
+    model: run.meta.model,
+    score: scoreOf(run),
+    cost: costOf(run),
+    config: Object.fromEntries(CONFIG_KEYS.map((k) => [k, configValue(run, k)])),
+  };
+}
+
+/** Buckets runs by a string key, keeping first-seen order of keys and of runs. */
+function groupRuns(runs: Run[], keyOf: (run: Run) => string): Map<string, Run[]> {
+  // A Map, not a plain object, so keys such as "constructor" cannot hit inherited properties.
+  const groups = new Map<string, Run[]>();
+  for (const run of runs) {
+    const key = keyOf(run);
+    const bucket = groups.get(key);
+    if (bucket) {
+      bucket.push(run);
+    } else {
+      groups.set(key, [run]);
+    }
+  }
+  return groups;
+}
+
+/**
+ * Scans runs for results that contradict the expected ordering.
+ *
+ * Three detectors run in sequence; runs without a score are ignored by all:
+ *  1. Model upsets: within runs sharing every CONFIG_KEYS value, a model with a
+ *     lower MODEL_RANK averages more than MODEL_UPSET_MARGIN above a higher one.
+ *     Models missing from MODEL_RANK are skipped, because without a known rank
+ *     there is no expectation to upset.
+ *  2. Individual outliers: a single run of a model ranked above haiku scores
+ *     more than OUTLIER_MARGIN below the average of all haiku runs. Skipped
+ *     when there are no haiku runs to compare against.
+ *  3. Prompt upsets: within runs sharing model, language, effort, linter,
+ *     playwright and context_file, the "simple" prompt averages more than
+ *     PROMPT_UPSET_MARGIN above the "detailed" prompt.
+ *
+ * @param runs All runs to analyse.
+ * @returns Every surprise found, sorted by magnitude, largest first.
+ */
+function findSurprises(runs: Run[]): Surprise[] {
+  const surprises: Surprise[] = [];
+  const scored = runs.filter((run) => run.eval_results?.score != null);
+
+  // 1. Model upsets: compare models inside groups of otherwise identical config.
+  const configGroups = groupRuns(scored, (run) =>
+    CONFIG_KEYS.map((k) => (run.meta as Record<string, unknown>)[k]).join("|"),
+  );
+  for (const group of configGroups.values()) {
+    const byModel = groupRuns(group, (run) => run.meta.model);
+    const models = [...byModel.keys()];
+
+    for (let i = 0; i < models.length; i++) {
+      for (let j = i + 1; j < models.length; j++) {
+        const first = models[i];
+        const second = models[j];
+        const rankFirst = MODEL_RANK[first];
+        const rankSecond = MODEL_RANK[second];
+        // An unranked model carries no expectation, so it cannot produce an upset.
+        if (typeof rankFirst !== "number" || typeof rankSecond !== "number") continue;
+        if (rankFirst === rankSecond) continue;
+
+        const [underdog, favorite] = rankFirst < rankSecond ? [first, second] : [second, first];
+        const underdogRuns = byModel.get(underdog) ?? [];
+        const favoriteRuns = byModel.get(favorite) ?? [];
+        const avgUnderdog = mean(underdogRuns.map(scoreOf));
+        const avgFavorite = mean(favoriteRuns.map(scoreOf));
+        if (!(avgUnderdog > avgFavorite + MODEL_UPSET_MARGIN)) continue;
+
+        const promptStyle = group[0].meta.prompt_style;
+        surprises.push({
+          title: `${underdog} beat ${favorite}`,
+          detail: `${underdog} scored ${(avgUnderdog * 100).toFixed(0)}% vs ${favorite} at ${(avgFavorite * 100).toFixed(0)}%`,
+          category: "model_upset",
+          weaker: { model: underdog, config: promptStyle, score: avgUnderdog, cost: mean(underdogRuns.map(costOf)) },
+          stronger: { model: favorite, config: promptStyle, score: avgFavorite, cost: mean(favoriteRuns.map(costOf)) },
+          magnitude: avgUnderdog - avgFavorite,
+          // Runs stay in first-seen model order, independent of which side won.
+          runs: [...(byModel.get(first) ?? []), ...(byModel.get(second) ?? [])].map(toRunRef),
+          configDiffs: getConfigDiffs(underdogRuns, favoriteRuns),
+          primaryAxis: "model",
+        });
+      }
+    }
+  }
+
+  // 2. Individual outliers: single runs of higher-ranked models far below the haiku average.
+  const haikuScores = scored.filter((run) => run.meta.model === "haiku").map(scoreOf);
+  if (haikuScores.length > 0) {
+    const haikuMean = mean(haikuScores);
+    for (const run of scored) {
+      const model = run.meta.model;
+      const rank = MODEL_RANK[model];
+      const score = scoreOf(run);
+      if (typeof rank !== "number" || rank <= 1) continue;
+      if (!(score < haikuMean - OUTLIER_MARGIN)) continue;
+
+      surprises.push({
+        title: `${model} run scored far below haiku avg`,
+        detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`,
+        category: "individual_outlier",
+        weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 },
+        stronger: { model, config: run.meta.prompt_style, score, cost: costOf(run) },
+        magnitude: haikuMean - score,
+        runs: [toRunRef(run)],
+        configDiffs: CONFIG_KEYS
+          .filter((k) => !UNLISTED_CONFIG_VALUES.has(configValue(run, k)))
+          .map((k) => `${k}: ${configValue(run, k)}`),
+        primaryAxis: "model",
+      });
+    }
+  }
+
+  // 3. Prompt upsets: the simple prompt beating the detailed one for the same model and setup.
+  const promptGroups = groupRuns(scored, (run) => {
+    const m = run.meta;
+    return [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|");
+  });
+  for (const group of promptGroups.values()) {
+    const byPrompt = groupRuns(group, (run) => run.meta.prompt_style);
+    const simpleRuns = byPrompt.get("simple");
+    const detailedRuns = byPrompt.get("detailed");
+    if (!simpleRuns || !detailedRuns) continue;
+
+    const avgSimple = mean(simpleRuns.map(scoreOf));
+    const avgDetailed = mean(detailedRuns.map(scoreOf));
+    if (!(avgSimple > avgDetailed + PROMPT_UPSET_MARGIN)) continue;
+
+    const model = group[0].meta.model;
+    surprises.push({
+      title: "Simple prompt beat detailed",
+      detail: `${model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`,
+      category: "prompt_upset",
+      // Average costs are reported here as they are for model upsets.
+      weaker: { model, config: "simple", score: avgSimple, cost: mean(simpleRuns.map(costOf)) },
+      stronger: { model, config: "detailed", score: avgDetailed, cost: mean(detailedRuns.map(costOf)) },
+      magnitude: avgSimple - avgDetailed,
+      runs: [...simpleRuns, ...detailedRuns].map(toRunRef),
+      configDiffs: ["prompt_style: simple vs detailed"],
+      primaryAxis: "prompt_style",
+    });
+  }
+
+  return surprises.sort((a, b) => b.magnitude - a.magnitude);
+}
+
+const CATEGORY_LABELS: Record<string, string> = { // Section headings and breakdown labels, keyed by Surprise.category.
+  model_upset: "Model upsets",
+  prompt_upset: "Prompt upsets",
+  individual_outlier: "Individual outliers",
+};
+
+const CATEGORY_DESCRIPTIONS: Record<string, string> = { // One-line explanation rendered under each category heading.
+  model_upset: "A cheaper/weaker model outperformed a more capable one under the same configuration.",
+  prompt_upset: "A simpler prompt style beat a more detailed one, suggesting diminishing returns from verbosity.",
+  individual_outlier: "A single run from a stronger model scored far below the weaker model's average.",
+};
+
+const CATEGORY_COLORS: Record<string, string> = { // Accent per category (card left border, breakdown swatch); the CSS variables are presumably defined by the site theme - confirm.
+  model_upset: "var(--yellow)",
+  prompt_upset: "var(--accent)",
+  individual_outlier: "var(--red)",
+};
+
+/** Shared look of the small metadata pills (magnitude, run count). */
+const PILL_STYLE = {
+  fontSize: "10px",
+  padding: "2px 6px",
+  borderRadius: "3px",
+  background: "hsl(var(--muted))",
+  color: "hsl(var(--muted-foreground))",
+  fontFamily: "var(--font-mono)",
+} as const;
+
+/**
+ * Expandable card summarising one surprise.
+ *
+ * Collapsed, it shows the title, detail line, both sides with their scores,
+ * the magnitude in percentage points and the run count. Expanded, it adds the
+ * config differences and one row per run with a link to the run page.
+ *
+ * The two sides are labelled by the axis that actually differs. Model
+ * comparisons use the model name; any other axis (e.g. prompt_style) uses the
+ * config value, because both sides share one model there and labelling by
+ * model would make every run look like it belongs to the winning side.
+ *
+ * The card can be toggled by mouse or by keyboard (Enter / Space).
+ */
+function SurpriseCard({ surprise }: { surprise: Surprise }) {
+  const [expanded, setExpanded] = useState(false);
+  const toggle = () => setExpanded((open) => !open);
+
+  const comparesModels = surprise.primaryAxis === "model";
+  const sideLabel = (side: Surprise["weaker"]) => (comparesModels ? side.model : side.config);
+  const runLabel = (r: RunRef) =>
+    comparesModels ? r.model : (r.config[surprise.primaryAxis] ?? r.model);
+  const winnerLabel = sideLabel(surprise.weaker);
+
+  const handleKeyDown = (e: {
+    key: string;
+    target: EventTarget;
+    currentTarget: EventTarget;
+    preventDefault(): void;
+  }) => {
+    // Ignore keys pressed on nested controls (the "view" links) so Enter follows the link only.
+    if (e.target !== e.currentTarget) return;
+    if (e.key === "Enter" || e.key === " ") {
+      e.preventDefault();
+      toggle();
+    }
+  };
+
+  return (
+    <div
+      className="card"
+      role="button"
+      tabIndex={0}
+      aria-expanded={expanded}
+      style={{
+        padding: "14px",
+        borderLeft: `3px solid ${CATEGORY_COLORS[surprise.category] || "var(--yellow)"}`,
+        cursor: "pointer",
+      }}
+      onClick={toggle}
+      onKeyDown={handleKeyDown}
+    >
+      <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+        {surprise.title}
+      </div>
+      <div style={{ fontSize: "11px", color: "var(--text-muted)", marginBottom: "8px" }}>
+        {surprise.detail}
+      </div>
+      <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}>
+        <div>
+          <span style={{ color: "var(--green)" }}>{sideLabel(surprise.weaker)}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.weaker.score * 100).toFixed(0)}%
+          </span>
+        </div>
+        <div style={{ color: "var(--text-muted)" }}>vs</div>
+        <div>
+          <span style={{ color: "var(--red)" }}>{sideLabel(surprise.stronger)}</span>
+          <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}>
+            {(surprise.stronger.score * 100).toFixed(0)}%
+          </span>
+        </div>
+      </div>
+
+      <div style={{ display: "flex", gap: "8px", marginTop: "8px", flexWrap: "wrap" }}>
+        <span style={PILL_STYLE}>
+          +{(surprise.magnitude * 100).toFixed(0)}pp
+        </span>
+        <span style={PILL_STYLE}>
+          {surprise.runs.length} run{surprise.runs.length !== 1 ? "s" : ""}
+        </span>
+      </div>
+
+      {expanded && (
+        <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}>
+          {surprise.configDiffs.length > 0 && (
+            <div style={{ marginBottom: "8px" }}>
+              <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div>
+              {surprise.configDiffs.map((diff, i) => (
+                <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div>
+              ))}
+            </div>
+          )}
+
+          <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>
+            Runs ({surprise.runs.length})
+          </div>
+          {surprise.runs.map((r) => {
+            const label = runLabel(r);
+            return (
+              <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}>
+                <span style={{ color: label === winnerLabel ? "var(--green)" : "var(--red)", minWidth: "50px" }}>
+                  {label}
+                </span>
+                <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}>
+                  {(r.score * 100).toFixed(0)}%
+                </span>
+                <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
+                  ${r.cost.toFixed(2)}
+                </span>
+                {/* Stop the click from bubbling so following the link does not also toggle the card. */}
+                <a href={`/r/${r.short_id || r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }} onClick={e => e.stopPropagation()}>
+                  view
+                </a>
+              </div>
+            );
+          })}
+        </div>
+      )}
+    </div>
+  );
+}
+
+/**
+ * Surprises page body: intro text, headline stats, breakdown by category and
+ * axis, then the surprise cards grouped by category.
+ *
+ * Renders a single placeholder card when no surprises are found.
+ */
+export default function SurprisesPage({ runs }: SurprisesPageProps) {
+  const surprises = useMemo(() => findSurprises(runs), [runs]);
+
+  // All aggregates come from one pass over the surprise list.
+  const { byCategory, axisCounts, avgMagnitude, maxMagnitude } = useMemo(() => {
+    const grouped: Record<string, Surprise[]> = {};
+    const perAxis: Record<string, number> = {};
+    let total = 0;
+    let largest = -Infinity;
+
+    for (const item of surprises) {
+      (grouped[item.category] ??= []).push(item);
+      perAxis[item.primaryAxis] = (perAxis[item.primaryAxis] || 0) + 1;
+      total += item.magnitude;
+      largest = Math.max(largest, item.magnitude);
+    }
+
+    const hasAny = surprises.length > 0;
+    return {
+      byCategory: grouped,
+      // Most frequent axis first.
+      axisCounts: Object.entries(perAxis).sort((x, y) => y[1] - x[1]),
+      avgMagnitude: hasAny ? total / surprises.length : 0,
+      maxMagnitude: hasAny ? largest : 0,
+    };
+  }, [surprises]);
+
+  if (surprises.length === 0) {
+    return (
+      <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}>
+        No surprises yet. Run more experiments with different models to find upsets.
+      </div>
+    );
+  }
+
+  // Fixed display order; categories without any surprise are left out.
+  const orderedCategories = ["model_upset", "prompt_upset", "individual_outlier"].filter(
+    (category) => byCategory[category]?.length,
+  );
+
+  const statTiles = [
+    { label: "Total surprises", value: <>{surprises.length}</> },
+    { label: "Avg magnitude", value: <>{(avgMagnitude * 100).toFixed(0)}pp</> },
+    { label: "Largest upset", value: <>{(maxMagnitude * 100).toFixed(0)}pp</> },
+    { label: "Most surprising axis", value: <>{axisCounts[0]?.[0] || "--"}</> },
+  ];
+
+  return (
+    <div>
+      {/* Explanation */}
+      <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
+        <p style={{ fontSize: "12px", color: "var(--text-muted)", margin: 0, lineHeight: "1.6" }}>
+          A "surprise" is a result that defies expectations: a weaker or cheaper model outperforming a stronger one,
+          or a simpler configuration beating a more elaborate one. These findings highlight where conventional assumptions
+          about model capability and configuration complexity break down. Click any card to see the runs involved.
+        </p>
+      </div>
+
+      {/* Summary stats */}
+      <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(180px, 1fr))", gap: "12px", marginBottom: "24px" }}>
+        {statTiles.map((tile) => (
+          <div key={tile.label} className="stat-card">
+            <div className="stat-value">{tile.value}</div>
+            <div className="stat-label">{tile.label}</div>
+          </div>
+        ))}
+      </div>
+
+      {/* Breakdown by type */}
+      <div className="card" style={{ padding: "16px", marginBottom: "24px" }}>
+        <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "12px" }}>
+          Breakdown by type
+        </div>
+        <div style={{ display: "flex", gap: "24px", flexWrap: "wrap" }}>
+          {orderedCategories.map((category) => (
+            <div key={category} style={{ display: "flex", alignItems: "baseline", gap: "8px" }}>
+              <span style={{
+                width: "8px",
+                height: "8px",
+                borderRadius: "2px",
+                background: CATEGORY_COLORS[category],
+                display: "inline-block",
+                flexShrink: 0,
+                position: "relative",
+                top: "-1px",
+              }} />
+              <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "14px" }}>
+                {byCategory[category]?.length || 0}
+              </span>
+              <span style={{ fontSize: "11px", color: "var(--text-muted)" }}>
+                {CATEGORY_LABELS[category]}
+              </span>
+            </div>
+          ))}
+        </div>
+        {/* The per-axis split is only shown when more than one axis occurs. */}
+        {axisCounts.length > 1 && (
+          <div style={{ marginTop: "12px", paddingTop: "12px", borderTop: "1px solid var(--border)" }}>
+            <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "8px" }}>
+              Surprises by axis
+            </div>
+            <div style={{ display: "flex", gap: "16px", flexWrap: "wrap" }}>
+              {axisCounts.map(([axis, count]) => (
+                <div key={axis} style={{ display: "flex", alignItems: "baseline", gap: "6px" }}>
+                  <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "13px" }}>
+                    {count}
+                  </span>
+                  <span style={{ fontSize: "11px", color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}>
+                    {axis}
+                  </span>
+                </div>
+              ))}
+            </div>
+          </div>
+        )}
+      </div>
+
+      {/* Grouped surprise cards */}
+      {orderedCategories.map((category) => (
+        <div key={category} style={{ marginBottom: "32px" }}>
+          <h3 style={{ marginBottom: "4px" }}>{CATEGORY_LABELS[category]}</h3>
+          <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}>
+            {CATEGORY_DESCRIPTIONS[category]}
+          </p>
+          <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}>
+            {(byCategory[category] ?? []).map((entry, index) => (
+              <SurpriseCard key={index} surprise={entry} />
+            ))}
+          </div>
+        </div>
+      ))}
+    </div>
+  );
+}
diff --git a/dashboard/src/layouts/Base.astro b/dashboard/src/layouts/Base.astro
@@ -47,8 +47,10 @@ try {
<nav style="display: flex; gap: 16px; font-size: 0.875rem; align-items: center;">
<a href="/">Grid</a>
<a href="/insights">Insights</a>
+ <a href="/surprises">Surprises</a>
<a href="/explore">Explore</a>
<a href="/compare">Compare</a>
+ <a href="/pca">PCA</a>
<span style="border-left: 1px solid hsl(var(--border)); height: 16px;"></span>
<a href="/methodology">Methodology</a>
</nav>
diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro
@@ -3,7 +3,6 @@ import Base from "../layouts/Base.astro";
import { loadAllRuns } from "../lib/data";
import Insights from "../components/Insights";
import ScatterPlot from "../components/ScatterPlot";
-import Surprises from "../components/Surprises";
import Variability from "../components/Variability";
const runs = loadAllRuns();
@@ -15,11 +14,7 @@ const runs = loadAllRuns();
Which variables move the needle? Where do weaker configs win? How consistent are the results?
</p>
- <Surprises client:load runs={runs} />
-
- <div style="margin-top: 32px;">
- <Variability client:load runs={runs} />
- </div>
+ <Variability client:load runs={runs} />
<div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
<ScatterPlot client:load runs={runs} defaultX="cost" defaultY="outcome" />
diff --git a/dashboard/src/pages/surprises.astro b/dashboard/src/pages/surprises.astro
@@ -0,0 +1,16 @@
+---
+import Base from "../layouts/Base.astro";
+import { loadAllRuns } from "../lib/data";
+import SurprisesPage from "../components/SurprisesPage";
+
+const runs = loadAllRuns(); // Evaluated in the page frontmatter; the result is handed to the SurprisesPage island below as a prop.
+---
+
+<Base title="Surprises">
+  <h1 style="margin-bottom: 8px;">Surprises</h1>
+  <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;">
+    Where weaker configs outperformed stronger ones, and conventional assumptions broke down.
+  </p>
+
+  <SurprisesPage client:load runs={runs} />
+</Base>