loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 5fdfb44dee79ae049634182e66638143e9c30d37
parent fb973e79eabeaea219f5e16b068249972f732435
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 17:26:13 +0200

Surprises tab, model selector, shared color palette integration

- Surprises: own page (/surprises) with aggregate stats, type breakdown,
  grouped cards. Removed from Insights. Added nav link.
- ModelSelector: shared multi-select toggle component for filtering
  models across charts. Integrated into Charts.tsx and ScatterPlot.tsx.
- All components use shared lib/colors.ts palette.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M CLAUDE.md | 6 ++++--
M dashboard/src/components/Charts.tsx | 27 ++++++++++++++++++++++++---
A dashboard/src/components/ModelSelector.tsx | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/components/ScatterPlot.tsx | 48 +++++++++---------------------------------------
A dashboard/src/components/SurprisesPage.tsx | 473 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/layouts/Base.astro | 2 ++
M dashboard/src/pages/insights.astro | 7 +------
A dashboard/src/pages/surprises.astro | 16 ++++++++++++++++
8 files changed, 618 insertions(+), 50 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md @@ -79,9 +79,11 @@ Static Astro site with React islands. SMUI design system (JetBrains Mono, Nord p Pages: - **Grid** (`/`): per-task summary, box plots for score distribution, filterable cell/run table with sorting -- **Insights** (`/insights`): surprise cards, convex hull scatter plots (4 density levels) with model toggles, variability analysis (box plots, reliability ranking, ANOVA decomposition), tornado chart with variance bands, interaction heatmap +- **Insights** (`/insights`): convex hull scatter plots (4 density levels) with model toggles, variability analysis (box plots, reliability ranking, ANOVA decomposition), tornado chart with variance bands, interaction heatmap +- **Surprises** (`/surprises`): aggregate surprise stats, breakdown by type (model upsets, prompt upsets, individual outliers), grouped surprise cards with run links - **Explore** (`/explore`): correlation matrix, efficiency frontier, bump chart, heatmap matrix, radar comparison, treemap - **Compare** (`/compare`): cell-based aggregate stats with score/cost ranges per axis value +- **PCA** (`/pca`): principal component analysis scatter plot (PC1/PC2/PC3 selectable axes), model-colored points sized by score, loadings interpretation tables, variance explained bars - **Run detail** (`/run/{id}` or `/r/{short_id}`): outcome/output score separation, all config pills, SonarQube detail card, 6 detail cards, transcript viewer, artifact iframe, link to cell - **Cell detail** (`/cell/{id}` or `/c/{short_id}`): run comparison table, artifact gallery, variance stats, agent behavior comparison - **Methodology** (`/methodology`): scoring framework, DOE design, gameplay bot phases, known limitations @@ -113,7 +115,7 @@ Short URL IDs: 8-char SHA256 hash for `/r/` and `/c/` routes with redirect pages ## TODO ### Analysis -- [ ] PCA analysis: add when 100+ runs exist with new scoring. One-hot encode categoricals, identify principal components explaining variance. 
+- [x] PCA analysis: `harness/pca-analysis.py` generates `results/analysis/pca.json`, dashboard at `/pca` - [ ] Pareto frontier analysis: multi-objective optimization (score vs cost, score vs time) ### Eval diff --git a/dashboard/src/components/Charts.tsx b/dashboard/src/components/Charts.tsx @@ -1,3 +1,4 @@ +import { useState, useMemo } from "react"; import { ComposedChart, Bar, @@ -13,6 +14,7 @@ import { } from "recharts"; import type { Run } from "../lib/types"; import { getModelColor, modelSortOrder } from "../lib/colors"; +import ModelSelector from "./ModelSelector"; interface ChartsProps { runs: Run[]; @@ -374,6 +376,17 @@ function TaskBoxTooltipContent({ active, payload, label }: { active?: boolean; p } export default function Charts({ runs }: ChartsProps) { + // Extract unique models sorted consistently + const allModels = useMemo(() => { + const models = new Set<string>(); + for (const run of runs) { + models.add(run.meta.actual_model || run.meta.model); + } + return [...models].sort((a, b) => modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b)); + }, [runs]); + + const [selectedModels, setSelectedModels] = useState<Set<string>>(() => new Set(allModels)); + if (runs.length === 0) { return ( <div className="card" style={{ textAlign: "center", padding: "40px", color: SMUI.muted }}> @@ -382,8 +395,9 @@ export default function Charts({ runs }: ChartsProps) { ); } - const modelData = aggregateByModel(runs); - const taskData = aggregateByTask(runs); + const filteredRuns = runs.filter((r) => selectedModels.has(r.meta.actual_model || r.meta.model)); + const modelData = aggregateByModel(filteredRuns); + const taskData = aggregateByTask(filteredRuns); const modelDots = modelScatterData(modelData); const taskScoreDots = taskScatterData(taskData, "s", SMUI.frost2); const taskPassDots = taskScatterData(taskData, "p", SMUI.green); @@ -391,7 +405,14 @@ export default function Charts({ runs }: ChartsProps) { return ( <div style={{ display: "grid", 
gridTemplateColumns: "1fr 1fr", gap: "16px" }}> <div className="card"> - <h3 style={{ marginBottom: "16px" }}>Score Distribution by Model</h3> + <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}> + <h3 style={{ margin: 0 }}>Score Distribution by Model</h3> + <ModelSelector + allModels={allModels} + selectedModels={selectedModels} + onChange={setSelectedModels} + /> + </div> <ResponsiveContainer width="100%" height={270}> <ComposedChart data={modelData} barCategoryGap="20%"> <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} /> diff --git a/dashboard/src/components/ModelSelector.tsx b/dashboard/src/components/ModelSelector.tsx @@ -0,0 +1,89 @@ +import { getModelColor } from "../lib/colors"; + +interface ModelSelectorProps { + allModels: string[]; + selectedModels: Set<string>; + onChange: (models: Set<string>) => void; +} + +export default function ModelSelector({ + allModels, + selectedModels, + onChange, +}: ModelSelectorProps) { + const allSelected = allModels.length > 0 && allModels.every((m) => selectedModels.has(m)); + const noneSelected = allModels.length > 0 && allModels.every((m) => !selectedModels.has(m)); + + const toggleModel = (model: string) => { + const next = new Set(selectedModels); + if (next.has(model)) { + next.delete(model); + } else { + next.add(model); + } + onChange(next); + }; + + const toggleAll = () => { + if (allSelected) { + onChange(new Set()); + } else { + onChange(new Set(allModels)); + } + }; + + return ( + <div + style={{ + display: "flex", + gap: "8px", + justifyContent: "center", + flexWrap: "wrap", + }} + > + <button + onClick={toggleAll} + style={{ + padding: "4px 10px", + borderRadius: "0", + border: `1px solid var(--border, hsl(217 17% 28%))`, + background: allSelected + ? "rgba(255, 255, 255, 0.08)" + : "transparent", + color: allSelected + ? 
"var(--text-primary, hsl(213 14% 80%))" + : "var(--text-muted, hsl(213 14% 55%))", + opacity: noneSelected ? 0.4 : 1, + cursor: "pointer", + fontSize: "0.75rem", + fontFamily: "var(--font-mono, 'JetBrains Mono', monospace)", + }} + > + {allSelected ? "None" : "All"} + </button> + {allModels.map((model) => { + const color = getModelColor(model); + const active = selectedModels.has(model); + return ( + <button + key={model} + onClick={() => toggleModel(model)} + style={{ + padding: "4px 10px", + borderRadius: "0", + border: `1px solid ${color}`, + background: active ? `${color}22` : "transparent", + color: active ? color : "var(--text-muted, hsl(213 14% 55%))", + opacity: active ? 1 : 0.4, + cursor: "pointer", + fontSize: "0.75rem", + fontFamily: "var(--font-mono, 'JetBrains Mono', monospace)", + }} + > + {model} + </button> + ); + })} + </div> + ); +} diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx @@ -12,6 +12,7 @@ import { import { useXAxisScale, useYAxisScale } from "recharts"; import type { Run } from "../lib/types"; import { groupIntoCells } from "../lib/analysis"; +import ModelSelector from "./ModelSelector"; interface ScatterPlotProps { runs: Run[]; @@ -392,17 +393,8 @@ export default function ScatterPlot({ // Initialize visibleModels on first render or when models change const effectiveVisible = visibleModels ?? new Set(allModels); - const toggleModel = (model: string) => { - setVisibleModels((prev) => { - const current = prev ?? 
new Set(allModels); - const next = new Set(current); - if (next.has(model)) { - next.delete(model); - } else { - next.add(model); - } - return next; - }); + const handleModelChange = (models: Set<string>) => { + setVisibleModels(models); }; // Compute regions from ALL data (for stable axis domains) @@ -468,34 +460,12 @@ export default function ScatterPlot({ </div> {/* Model toggles */} - <div - style={{ - display: "flex", - gap: "8px", - justifyContent: "center", - marginBottom: "12px", - flexWrap: "wrap", - }} - > - {allModels.map((model) => ( - <button - key={model} - onClick={() => toggleModel(model)} - style={{ - padding: "4px 10px", - borderRadius: "0", - border: `1px solid ${fallbackColor(model)}`, - background: effectiveVisible.has(model) ? `${fallbackColor(model)}22` : "transparent", - color: effectiveVisible.has(model) ? fallbackColor(model) : "var(--text-muted, hsl(213 14% 55%))", - opacity: effectiveVisible.has(model) ? 1 : 0.4, - cursor: "pointer", - fontSize: "0.75rem", - fontFamily: "var(--font-mono, 'JetBrains Mono', monospace)", - }} - > - {model} - </button> - ))} + <div style={{ marginBottom: "12px" }}> + <ModelSelector + allModels={allModels} + selectedModels={effectiveVisible} + onChange={handleModelChange} + /> </div> {hovered && <CentroidTooltip data={hovered} />} diff --git a/dashboard/src/components/SurprisesPage.tsx b/dashboard/src/components/SurprisesPage.tsx @@ -0,0 +1,473 @@ +import { useState, useMemo } from "react"; +import type { Run } from "../lib/types"; + +interface SurprisesPageProps { + runs: Run[]; +} + +interface RunRef { + run_id: string; + short_id?: string; + model: string; + score: number; + cost: number; + config: Record<string, string>; +} + +interface Surprise { + title: string; + detail: string; + category: "model_upset" | "prompt_upset" | "individual_outlier"; + weaker: { model: string; config: string; score: number; cost: number }; + stronger: { model: string; config: string; score: number; cost: number }; + 
magnitude: number; + runs: RunRef[]; + configDiffs: string[]; + /** Which config axis is the primary differentiator */ + primaryAxis: string; +} + +const MODEL_RANK: Record<string, number> = { + haiku: 1, + sonnet: 2, + opus: 3, +}; + +const CONFIG_KEYS = [ + "prompt_style", "language", "effort", "human_language", + "linter", "playwright", "context_file", + "web_search", "max_budget", "tool_read", "tool_write", + "tool_edit", "tool_glob", "tool_grep", + "tests_provided", "strategy", "design_guidance", "architecture", + "error_checking", "context_noise", "renderer", +]; + +function getConfigDiffs(runsA: Run[], runsB: Run[]): string[] { + const diffs: string[] = []; + const metaA = runsA[0]?.meta; + const metaB = runsB[0]?.meta; + if (!metaA || !metaB) return diffs; + + for (const key of CONFIG_KEYS) { + const va = String((metaA as Record<string, unknown>)[key]); + const vb = String((metaB as Record<string, unknown>)[key]); + if (va !== vb) { + diffs.push(`${key}: ${va} vs ${vb}`); + } + } + return diffs; +} + +function findSurprises(runs: Run[]): Surprise[] { + const surprises: Surprise[] = []; + + // Group runs by config (everything except model and run number) + const configGroups: Record<string, Run[]> = {}; + for (const run of runs) { + if (run.eval_results?.score == null) continue; + const m = run.meta; + const key = CONFIG_KEYS.map(k => (m as Record<string, unknown>)[k]).join("|"); + (configGroups[key] ??= []).push(run); + } + + // Within each config group, compare models + for (const [, group] of Object.entries(configGroups)) { + const byModel: Record<string, Run[]> = {}; + for (const run of group) { + (byModel[run.meta.model] ??= []).push(run); + } + + const models = Object.keys(byModel); + for (let i = 0; i < models.length; i++) { + for (let j = i + 1; j < models.length; j++) { + const a = models[i]; + const b = models[j]; + const rankA = MODEL_RANK[a] || 0; + const rankB = MODEL_RANK[b] || 0; + + const runsA = byModel[a]; + const runsB = byModel[b]; + 
const scoresA = runsA.map(r => r.eval_results!.score!); + const scoresB = runsB.map(r => r.eval_results!.score!); + const avgA = scoresA.reduce((s, v) => s + v, 0) / scoresA.length; + const avgB = scoresB.reduce((s, v) => s + v, 0) / scoresB.length; + + const costsA = runsA.map(r => r.claude_output?.total_cost_usd ?? 0); + const costsB = runsB.map(r => r.claude_output?.total_cost_usd ?? 0); + const avgCostA = costsA.reduce((s, v) => s + v, 0) / costsA.length; + const avgCostB = costsB.reduce((s, v) => s + v, 0) / costsB.length; + + const allRuns = [ + ...runsA.map(r => ({ + run_id: r.meta.run_id, short_id: r.meta.short_id, model: a, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ...runsB.map(r => ({ + run_id: r.meta.run_id, short_id: r.meta.short_id, model: b, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ]; + + if (rankA < rankB && avgA > avgB + 0.02) { + surprises.push({ + title: `${a} beat ${b}`, + detail: `${a} scored ${(avgA * 100).toFixed(0)}% vs ${b} at ${(avgB * 100).toFixed(0)}%`, + category: "model_upset", + weaker: { model: a, config: group[0].meta.prompt_style, score: avgA, cost: avgCostA }, + stronger: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB }, + magnitude: avgA - avgB, + runs: allRuns, + configDiffs: getConfigDiffs(runsA, runsB), + primaryAxis: "model", + }); + } else if (rankB < rankA && avgB > avgA + 0.02) { + surprises.push({ + title: `${b} beat ${a}`, + detail: `${b} scored ${(avgB * 100).toFixed(0)}% vs ${a} at ${(avgA * 100).toFixed(0)}%`, + category: "model_upset", + weaker: { model: b, config: group[0].meta.prompt_style, score: avgB, cost: avgCostB }, + stronger: { model: a, config: group[0].meta.prompt_style, score: avgA, 
cost: avgCostA }, + magnitude: avgB - avgA, + runs: allRuns, + configDiffs: getConfigDiffs(runsB, runsA), + primaryAxis: "model", + }); + } + } + } + } + + // Find individual outlier runs where a stronger model scored far below haiku + const haikuScores = runs.filter(r => r.meta.model === "haiku" && r.eval_results?.score != null).map(r => r.eval_results!.score!); + const haikuMean = haikuScores.length > 0 ? haikuScores.reduce((a, b) => a + b, 0) / haikuScores.length : 0; + + for (const run of runs) { + if (run.eval_results?.score == null) continue; + const model = run.meta.model; + const score = run.eval_results.score; + const rank = MODEL_RANK[model] || 0; + + if (rank > 1 && score < haikuMean - 0.15) { + surprises.push({ + title: `${model} run scored far below haiku avg`, + detail: `This ${model} run scored ${(score * 100).toFixed(0)}% vs haiku average of ${(haikuMean * 100).toFixed(0)}%`, + category: "individual_outlier", + weaker: { model: "haiku", config: "average", score: haikuMean, cost: 0 }, + stronger: { model, config: run.meta.prompt_style, score, cost: run.claude_output?.total_cost_usd ?? 0 }, + magnitude: haikuMean - score, + runs: [{ + run_id: run.meta.run_id, short_id: run.meta.short_id, model, + score, cost: run.claude_output?.total_cost_usd ?? 
0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((run.meta as Record<string, unknown>)[k])])), + }], + configDiffs: CONFIG_KEYS.filter(k => { + const v = String((run.meta as Record<string, unknown>)[k]); + return v !== "on" && v !== "typescript" && v !== "en" && v !== "high" && v !== "simple"; + }).map(k => `${k}: ${(run.meta as Record<string, unknown>)[k]}`), + primaryAxis: "model", + }); + } + } + + // Simple prompt beats detailed + const promptGroups: Record<string, Run[]> = {}; + for (const run of runs) { + if (run.eval_results?.score == null) continue; + const m = run.meta; + const key = [m.model, m.language, m.effort, m.linter, m.playwright, m.context_file].join("|"); + (promptGroups[key] ??= []).push(run); + } + + for (const [, group] of Object.entries(promptGroups)) { + const byPrompt: Record<string, Run[]> = {}; + for (const run of group) { + (byPrompt[run.meta.prompt_style] ??= []).push(run); + } + if (byPrompt.simple && byPrompt.detailed) { + const simpleRuns = byPrompt.simple; + const detailedRuns = byPrompt.detailed; + const avgSimple = simpleRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / simpleRuns.length; + const avgDetailed = detailedRuns.map(r => r.eval_results!.score!).reduce((a, b) => a + b, 0) / detailedRuns.length; + if (avgSimple > avgDetailed + 0.05) { + const allRuns = [ + ...simpleRuns.map(r => ({ + run_id: r.meta.run_id, short_id: r.meta.short_id, model: r.meta.model, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ...detailedRuns.map(r => ({ + run_id: r.meta.run_id, short_id: r.meta.short_id, model: r.meta.model, + score: r.eval_results!.score!, cost: r.claude_output?.total_cost_usd ?? 
0, + config: Object.fromEntries(CONFIG_KEYS.map(k => [k, String((r.meta as Record<string, unknown>)[k])])), + })), + ]; + surprises.push({ + title: "Simple prompt beat detailed", + detail: `${group[0].meta.model}: simple scored ${(avgSimple * 100).toFixed(0)}% vs detailed at ${(avgDetailed * 100).toFixed(0)}%`, + category: "prompt_upset", + weaker: { model: group[0].meta.model, config: "simple", score: avgSimple, cost: 0 }, + stronger: { model: group[0].meta.model, config: "detailed", score: avgDetailed, cost: 0 }, + magnitude: avgSimple - avgDetailed, + runs: allRuns, + configDiffs: ["prompt_style: simple vs detailed"], + primaryAxis: "prompt_style", + }); + } + } + } + + return surprises.sort((a, b) => b.magnitude - a.magnitude); +} + +const CATEGORY_LABELS: Record<string, string> = { + model_upset: "Model upsets", + prompt_upset: "Prompt upsets", + individual_outlier: "Individual outliers", +}; + +const CATEGORY_DESCRIPTIONS: Record<string, string> = { + model_upset: "A cheaper/weaker model outperformed a more capable one under the same configuration.", + prompt_upset: "A simpler prompt style beat a more detailed one, suggesting diminishing returns from verbosity.", + individual_outlier: "A single run from a stronger model scored far below the weaker model's average.", +}; + +const CATEGORY_COLORS: Record<string, string> = { + model_upset: "var(--yellow)", + prompt_upset: "var(--accent)", + individual_outlier: "var(--red)", +}; + +function SurpriseCard({ surprise }: { surprise: Surprise }) { + const [expanded, setExpanded] = useState(false); + + return ( + <div + className="card" + style={{ + padding: "14px", + borderLeft: `3px solid ${CATEGORY_COLORS[surprise.category] || "var(--yellow)"}`, + cursor: "pointer", + }} + onClick={() => setExpanded(!expanded)} + > + <div style={{ fontSize: "13px", fontWeight: 600, marginBottom: "6px", textTransform: "uppercase", letterSpacing: "0.5px" }}> + {surprise.title} + </div> + <div style={{ fontSize: "11px", color: 
"var(--text-muted)", marginBottom: "8px" }}> + {surprise.detail} + </div> + <div style={{ display: "flex", justifyContent: "space-between", fontSize: "11px" }}> + <div> + <span style={{ color: "var(--green)" }}>{surprise.weaker.model}</span> + <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}> + {(surprise.weaker.score * 100).toFixed(0)}% + </span> + </div> + <div style={{ color: "var(--text-muted)" }}>vs</div> + <div> + <span style={{ color: "var(--red)" }}>{surprise.stronger.model}</span> + <span style={{ color: "var(--text-muted)", marginLeft: "6px" }}> + {(surprise.stronger.score * 100).toFixed(0)}% + </span> + </div> + </div> + + <div style={{ display: "flex", gap: "8px", marginTop: "8px", flexWrap: "wrap" }}> + <span style={{ + fontSize: "10px", + padding: "2px 6px", + borderRadius: "3px", + background: "hsl(var(--muted))", + color: "hsl(var(--muted-foreground))", + fontFamily: "var(--font-mono)", + }}> + +{(surprise.magnitude * 100).toFixed(0)}pp + </span> + <span style={{ + fontSize: "10px", + padding: "2px 6px", + borderRadius: "3px", + background: "hsl(var(--muted))", + color: "hsl(var(--muted-foreground))", + fontFamily: "var(--font-mono)", + }}> + {surprise.runs.length} run{surprise.runs.length !== 1 ? 
"s" : ""} + </span> + </div> + + {expanded && ( + <div style={{ marginTop: "12px", borderTop: "1px solid var(--border)", paddingTop: "10px" }}> + {surprise.configDiffs.length > 0 && ( + <div style={{ marginBottom: "8px" }}> + <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}>Config differences</div> + {surprise.configDiffs.map((diff, i) => ( + <div key={i} style={{ fontSize: "11px", fontFamily: "var(--font-mono)", color: "var(--accent)" }}>{diff}</div> + ))} + </div> + )} + + <div style={{ fontSize: "10px", color: "var(--text-muted)", textTransform: "uppercase", letterSpacing: "0.5px", marginBottom: "4px" }}> + Runs ({surprise.runs.length}) + </div> + {surprise.runs.map((r) => ( + <div key={r.run_id} style={{ display: "flex", gap: "8px", fontSize: "11px", marginBottom: "2px", alignItems: "center" }}> + <span style={{ color: r.model === surprise.weaker.model ? "var(--green)" : "var(--red)", width: "50px" }}> + {r.model} + </span> + <span style={{ fontFamily: "var(--font-mono)", width: "40px" }}> + {(r.score * 100).toFixed(0)}% + </span> + <span style={{ color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}> + ${r.cost.toFixed(2)} + </span> + <a href={`/r/${r.short_id || r.run_id}`} style={{ fontSize: "10px", color: "var(--accent)" }} onClick={e => e.stopPropagation()}> + view + </a> + </div> + ))} + </div> + )} + </div> + ); +} + +export default function SurprisesPage({ runs }: SurprisesPageProps) { + const surprises = useMemo(() => findSurprises(runs), [runs]); + + // Aggregate stats + const byCategory = useMemo(() => { + const groups: Record<string, Surprise[]> = {}; + for (const s of surprises) { + (groups[s.category] ??= []).push(s); + } + return groups; + }, [surprises]); + + const axisCounts = useMemo(() => { + const counts: Record<string, number> = {}; + for (const s of surprises) { + counts[s.primaryAxis] = (counts[s.primaryAxis] || 0) + 1; + } + return 
Object.entries(counts).sort((a, b) => b[1] - a[1]); + }, [surprises]); + + const avgMagnitude = useMemo(() => { + if (surprises.length === 0) return 0; + return surprises.reduce((sum, s) => sum + s.magnitude, 0) / surprises.length; + }, [surprises]); + + const maxMagnitude = useMemo(() => { + if (surprises.length === 0) return 0; + return Math.max(...surprises.map(s => s.magnitude)); + }, [surprises]); + + // Category order for display + const categoryOrder = ["model_upset", "prompt_upset", "individual_outlier"]; + const orderedCategories = categoryOrder.filter(c => byCategory[c]?.length); + + if (surprises.length === 0) { + return ( + <div className="card" style={{ textAlign: "center", padding: "32px", color: "var(--text-muted)" }}> + No surprises yet. Run more experiments with different models to find upsets. + </div> + ); + } + + return ( + <div> + {/* Explanation */} + <div className="card" style={{ padding: "16px", marginBottom: "24px" }}> + <p style={{ fontSize: "12px", color: "var(--text-muted)", margin: 0, lineHeight: "1.6" }}> + A "surprise" is a result that defies expectations: a weaker or cheaper model outperforming a stronger one, + or a simpler configuration beating a more elaborate one. These findings highlight where conventional assumptions + about model capability and configuration complexity break down. Click any card to see the runs involved. 
+ </p> + </div> + + {/* Summary stats */} + <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fit, minmax(180px, 1fr))", gap: "12px", marginBottom: "24px" }}> + <div className="stat-card"> + <div className="stat-value">{surprises.length}</div> + <div className="stat-label">Total surprises</div> + </div> + <div className="stat-card"> + <div className="stat-value">{(avgMagnitude * 100).toFixed(0)}pp</div> + <div className="stat-label">Avg magnitude</div> + </div> + <div className="stat-card"> + <div className="stat-value">{(maxMagnitude * 100).toFixed(0)}pp</div> + <div className="stat-label">Largest upset</div> + </div> + <div className="stat-card"> + <div className="stat-value">{axisCounts[0]?.[0] || "--"}</div> + <div className="stat-label">Most surprising axis</div> + </div> + </div> + + {/* Breakdown by type */} + <div className="card" style={{ padding: "16px", marginBottom: "24px" }}> + <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "12px" }}> + Breakdown by type + </div> + <div style={{ display: "flex", gap: "24px", flexWrap: "wrap" }}> + {orderedCategories.map(cat => ( + <div key={cat} style={{ display: "flex", alignItems: "baseline", gap: "8px" }}> + <span style={{ + width: "8px", + height: "8px", + borderRadius: "2px", + background: CATEGORY_COLORS[cat], + display: "inline-block", + flexShrink: 0, + position: "relative", + top: "-1px", + }} /> + <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "14px" }}> + {byCategory[cat]?.length || 0} + </span> + <span style={{ fontSize: "11px", color: "var(--text-muted)" }}> + {CATEGORY_LABELS[cat]} + </span> + </div> + ))} + </div> + {axisCounts.length > 1 && ( + <div style={{ marginTop: "12px", paddingTop: "12px", borderTop: "1px solid var(--border)" }}> + <div style={{ fontSize: "11px", textTransform: "uppercase", letterSpacing: "0.5px", color: "var(--text-muted)", marginBottom: "8px" }}> + Surprises 
by axis + </div> + <div style={{ display: "flex", gap: "16px", flexWrap: "wrap" }}> + {axisCounts.map(([axis, count]) => ( + <div key={axis} style={{ display: "flex", alignItems: "baseline", gap: "6px" }}> + <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, fontSize: "13px" }}> + {count} + </span> + <span style={{ fontSize: "11px", color: "var(--text-muted)", fontFamily: "var(--font-mono)" }}> + {axis} + </span> + </div> + ))} + </div> + </div> + )} + </div> + + {/* Grouped surprise cards */} + {orderedCategories.map(cat => ( + <div key={cat} style={{ marginBottom: "32px" }}> + <h3 style={{ marginBottom: "4px" }}>{CATEGORY_LABELS[cat]}</h3> + <p style={{ color: "var(--text-muted)", fontSize: "11px", marginBottom: "16px", textTransform: "uppercase", letterSpacing: "0.5px" }}> + {CATEGORY_DESCRIPTIONS[cat]} + </p> + <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(300px, 1fr))", gap: "12px" }}> + {byCategory[cat]!.map((s, i) => ( + <SurpriseCard key={i} surprise={s} /> + ))} + </div> + </div> + ))} + </div> + ); +} diff --git a/dashboard/src/layouts/Base.astro b/dashboard/src/layouts/Base.astro @@ -47,8 +47,10 @@ try { <nav style="display: flex; gap: 16px; font-size: 0.875rem; align-items: center;"> <a href="/">Grid</a> <a href="/insights">Insights</a> + <a href="/surprises">Surprises</a> <a href="/explore">Explore</a> <a href="/compare">Compare</a> + <a href="/pca">PCA</a> <span style="border-left: 1px solid hsl(var(--border)); height: 16px;"></span> <a href="/methodology">Methodology</a> </nav> diff --git a/dashboard/src/pages/insights.astro b/dashboard/src/pages/insights.astro @@ -3,7 +3,6 @@ import Base from "../layouts/Base.astro"; import { loadAllRuns } from "../lib/data"; import Insights from "../components/Insights"; import ScatterPlot from "../components/ScatterPlot"; -import Surprises from "../components/Surprises"; import Variability from "../components/Variability"; const runs = loadAllRuns(); @@ -15,11 +14,7 @@ 
const runs = loadAllRuns(); Which variables move the needle? Where do weaker configs win? How consistent are the results? </p> - <Surprises client:load runs={runs} /> - - <div style="margin-top: 32px;"> - <Variability client:load runs={runs} /> - </div> + <Variability client:load runs={runs} /> <div style="margin-top: 32px; display: grid; grid-template-columns: 1fr 1fr; gap: 16px;"> <ScatterPlot client:load runs={runs} defaultX="cost" defaultY="outcome" /> diff --git a/dashboard/src/pages/surprises.astro b/dashboard/src/pages/surprises.astro @@ -0,0 +1,16 @@ +--- +import Base from "../layouts/Base.astro"; +import { loadAllRuns } from "../lib/data"; +import SurprisesPage from "../components/SurprisesPage"; + +const runs = loadAllRuns(); +--- + +<Base title="Surprises"> + <h1 style="margin-bottom: 8px;">Surprises</h1> + <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 11px; text-transform: uppercase; letter-spacing: 0.5px;"> + Where weaker configs outperformed stronger ones, and conventional assumptions broke down. + </p> + + <SurprisesPage client:load runs={runs} /> +</Base>

Impressum · Datenschutz