loop-benchmarking

Controlled experiments across agentic coding configurations: same task, one variable changed at a time, to see what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit f213f5a1831e271fcf572e9de9073fe5c85985ba
parent 364e1e4595a31324e0d96750c5bff342c7bbaf76
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sun,  5 Apr 2026 21:22:35 +0200

Convert all charts to be cell-based: every visualization now shows cells, not runs

ScatterPlot: dots are cells with error bar crosshairs showing run ranges
CorrelationMatrix: spreads computed from cell averages per axis value
HeatmapMatrix: values are averages of cell averages, labels show "N cells"
BumpChart: model rankings from cell averages per condition
RadarComparison: dimension scores from cell averages
ConfigTreemap: size = cell count, color = avg cell score
EfficiencyFrontier: simplified to use groupIntoCells(), labels updated

All charts now consistently represent cells (unique configs), with
variance from repeated runs shown as ranges/error bars.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/BumpChart.tsx | 55 ++++++++++++++++++++++++++-----------------------------
M dashboard/src/components/ConfigTreemap.tsx | 26 +++++++++++++-------------
M dashboard/src/components/CorrelationMatrix.tsx | 24 +++++++++++++++++-------
M dashboard/src/components/EfficiencyFrontier.tsx | 80 +++++++++++++++++++++++++++----------------------------------------------------
M dashboard/src/components/HeatmapMatrix.tsx | 21 +++++++++++++--------
M dashboard/src/components/RadarComparison.tsx | 33 ++++++++++++++-------------------
M dashboard/src/components/ScatterPlot.tsx | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
7 files changed, 263 insertions(+), 169 deletions(-)

diff --git a/dashboard/src/components/BumpChart.tsx b/dashboard/src/components/BumpChart.tsx @@ -11,6 +11,7 @@ import { } from "recharts"; import type { Run } from "../lib/types"; import { AXIS_NAMES, type AxisName } from "../lib/types"; +import { groupIntoCells, type Cell } from "../lib/analysis"; interface BumpChartProps { runs: Run[]; @@ -65,15 +66,19 @@ function computeRankings( runs: Run[], axis: AxisName ): { ranked: Record<string, RankedPoint[]>; crossings: CrossingPoint[] } { + // Group runs into cells, then work with cell averages + const cells = groupIntoCells(runs); + const scoredCells = cells.filter((c) => c.score.avg > 0); + // Get unique condition values for the selected axis const conditionValues = Array.from( - new Set(runs.map((r) => String(r.meta[axis]))) + new Set(scoredCells.map((c) => String(c.meta[axis]))) ).sort(); // Get unique models - const models = Array.from(new Set(runs.map((r) => r.meta.model))).sort(); + const models = Array.from(new Set(scoredCells.map((c) => c.meta.model))).sort(); - // For each condition value, compute average score per model, then rank + // For each condition value, compute average of cell averages per model, then rank const ranked: Record<string, RankedPoint[]> = {}; for (const model of models) { ranked[model] = []; @@ -85,27 +90,24 @@ function computeRankings( for (let ci = 0; ci < conditionValues.length; ci++) { const cv = conditionValues[ci]; - const runsForCondition = runs.filter( - (r) => String(r.meta[axis]) === cv + const cellsForCondition = scoredCells.filter( + (c) => String(c.meta[axis]) === cv ); - // Compute average score per model for this condition + // Compute average of cell averages per model for this condition const modelScores: Array<{ model: string; avgScore: number; n: number; }> = []; for (const model of models) { - const modelRuns = runsForCondition.filter( - (r) => r.meta.model === model + const modelCells = cellsForCondition.filter( + (c) => c.meta.model === model ); - const scores = 
modelRuns - .map((r) => r.eval_results?.score) - .filter((s): s is number => s !== null && s !== undefined); - if (scores.length > 0) { - const avg = scores.reduce((a, b) => a + b, 0) / scores.length; - modelScores.push({ model, avgScore: avg, n: scores.length }); + if (modelCells.length > 0) { + const avg = modelCells.reduce((s, c) => s + c.score.avg, 0) / modelCells.length; + modelScores.push({ model, avgScore: avg, n: modelCells.length }); } } @@ -280,8 +282,7 @@ function CustomTooltipContent({ {entry.point!.model} </span> <span style={{ color: "var(--text-muted)", marginLeft: "auto" }}> - {(entry.point!.avgScore * 100).toFixed(1)}% (n= - {entry.point!.n}) + {(entry.point!.avgScore * 100).toFixed(1)}% ({entry.point!.n} cells) </span> </div> ))} @@ -292,18 +293,17 @@ function CustomTooltipContent({ export default function BumpChart({ runs }: BumpChartProps) { // Pre-compute which axes are useful: need 2+ condition values AND 2+ models with scores const validAxes = useMemo(() => { - const scoredRuns = runs.filter( - (r) => r.eval_results?.score !== null && r.eval_results?.score !== undefined - ); + const cells = groupIntoCells(runs); + const scoredCells = cells.filter((c) => c.score.avg > 0); return CONDITION_AXES.filter((axis) => { const conditionValues = Array.from( - new Set(scoredRuns.map((r) => String(r.meta[axis]))) + new Set(scoredCells.map((c) => String(c.meta[axis]))) ); if (conditionValues.length < 2) return false; // Check that at least one condition value has 2+ models with scores for (const cv of conditionValues) { const modelsWithScores = new Set( - scoredRuns.filter((r) => String(r.meta[axis]) === cv).map((r) => r.meta.model) + scoredCells.filter((c) => String(c.meta[axis]) === cv).map((c) => c.meta.model) ); if (modelsWithScores.size >= 2) return true; } @@ -354,12 +354,9 @@ export default function BumpChart({ runs }: BumpChartProps) { const maxRank = models.length; - const scoredRuns = runs.filter( - (r) => - r.eval_results?.score !== null && 
r.eval_results?.score !== undefined - ); + const scoredCells = groupIntoCells(runs).filter((c) => c.score.avg > 0); - if (scoredRuns.length === 0) { + if (scoredCells.length === 0) { return ( <div className="card" @@ -369,7 +366,7 @@ export default function BumpChart({ runs }: BumpChartProps) { color: "var(--text-muted)", }} > - No scored runs available for ranking. + No scored cells available for ranking. </div> ); } @@ -387,7 +384,7 @@ export default function BumpChart({ runs }: BumpChartProps) { }} > Not enough data to compare models. Rankings need at least 2 condition - values where 2 or more models have scored runs. + values where 2 or more models have scored cells. </div> </div> ); @@ -414,7 +411,7 @@ export default function BumpChart({ runs }: BumpChartProps) { margin: "4px 0 0", }} > - Rank 1 = best average score. Crossings indicate rank swaps. + Rank 1 = best average cell score. Crossings indicate rank swaps. </p> </div> <div style={{ display: "flex", alignItems: "center", gap: "8px" }}> diff --git a/dashboard/src/components/ConfigTreemap.tsx b/dashboard/src/components/ConfigTreemap.tsx @@ -2,6 +2,7 @@ import React, { useState, useCallback } from "react"; import { Treemap, ResponsiveContainer, Tooltip } from "recharts"; import type { TreemapNode } from "recharts/types/chart/Treemap"; import type { Run, AxisName } from "../lib/types"; +import { groupIntoCells, type Cell } from "../lib/analysis"; interface ConfigTreemapProps { runs: Run[]; @@ -47,14 +48,15 @@ interface GroupData { } function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] { - const byModel: Record<string, Record<string, Run[]>> = {}; + const cells = groupIntoCells(runs); + const byModel: Record<string, Record<string, Cell[]>> = {}; - for (const run of runs) { - const model = run.meta.model; - const secondary = String(run.meta[secondaryAxis]); + for (const cell of cells) { + const model = cell.meta.model; + const secondary = String(cell.meta[secondaryAxis]); if 
(!byModel[model]) byModel[model] = {}; if (!byModel[model][secondary]) byModel[model][secondary] = []; - byModel[model][secondary].push(run); + byModel[model][secondary].push(cell); } return Object.entries(byModel) @@ -63,19 +65,17 @@ function buildTreeData(runs: Run[], secondaryAxis: AxisName): GroupData[] { name: model, children: Object.entries(configs) .sort(([a], [b]) => a.localeCompare(b)) - .map(([configValue, configRuns]) => { - const scores = configRuns - .map((r) => r.eval_results?.score) - .filter((s): s is number => s !== null && s !== undefined); + .map(([configValue, configCells]) => { + const scoredCells = configCells.filter((c) => c.score.avg > 0); const avgScore = - scores.length > 0 - ? scores.reduce((a, b) => a + b, 0) / scores.length + scoredCells.length > 0 + ? scoredCells.reduce((s, c) => s + c.score.avg, 0) / scoredCells.length : null; return { name: `${model} / ${configValue}`, displayName: `${model} / ${configValue}`, - size: configRuns.length, + size: configCells.length, avgScore, avgScorePct: avgScore !== null ? `${(avgScore * 100).toFixed(0)}%` : "--", @@ -198,7 +198,7 @@ function CustomTooltip({ Score: {node.avgScorePct} </div> <div style={{ color: "hsl(213 14% 65%)" }}> - Runs: {node.size} + Cells: {node.size} </div> </div> ); diff --git a/dashboard/src/components/CorrelationMatrix.tsx b/dashboard/src/components/CorrelationMatrix.tsx @@ -1,4 +1,5 @@ import type { Run } from "../lib/types"; +import { groupIntoCells, type Cell } from "../lib/analysis"; interface CorrelationMatrixProps { runs: Run[]; @@ -36,13 +37,19 @@ const OUTCOME_METRICS: Array<{ key: string; label: string; extract: MetricExtrac { key: "time", label: "Wall Time", extract: (r) => r.meta.wall_time_seconds ?? 
null }, ]; -function computeSpread(runs: Run[], axisKey: string, extract: MetricExtractor): number | null { +function computeSpread(cells: Cell[], axisKey: string, extract: MetricExtractor): number | null { + // Compute per-cell metric averages, then group by axis value const groups: Record<string, number[]> = {}; - for (const run of runs) { - const val = extract(run); - if (val === null) continue; - const groupKey = String((run.meta as Record<string, unknown>)[axisKey] ?? "unknown"); - (groups[groupKey] ??= []).push(val); + for (const cell of cells) { + const vals: number[] = []; + for (const run of cell.runs) { + const v = extract(run); + if (v !== null) vals.push(v); + } + if (vals.length === 0) continue; + const cellAvg = vals.reduce((a, b) => a + b, 0) / vals.length; + const groupKey = String((cell.meta as Record<string, unknown>)[axisKey] ?? "unknown"); + (groups[groupKey] ??= []).push(cellAvg); } const keys = Object.keys(groups); @@ -72,6 +79,9 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) { ); } + // Group runs into cells once, then compute spreads from cell averages + const cells = groupIntoCells(runs); + // Compute the full matrix: rows = config axes, columns = metrics const matrix: Array<{ key: string; @@ -82,7 +92,7 @@ export default function CorrelationMatrix({ runs }: CorrelationMatrixProps) { for (const axis of CONFIG_AXES) { const spreads = OUTCOME_METRICS.map((metric) => - computeSpread(runs, axis.key, metric.extract) + computeSpread(cells, axis.key, metric.extract) ); const validSpreads = spreads.filter((s): s is number => s !== null); const maxSpread = validSpreads.length > 0 ? 
Math.max(...validSpreads) : 0; diff --git a/dashboard/src/components/EfficiencyFrontier.tsx b/dashboard/src/components/EfficiencyFrontier.tsx @@ -9,6 +9,7 @@ import { ResponsiveContainer, } from "recharts"; import type { Run } from "../lib/types"; +import { groupIntoCells } from "../lib/analysis"; interface EfficiencyFrontierProps { runs: Run[]; @@ -42,55 +43,28 @@ function getModelColor(model: string): string { } function aggregateByConfig(runs: Run[]): ConfigPoint[] { - const groups: Record< - string, - { - scores: number[]; - costs: number[]; - model: string; - config: Record<string, string>; - } - > = {}; - - for (const run of runs) { - const id = run.meta.cell_id; - if (!groups[id]) { - groups[id] = { - scores: [], - costs: [], - model: run.meta.model, - config: { - model: run.meta.model, - effort: run.meta.effort, - prompt_style: run.meta.prompt_style, - language: run.meta.language, - linter: run.meta.linter, - playwright: run.meta.playwright, - context_file: run.meta.context_file, - sub_agents: run.meta.sub_agents, - web_search: run.meta.web_search, - max_budget: run.meta.max_budget, - }, - }; - } - - if (run.eval_results?.score != null) { - groups[id].scores.push(run.eval_results.score); - } - if (run.claude_output?.total_cost_usd != null) { - groups[id].costs.push(run.claude_output.total_cost_usd); - } - } - - return Object.entries(groups) - .filter(([, g]) => g.scores.length > 0 && g.costs.length > 0) - .map(([cell_id, g]) => ({ - cell_id, - model: g.model, - avgCost: g.costs.reduce((a, b) => a + b, 0) / g.costs.length, - avgScore: g.scores.reduce((a, b) => a + b, 0) / g.scores.length, - runCount: g.scores.length, - config: g.config, + const cells = groupIntoCells(runs); + + return cells + .filter((c) => c.score.avg > 0 && c.cost.avg > 0) + .map((c) => ({ + cell_id: c.cell_id, + model: c.meta.model, + avgCost: c.cost.avg, + avgScore: c.score.avg, + runCount: c.n, + config: { + model: c.meta.model, + effort: c.meta.effort, + prompt_style: 
c.meta.prompt_style, + language: c.meta.language, + linter: c.meta.linter, + playwright: c.meta.playwright, + context_file: c.meta.context_file, + sub_agents: c.meta.sub_agents, + web_search: c.meta.web_search, + max_budget: c.meta.max_budget, + }, isFrontier: false, label: "", })); @@ -203,7 +177,7 @@ function CustomTooltip({ <span style={{ fontWeight: 600 }}>${point.avgCost.toFixed(2)}</span> </div> <div style={{ marginBottom: "8px" }}> - <span style={{ color: "var(--text-muted)" }}>runs: </span> + <span style={{ color: "var(--text-muted)" }}>runs in cell: </span> <span>{point.runCount}</span> </div> {point.isFrontier && ( @@ -334,8 +308,8 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { marginBottom: "16px", }} > - Cost vs score per config. Pareto frontier highlights configs not - dominated on both axes. + Cost vs score per cell (averaged across runs). Pareto frontier + highlights cells not dominated on both axes. </p> {/* Legend */} @@ -420,7 +394,7 @@ export default function EfficiencyFrontier({ runs }: EfficiencyFrontierProps) { {/* Non-frontier points (dimmed) */} <Scatter - name="configs" + name="cells" data={nonFrontierPoints} shape={nonFrontierShape} isAnimationActive={false} diff --git a/dashboard/src/components/HeatmapMatrix.tsx b/dashboard/src/components/HeatmapMatrix.tsx @@ -1,6 +1,7 @@ import { useState, useMemo } from "react"; import type { Run, AxisName } from "../lib/types"; import { AXIS_NAMES } from "../lib/types"; +import { groupIntoCells } from "../lib/analysis"; interface HeatmapMatrixProps { runs: Run[]; @@ -61,16 +62,20 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) { const [colAxis, setColAxis] = useState<AxisName>("prompt_style"); const { rowValues, colValues, cells } = useMemo(() => { + const analysisCells = groupIntoCells(runs); const cellMap: Record<string, Record<string, CellData>> = {}; const rowSet = new Set<string>(); const colSet = new Set<string>(); - for (const run of 
runs) { - const score = run.eval_results?.score; - if (score === null || score === undefined) continue; + for (const cell of analysisCells) { + // Skip cells where no run has a score + const hasScore = cell.runs.some((r) => r.eval_results?.score != null); + if (!hasScore) continue; + // Use the cell's average score as a single data point + const cellAvg = cell.score.avg; - const rv = String(run.meta[rowAxis]); - const cv = String(run.meta[colAxis]); + const rv = String(cell.meta[rowAxis]); + const cv = String(cell.meta[colAxis]); rowSet.add(rv); colSet.add(cv); @@ -78,7 +83,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) { if (!cellMap[rv]) cellMap[rv] = {}; if (!cellMap[rv][cv]) cellMap[rv][cv] = { totalScore: 0, count: 0 }; - cellMap[rv][cv].totalScore += score; + cellMap[rv][cv].totalScore += cellAvg; cellMap[rv][cv].count += 1; } @@ -171,7 +176,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) { fontFamily: "var(--font-mono)", }} > - No scored runs available for this axis combination. + No scored cells available for this axis combination. </div> ) : ( <div style={{ overflowX: "auto" }}> @@ -297,7 +302,7 @@ export default function HeatmapMatrix({ runs }: HeatmapMatrixProps) { lineHeight: 1.3, }} > - n={cell.count} + {cell.count} {cell.count === 1 ? 
"cell" : "cells"} </div> </td> ); diff --git a/dashboard/src/components/RadarComparison.tsx b/dashboard/src/components/RadarComparison.tsx @@ -9,6 +9,7 @@ import { Tooltip, } from "recharts"; import type { Run } from "../lib/types"; +import { groupIntoCells, type Cell } from "../lib/analysis"; interface RadarComparisonProps { runs: Run[]; @@ -50,31 +51,25 @@ function extractDimensionScore(run: Run, dim: Dimension): number | null { interface CellConfig { cell_id: string; label: string; - runs: Run[]; + cell: Cell; } function buildCellConfigs(runs: Run[]): CellConfig[] { - const grouped: Record<string, Run[]> = {}; - for (const run of runs) { - const id = run.meta.cell_id; - if (!grouped[id]) grouped[id] = []; - grouped[id].push(run); - } - - return Object.entries(grouped) - .map(([cell_id, cellRuns]) => { - const m = cellRuns[0].meta; + const cells = groupIntoCells(runs); + return cells + .map((cell) => { + const m = cell.meta; const label = `${m.model} / ${m.language} / ${m.prompt_style} / ${m.effort}`; - return { cell_id, label, runs: cellRuns }; + return { cell_id: cell.cell_id, label, cell }; }) .sort((a, b) => a.label.localeCompare(b.label)); } -function averageScores( - runs: Run[], +function cellAverageScore( + cell: Cell, dim: Dimension ): number | null { - const scores = runs + const scores = cell.runs .map((r) => extractDimensionScore(r, dim)) .filter((s): s is number => s !== null); if (scores.length === 0) return null; @@ -148,8 +143,8 @@ export default function RadarComparison({ runs }: RadarComparisonProps) { const data: RadarDatum[] = useMemo(() => { return DIMENSIONS.map((dim) => { - const scoreA = configA ? averageScores(configA.runs, dim) : null; - const scoreB = configB ? averageScores(configB.runs, dim) : null; + const scoreA = configA ? cellAverageScore(configA.cell, dim) : null; + const scoreB = configB ? cellAverageScore(configB.cell, dim) : null; return { dimension: DIMENSION_LABELS[dim], scoreA: scoreA ?? 
0, @@ -203,7 +198,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) { > {cellConfigs.map((c) => ( <option key={c.cell_id} value={c.cell_id}> - {c.label} (n={c.runs.length}) + {c.label} ({c.cell.n} runs) </option> ))} </select> @@ -217,7 +212,7 @@ export default function RadarComparison({ runs }: RadarComparisonProps) { > {cellConfigs.map((c) => ( <option key={c.cell_id} value={c.cell_id}> - {c.label} (n={c.runs.length}) + {c.label} ({c.cell.n} runs) </option> ))} </select> diff --git a/dashboard/src/components/ScatterPlot.tsx b/dashboard/src/components/ScatterPlot.tsx @@ -7,8 +7,10 @@ import { Tooltip, ResponsiveContainer, Legend, + ErrorBar, } from "recharts"; import type { Run } from "../lib/types"; +import { groupIntoCells } from "../lib/analysis"; interface ScatterPlotProps { runs: Run[]; @@ -16,68 +18,177 @@ interface ScatterPlotProps { yMetric: string; } -const METRIC_CONFIG: Record<string, { label: string; extract: (r: Run) => number | null; format: (v: number) => string }> = { +type CellMetricKey = "cost" | "score" | "turns" | "wall_time"; + +const METRIC_CONFIG: Record< + string, + { + label: string; + cellKey: CellMetricKey; + scale: number; // multiply avg/min/max by this for display + format: (v: number) => string; + } +> = { cost: { label: "Cost ($)", - extract: (r) => r.claude_output?.total_cost_usd ?? null, + cellKey: "cost", + scale: 1, format: (v) => `$${v.toFixed(2)}`, }, score: { label: "Score (%)", - extract: (r) => r.eval_results?.score != null ? r.eval_results.score * 100 : null, + cellKey: "score", + scale: 100, format: (v) => `${v.toFixed(0)}%`, }, turns: { label: "Turns", - extract: (r) => r.claude_output?.num_turns ?? null, - format: (v) => `${v}`, + cellKey: "turns", + scale: 1, + format: (v) => `${Math.round(v)}`, }, wall_time: { label: "Time (s)", - extract: (r) => r.meta.wall_time_seconds ?? 
null, - format: (v) => `${v}s`, + cellKey: "wall_time", + scale: 1, + format: (v) => `${Math.round(v)}s`, }, }; const MODEL_COLORS: Record<string, string> = { - haiku: "hsl(193 44% 67%)", // frost cyan - sonnet: "hsl(40 71% 73%)", // aurora yellow - opus: "hsl(311 24% 63%)", // aurora purple + haiku: "hsl(193 44% 67%)", // frost cyan + sonnet: "hsl(40 71% 73%)", // aurora yellow + opus: "hsl(311 24% 63%)", // aurora purple }; -export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps) { +function formatCellId(cellId: string): string { + return cellId.replace(/_/g, " "); +} + +interface CellDatum { + x: number; + y: number; + xErrorRange: [number, number]; + yErrorRange: [number, number]; + cell_id: string; + xLabel: string; + yLabel: string; + xRange: string; + yRange: string; + n: number; +} + +function CustomTooltip({ active, payload }: any) { + if (!active || !payload?.length) return null; + const d: CellDatum = payload[0].payload; + return ( + <div + style={{ + background: "hsl(217 16% 15.5%)", + border: "1px solid hsl(217 17% 28%)", + borderRadius: "2px", + fontFamily: "'JetBrains Mono', monospace", + fontSize: "11px", + padding: "8px 10px", + lineHeight: "1.6", + color: "hsl(213 14% 80%)", + }} + > + <div style={{ fontWeight: 600, marginBottom: 4 }}> + {formatCellId(d.cell_id)} + </div> + <div> + {d.xLabel}: {d.xRange} + </div> + <div> + {d.yLabel}: {d.yRange} + </div> + <div style={{ marginTop: 2, color: "hsl(213 14% 55%)" }}> + {d.n} run{d.n !== 1 ? 
"s" : ""} in cell + </div> + </div> + ); +} + +export default function ScatterPlot({ + runs, + xMetric, + yMetric, +}: ScatterPlotProps) { const xConf = METRIC_CONFIG[xMetric]; const yConf = METRIC_CONFIG[yMetric]; if (!xConf || !yConf) return null; - // Group by model - const byModel: Record<string, Array<{ x: number; y: number; run_id: string; prompt: string }>> = {}; + const cells = groupIntoCells(runs); + + // Group cells by model + const byModel: Record<string, CellDatum[]> = {}; + + for (const cell of cells) { + const xAgg = cell[xConf.cellKey]; + const yAgg = cell[yConf.cellKey]; + // Skip cells where either metric has no data + if (xAgg.avg === 0 && xAgg.min === 0 && xAgg.max === 0) continue; + if (yAgg.avg === 0 && yAgg.min === 0 && yAgg.max === 0) continue; - for (const run of runs) { - const x = xConf.extract(run); - const y = yConf.extract(run); - if (x === null || y === null) continue; + const xAvg = xAgg.avg * xConf.scale; + const xMin = xAgg.min * xConf.scale; + const xMax = xAgg.max * xConf.scale; + const yAvg = yAgg.avg * yConf.scale; + const yMin = yAgg.min * yConf.scale; + const yMax = yAgg.max * yConf.scale; - const model = run.meta.model; + const model = cell.meta.model; if (!byModel[model]) byModel[model] = []; + + const xRangeStr = + cell.n > 1 + ? `avg ${xConf.format(xAvg)} (${xConf.format(xMin)} - ${xConf.format(xMax)})` + : xConf.format(xAvg); + const yRangeStr = + cell.n > 1 + ? 
`avg ${yConf.format(yAvg)} (${yConf.format(yMin)} - ${yConf.format(yMax)})` + : yConf.format(yAvg); + byModel[model].push({ - x, - y, - run_id: run.meta.run_id, - prompt: run.meta.prompt_style, + x: xAvg, + y: yAvg, + xErrorRange: [xAvg - xMin, xMax - xAvg], + yErrorRange: [yAvg - yMin, yMax - yAvg], + cell_id: cell.cell_id, + xLabel: xConf.label, + yLabel: yConf.label, + xRange: xRangeStr, + yRange: yRangeStr, + n: cell.n, }); } const models = Object.keys(byModel).sort(); + const totalCells = models.reduce((sum, m) => sum + byModel[m].length, 0); return ( <div className="card"> <h3 style={{ marginBottom: "16px" }}> - {xConf.label} vs {yConf.label} + {xConf.label} vs {yConf.label}{" "} + <span + style={{ + fontSize: "12px", + fontWeight: 400, + color: "hsl(213 14% 55%)", + }} + > + ({totalCells} cells) + </span> </h3> <ResponsiveContainer width="100%" height={350}> - <ScatterChart margin={{ top: 10, right: 20, bottom: 10, left: 10 }}> - <CartesianGrid strokeDasharray="3 3" stroke="hsl(217 17% 28%)" /> + <ScatterChart + margin={{ top: 10, right: 20, bottom: 10, left: 10 }} + > + <CartesianGrid + strokeDasharray="3 3" + stroke="hsl(217 17% 28%)" + /> <XAxis dataKey="x" name={xConf.label} @@ -92,20 +203,7 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps fontSize={11} tickFormatter={(v) => yConf.format(v)} /> - <Tooltip - contentStyle={{ - background: "hsl(217 16% 15.5%)", - border: "1px solid hsl(217 17% 28%)", - borderRadius: "2px", - fontFamily: "'JetBrains Mono', monospace", - fontSize: "11px", - }} - formatter={(value: number, name: string) => { - if (name === xConf.label) return [xConf.format(value), name]; - if (name === yConf.label) return [yConf.format(value), name]; - return [value, name]; - }} - /> + <Tooltip content={<CustomTooltip />} /> <Legend /> {models.map((model) => ( <Scatter @@ -113,7 +211,22 @@ export default function ScatterPlot({ runs, xMetric, yMetric }: ScatterPlotProps name={model} data={byModel[model]} 
fill={MODEL_COLORS[model] || "hsl(213 14% 65%)"} - /> + > + <ErrorBar + dataKey="xErrorRange" + direction="x" + stroke="hsl(213 14% 45%)" + strokeWidth={1} + width={4} + /> + <ErrorBar + dataKey="yErrorRange" + direction="y" + stroke="hsl(213 14% 45%)" + strokeWidth={1} + width={4} + /> + </Scatter> ))} </ScatterChart> </ResponsiveContainer>

Impressum · Datenschutz