Charts.tsx (13409B)
import { useState, useMemo } from "react";
import {
  ComposedChart,
  Bar,
  Scatter,
  XAxis,
  YAxis,
  CartesianGrid,
  Tooltip,
  ResponsiveContainer,
  Cell,
} from "recharts";
import type { Run } from "../lib/types";
import { getModelColor, modelSortOrder } from "../lib/colors";
import { confidenceInterval } from "../lib/analysis";
import ModelSelector from "./ModelSelector";

interface ChartsProps {
  runs: Run[];
}

/** One row of chart data: box-plot statistics for a single model. */
interface BoxPlotData {
  /** X-axis category, encoded as `<model>|(n=<runs>)`; see `splitLabel`. */
  label: string;
  min: number;
  q1: number;
  median: number;
  q3: number;
  max: number;
  cellCount: number;
  runCount: number;
  /** Per-cell average scores, as rounded percentages. */
  scores: number[];
  // Derived fields for the recharts stacked-bar trick.
  /** Height of the invisible bar (= q1) that lifts the visible box. */
  base: number;
  /** Height of the visible box (= q3 - q1). */
  iqr: number;
  color: string;
  // Values returned by confidenceInterval() for `scores`
  // (displayed as a 95% CI of the mean).
  ciMean: number;
  ciLower: number;
  ciUpper: number;
}

const SMUI = {
  surface0: "hsl(213 16% 12%)",
  surface1: "hsl(217 16% 15.5%)",
  surface2: "hsl(216 15% 19%)",
  border: "hsl(217 17% 28%)",
  muted: "hsl(213 14% 65%)",
  frost1: "hsl(176 25% 65%)",
  frost2: "hsl(193 44% 67%)",
  frost3: "hsl(210 34% 63%)",
  frost4: "hsl(213 32% 52%)",
  green: "hsl(92 28% 65%)",
  yellow: "hsl(40 71% 73%)",
  red: "hsl(355 52% 64%)",
  purple: "hsl(311 24% 63%)",
};

const MONO_FONT = "'JetBrains Mono', monospace";

/**
 * Fixed Y-axis domain (scores are percentages). Shared by the <YAxis> and by
 * the box shape's fallback pixel mapping so the two cannot drift apart.
 */
const SCORE_DOMAIN: [number, number] = [0, 100];

// Model colors come from getModelColor in ../lib/colors.

const TOOLTIP_STYLE = {
  background: SMUI.surface1,
  border: `1px solid ${SMUI.border}`,
  borderRadius: "0",
  fontFamily: MONO_FONT,
  fontSize: "11px",
  padding: "8px 12px",
};

/** Per-cell summary across all runs that share a `cell_id`. */
interface CellAggregate {
  cell_id: string;
  model: string;
  task: string;
  avgScore: number;
  avgCost: number;
  passRate: number;
  runCount: number;
}

/** Mutable accumulator used while grouping runs by cell. */
interface CellAccumulator {
  model: string;
  task: string;
  scores: number[];
  costs: number[];
  passes: number;
  total: number;
}

/** Model a run is attributed to: the actual model if reported, else the requested one. */
function runModel(run: Run): string {
  return run.meta.actual_model || run.meta.model;
}

/** Ordering used everywhere models are listed: project sort order, then name. */
function compareModels(a: string, b: string): number {
  return modelSortOrder(a) - modelSortOrder(b) || a.localeCompare(b);
}

/** Arithmetic mean; 0 for an empty list. */
function mean(values: number[]): number {
  if (values.length === 0) return 0;
  return values.reduce((a, b) => a + b, 0) / values.length;
}

/**
 * Groups runs by `meta.cell_id` and summarises each group.
 *
 * Runs without a score or a cost still count toward `runCount` and the
 * `passRate` denominator but are excluded from the respective average.
 * The model and task of a cell are taken from the first run seen for it.
 */
function aggregateCells(runs: Run[]): CellAggregate[] {
  // A Map (not a plain object) so that ids such as "constructor" or
  // "__proto__" cannot collide with Object.prototype members.
  const byCell = new Map<string, CellAccumulator>();

  for (const run of runs) {
    const cellId = run.meta.cell_id;
    let acc = byCell.get(cellId);
    if (!acc) {
      acc = {
        model: runModel(run),
        task: run.meta.task,
        scores: [],
        costs: [],
        passes: 0,
        total: 0,
      };
      byCell.set(cellId, acc);
    }

    acc.total++;
    const score = run.eval_results?.score;
    if (score != null) acc.scores.push(score);
    const cost = run.claude_output?.total_cost_usd;
    if (cost != null) acc.costs.push(cost);
    if (run.eval_results?.functional?.pass) acc.passes++;
  }

  return Array.from(byCell, ([cell_id, acc]) => ({
    cell_id,
    model: acc.model,
    task: acc.task,
    avgScore: mean(acc.scores),
    avgCost: mean(acc.costs),
    passRate: acc.total > 0 ? acc.passes / acc.total : 0,
    runCount: acc.total,
  }));
}

// Model ordering comes from modelSortOrder in ../lib/colors.

/**
 * Linear-interpolated quantile of an ascending-sorted array.
 *
 * @param sorted values sorted ascending
 * @param q quantile in [0, 1]
 * @returns the interpolated value; 0 for an empty array
 */
function quantile(sorted: number[], q: number): number {
  if (sorted.length === 0) return 0;
  if (sorted.length === 1) return sorted[0];
  const pos = q * (sorted.length - 1);
  const lo = Math.floor(pos);
  const hi = Math.ceil(pos);
  return sorted[lo] + (pos - lo) * (sorted[hi] - sorted[lo]);
}

/** Five-number summary of `values` (all zeros when `values` is empty). Does not mutate the input. */
function computeBoxStats(values: number[]): { min: number; q1: number; median: number; q3: number; max: number } {
  const sorted = [...values].sort((a, b) => a - b);
  const hasValues = sorted.length > 0;
  return {
    min: hasValues ? sorted[0] : 0,
    q1: quantile(sorted, 0.25),
    median: quantile(sorted, 0.5),
    q3: quantile(sorted, 0.75),
    max: hasValues ? sorted[sorted.length - 1] : 0,
  };
}

/**
 * Builds one box-plot row per model. The distribution is over per-cell average
 * scores (rounded percentages), not over individual runs. Rows are ordered by
 * `compareModels`.
 */
function aggregateByModel(runs: Run[]): BoxPlotData[] {
  const byModel = new Map<string, CellAggregate[]>();
  for (const cell of aggregateCells(runs)) {
    const bucket = byModel.get(cell.model);
    if (bucket) bucket.push(cell);
    else byModel.set(cell.model, [cell]);
  }

  return [...byModel.entries()]
    .sort(([a], [b]) => compareModels(a, b))
    .map(([model, modelCells]) => {
      const scores = modelCells.map((c) => Math.round(c.avgScore * 100));
      const totalRuns = modelCells.reduce((sum, c) => sum + c.runCount, 0);
      const stats = computeBoxStats(scores);
      const ci = confidenceInterval(scores);
      return {
        label: `${model}|(n=${totalRuns})`,
        ...stats,
        base: stats.q1,
        iqr: stats.q3 - stats.q1,
        cellCount: modelCells.length,
        runCount: totalRuns,
        scores,
        color: getModelColor(model),
        ciMean: ci.mean,
        ciLower: ci.lower,
        ciUpper: ci.upper,
      };
    });
}

/**
 * Splits a `BoxPlotData.label` into the model name and the "(n=…)" suffix.
 * Splits on the LAST "|" so a model name that itself contains "|" stays intact.
 */
function splitLabel(label: string): { name: string; count: string } {
  const sep = label.lastIndexOf("|");
  if (sep === -1) return { name: label, count: "" };
  return { name: label.slice(0, sep), count: label.slice(sep + 1) };
}

/** Props handed to the custom bar shape; every field is optional because the element is created without props. */
interface BoxPlotShapeProps {
  x?: number;
  y?: number;
  width?: number;
  height?: number;
  payload?: BoxPlotData;
  // NOTE(review): assumed to be the bar's full-height background rectangle
  // (plot-area top and height) as supplied by recharts' Bar — confirm against
  // the installed recharts version. Only used as a fallback; see below.
  background?: { y?: number; height?: number };
}

/**
 * Custom bar shape: draws the IQR box (q1..q3), min/max whiskers, the median
 * line and, when the interval is non-degenerate, a confidence band with a
 * dot at the mean. Groups with fewer than 3 cells are drawn faded and dashed.
 */
function BoxPlotShape(props: BoxPlotShapeProps) {
  const { x, y, width, height, payload, background } = props;
  if (!payload || x == null || y == null || width == null || height == null) return null;

  const { min, median, max, q1, q3, color, cellCount, ciLower, ciUpper, ciMean } = payload;
  const lowN = cellCount < 3;

  // The bar starts at q1 (the stacked "base") and is iqr tall, so in pixels
  // `y` is q3 and `y + height` is q1.
  const boxTop = y;
  const boxBottom = y + height;
  const centerX = x + width / 2;

  // Data value -> pixel Y.
  const dataToY = (val: number): number => {
    // Normal case: interpolate through the two known points (q1, q3).
    if (q3 !== q1) {
      return boxTop + ((q3 - val) / (q3 - q1)) * (boxBottom - boxTop);
    }
    // Degenerate box (q1 === q3): the two-point mapping is undefined, which
    // previously collapsed whiskers/median/CI onto the box. Fall back to the
    // axis pixel range combined with the fixed score domain.
    const axisTop = background?.y;
    const axisHeight = background?.height;
    if (axisTop != null && axisHeight != null && axisHeight > 0) {
      const [lo, hi] = SCORE_DOMAIN;
      return axisTop + ((hi - val) / (hi - lo)) * axisHeight;
    }
    return boxTop;
  };

  const minY = dataToY(min);
  const maxY = dataToY(max);
  const medianY = dataToY(median);
  const whiskerHalfW = width * 0.3;

  const renderCiBand = () => {
    // Clamp the band to the whisker range so it never extends past min/max.
    const ciTopY = dataToY(Math.min(ciUpper, max));
    const ciBotY = dataToY(Math.max(ciLower, min));
    const ciMeanY = dataToY(ciMean);
    const ciHalfW = width * 0.45;
    return (
      <>
        {/* Shaded CI band */}
        <rect
          x={centerX - ciHalfW}
          y={ciTopY}
          width={ciHalfW * 2}
          height={Math.max(ciBotY - ciTopY, 1)}
          fill={color}
          fillOpacity={0.2}
          stroke="none"
        />
        {/* CI vertical line */}
        <line x1={centerX} y1={ciTopY} x2={centerX} y2={ciBotY} stroke={color} strokeWidth={1.5} strokeDasharray="2 2" />
        {/* CI top cap */}
        <line x1={centerX - 4} y1={ciTopY} x2={centerX + 4} y2={ciTopY} stroke={color} strokeWidth={1.5} />
        {/* CI bottom cap */}
        <line x1={centerX - 4} y1={ciBotY} x2={centerX + 4} y2={ciBotY} stroke={color} strokeWidth={1.5} />
        {/* Mean dot */}
        <circle cx={centerX} cy={ciMeanY} r={2.5} fill={color} stroke="none" />
      </>
    );
  };

  return (
    <g opacity={lowN ? 0.4 : 1}>
      {/* Whisker line: min to max */}
      <line x1={centerX} y1={minY} x2={centerX} y2={maxY} stroke={SMUI.muted} strokeWidth={1} />
      {/* Min whisker cap */}
      <line x1={centerX - whiskerHalfW} y1={minY} x2={centerX + whiskerHalfW} y2={minY} stroke={SMUI.muted} strokeWidth={1} />
      {/* Max whisker cap */}
      <line x1={centerX - whiskerHalfW} y1={maxY} x2={centerX + whiskerHalfW} y2={maxY} stroke={SMUI.muted} strokeWidth={1} />
      {/* Box (IQR) -- dashed stroke when low sample size; at least 1px tall so it stays visible */}
      <rect
        x={x}
        y={boxTop}
        width={width}
        height={Math.max(height, 1)}
        fill={color}
        fillOpacity={0.3}
        stroke={color}
        strokeWidth={1}
        strokeDasharray={lowN ? "4 2" : undefined}
      />
      {/* Median line */}
      <line x1={x} y1={medianY} x2={x + width} y2={medianY} stroke={color} strokeWidth={2} />
      {/* CI band on the mean, skipped when the interval is degenerate */}
      {ciLower !== ciUpper && renderCiBand()}
    </g>
  );
}

/** Tooltip body for the model box plot: five-number summary plus mean/CI when available. */
function ModelBoxTooltipContent({ active, payload, label }: { active?: boolean; payload?: Array<{ payload: BoxPlotData }>; label?: string }) {
  if (!active || !payload || payload.length === 0) return null;
  const d = payload[0].payload;
  const modelName = label != null ? splitLabel(label).name : undefined;
  return (
    <div style={TOOLTIP_STYLE}>
      <div style={{ marginBottom: 4, fontWeight: 600 }}>{modelName}</div>
      <div style={{ marginBottom: 4, color: SMUI.muted, fontSize: 10 }}>n={d.runCount} runs across {d.cellCount} cells</div>
      <div>Max: {d.max}%</div>
      <div>Q3: {Math.round(d.q3)}%</div>
      <div>Median: {Math.round(d.median)}%</div>
      <div>Q1: {Math.round(d.q1)}%</div>
      <div>Min: {d.min}%</div>
      {d.ciLower !== d.ciUpper && (
        <div style={{ marginTop: 4, borderTop: `1px solid ${SMUI.border}`, paddingTop: 4 }}>
          <div style={{ color: SMUI.frost2 }}>Mean: {Math.round(d.ciMean)}%</div>
          <div style={{ color: SMUI.frost2 }}>95% CI: [{Math.round(d.ciLower)}% - {Math.round(d.ciUpper)}%]</div>
        </div>
      )}
    </div>
  );
}

/** Two-line X-axis tick: model name on top, "(n=…)" run count underneath. */
function renderModelTick({ x, y, payload }: { x: number; y: number; payload: { value: string } }) {
  const { name, count } = splitLabel(String(payload.value));
  return (
    <g>
      <text x={x} y={y + 12} textAnchor="middle" fill={SMUI.muted} fontSize={10} fontFamily={MONO_FONT}>{name}</text>
      <text x={x} y={y + 24} textAnchor="middle" fill={SMUI.muted} fontSize={8} fontFamily={MONO_FONT} opacity={0.6}>{count}</text>
    </g>
  );
}

/**
 * "Score Distribution by Model" card: a model filter plus one box plot per
 * selected model. Renders a placeholder when `runs` is empty.
 */
export default function Charts({ runs }: ChartsProps) {
  // IMPORTANT: every hook below must run on every render, so all of them are
  // placed BEFORE the empty-state early return. Calling a hook after a
  // conditional return changes the hook count between renders and makes React
  // throw as soon as `runs` goes from empty to non-empty (or back).

  // Unique models, in the same order used everywhere else.
  const allModels = useMemo(() => {
    const models = new Set<string>();
    for (const run of runs) {
      models.add(runModel(run));
    }
    return [...models].sort(compareModels);
  }, [runs]);

  const [selectedModels, setSelectedModels] = useState<Set<string>>(() => new Set(allModels));

  // The initial selection is computed once, at mount. Models that first show
  // up later (for example when the first render had no runs yet) would
  // otherwise never be selected and the chart would stay empty. Track which
  // models have been seen and auto-select newly appearing ones; models the
  // user has deselected stay deselected. State is adjusted during render
  // (guarded by an identity check) rather than in an effect, to avoid
  // painting one frame with the stale selection.
  const [knownModels, setKnownModels] = useState<string[]>(allModels);
  if (knownModels !== allModels) {
    const known = new Set(knownModels);
    const unseen = allModels.filter((m) => !known.has(m));
    setKnownModels(allModels);
    if (unseen.length > 0) {
      setSelectedModels((prev) => new Set([...prev, ...unseen]));
    }
  }

  const filteredRuns = useMemo(
    () => runs.filter((r) => selectedModels.has(runModel(r))),
    [runs, selectedModels],
  );

  // Memoised so the dependent memo below (and recharts) see a stable array
  // when neither the runs nor the selection changed.
  const modelData = useMemo(() => aggregateByModel(filteredRuns), [filteredRuns]);

  // Widest non-degenerate CI among the displayed models, or null if none.
  const detectableDifference = useMemo(() => {
    const ciWidths = modelData
      .filter((d) => d.ciLower !== d.ciUpper)
      .map((d) => d.ciUpper - d.ciLower);
    if (ciWidths.length === 0) return null;
    return Math.round(Math.max(...ciWidths));
  }, [modelData]);

  if (runs.length === 0) {
    return (
      <div className="card" style={{ textAlign: "center", padding: "40px", color: SMUI.muted }}>
        No data to chart yet.
      </div>
    );
  }

  const totalCells = modelData.reduce((sum, d) => sum + d.cellCount, 0);

  return (
    <div className="card">
      <div style={{ display: "flex", justifyContent: "space-between", alignItems: "flex-start", marginBottom: "16px", flexWrap: "wrap", gap: "8px" }}>
        <div>
          <h3 style={{ margin: 0 }}>Score Distribution by Model</h3>
          <div style={{ fontSize: "10px", color: "var(--text-muted, hsl(213 14% 65%))", fontFamily: MONO_FONT, marginTop: "2px" }}>
            (n={filteredRuns.length} runs across {totalCells} cells)
          </div>
        </div>
        <ModelSelector
          allModels={allModels}
          selectedModels={selectedModels}
          onChange={setSelectedModels}
        />
      </div>
      <ResponsiveContainer width="100%" height={270}>
        <ComposedChart data={modelData} barCategoryGap="20%">
          <CartesianGrid strokeDasharray="3 3" stroke={SMUI.border} vertical={false} />
          <XAxis
            dataKey="label"
            stroke={SMUI.muted}
            tickLine={false}
            axisLine={{ stroke: SMUI.border }}
            interval={0}
            tick={renderModelTick}
            height={40}
          />
          <YAxis
            stroke={SMUI.muted}
            fontSize={11}
            fontFamily={MONO_FONT}
            domain={SCORE_DOMAIN}
            tickLine={false}
            axisLine={false}
            yAxisId="score"
          />
          <Tooltip content={<ModelBoxTooltipContent />} cursor={{ fill: "hsl(217 17% 28% / 0.3)" }} />
          {/* Invisible base bar to push the visible box up to q1 */}
          <Bar dataKey="base" stackId="box" fill="transparent" barSize={40} yAxisId="score" />
          {/* Visible IQR box with custom shape for whiskers and median */}
          <Bar dataKey="iqr" stackId="box" barSize={40} yAxisId="score" shape={<BoxPlotShape />}>
            {modelData.map((entry) => (
              <Cell key={entry.label} fill={entry.color} />
            ))}
          </Bar>
          {/* Hidden scatter to keep recharts scale consistent */}
          <Scatter data={[]} dataKey="score" yAxisId="score" fill="transparent" />
        </ComposedChart>
      </ResponsiveContainer>
      {detectableDifference != null && (
        <div style={{
          fontSize: "10px",
          fontFamily: MONO_FONT,
          color: SMUI.muted,
          marginTop: "4px",
          textAlign: "center",
        }}>
          Detectable difference: differences of ±{detectableDifference}% are statistically significant with current data
        </div>
      )}
    </div>
  );
}