data.ts (7652B)
/**
 * Data loading module. Server-side only (uses Node.js fs).
 * Types are re-exported from ./types.ts which is client-safe.
 */
import { createHash } from "node:crypto";
import fs from "node:fs";
import path from "node:path";

// Use process.cwd() which in Astro build is the dashboard/ directory.
const RESULTS_DIR = path.resolve(process.cwd(), "../results");

// Re-export types so existing imports from "./data" still work
export type { RunMeta, EvalResults, ClaudeOutput, Run, AxisName } from "./types";
export { AXIS_NAMES } from "./types";

import type { RunMeta, EvalResults, ClaudeOutput, Run, AxisName } from "./types";
import { AXIS_NAMES } from "./types";

/**
 * Legacy (unversioned) model names mapped to their versioned replacements.
 * A Map is used rather than an object literal so that a model name such as
 * "constructor" can never resolve to an inherited Object.prototype member.
 */
const MODEL_RENAME: ReadonlyMap<string, string> = new Map([
  ["haiku", "haiku-4.5"],
  ["sonnet", "sonnet-4.6"],
  ["opus", "opus-4.6"],
]);

/** Returns the first 8 hex characters of the SHA-256 digest of `str`. */
function shortHash(str: string): string {
  return createHash("sha256").update(str).digest("hex").slice(0, 8);
}

/**
 * Parses the JSON file at `filePath`, or returns null when the file does not
 * exist. The parsed value is NOT validated against `T`; the cast is unchecked.
 *
 * @throws if the file exists but cannot be read or is not valid JSON.
 */
function readJsonIfExists<T>(filePath: string): T | null {
  if (!fs.existsSync(filePath)) return null;
  return JSON.parse(fs.readFileSync(filePath, "utf-8")) as T;
}

/**
 * Upgrades a freshly parsed meta object in place: maps old-schema fields to
 * the current ones, fills defaults for axes that older runs did not record,
 * renames legacy model names, and derives short IDs when they are absent.
 */
function normalizeMeta(meta: RunMeta): void {
  // Normalize old schema to new. `sub_agents` is not part of RunMeta, hence
  // the `any` access.
  if ((meta as any).sub_agents && !meta.strategy) {
    meta.strategy = (meta as any).sub_agents === "on" ? "use_subagents" : "none";
  }
  if (meta.playwright === "on") {
    (meta as any).playwright = "available";
  }

  // Default new axes for old runs. `||` (not `??`) is deliberate: an empty
  // string is treated the same as a missing value.
  meta.tests_provided = meta.tests_provided || "none";
  meta.strategy = meta.strategy || "none";
  meta.design_guidance = meta.design_guidance || "none";
  meta.architecture = meta.architecture || "none";
  meta.error_checking = meta.error_checking || "none";
  meta.context_noise = meta.context_noise || "clean";
  meta.renderer = meta.renderer || "none";
  meta.provider = meta.provider || "anthropic";
  meta.actual_model = meta.actual_model || meta.model;

  // Normalize legacy model names to versioned
  const renamedModel = MODEL_RENAME.get(meta.model);
  if (renamedModel) meta.model = renamedModel;
  const renamedActualModel = MODEL_RENAME.get(meta.actual_model);
  if (renamedActualModel) meta.actual_model = renamedActualModel;

  // Compute short IDs if not in meta (backwards compat)
  if (!meta.short_id && meta.run_id) {
    meta.short_id = shortHash(meta.run_id);
  }
  if (!meta.short_cell_id && meta.cell_id) {
    meta.short_cell_id = shortHash(meta.cell_id);
  }
}

/**
 * Loads a single run from `runDir`.
 *
 * @returns the run, or null when `runDir` is not a directory or contains no
 *   meta.json (such entries are not runs and are skipped without comment).
 * @throws if the entry cannot be stat'ed, or if meta.json, eval_results.json
 *   or claude_output.json exists but cannot be read or parsed, or if
 *   meta.json does not hold a JSON object.
 */
function loadRun(runDir: string): Run | null {
  if (!fs.statSync(runDir).isDirectory()) return null;

  const metaPath = path.join(runDir, "meta.json");
  if (!fs.existsSync(metaPath)) return null;

  const parsed: unknown = JSON.parse(fs.readFileSync(metaPath, "utf-8"));
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
    throw new Error("meta.json does not contain a JSON object");
  }
  const meta = parsed as RunMeta;
  normalizeMeta(meta);

  const eval_results = readJsonIfExists<EvalResults>(
    path.join(runDir, "eval_results.json")
  );
  const claude_output = readJsonIfExists<ClaudeOutput>(
    path.join(runDir, "claude_output.json")
  );
  const has_transcript = fs.existsSync(path.join(runDir, "transcript.jsonl"));

  return { meta, eval_results, claude_output, has_transcript };
}

/**
 * Loads every run found under `<RESULTS_DIR>/runs`, one sub-directory per run.
 *
 * Loading is best-effort: an entry that cannot be read or parsed is skipped
 * with a console warning instead of aborting the whole load.
 *
 * @returns the loaded runs sorted by `meta.run_id`; an empty array when the
 *   runs directory does not exist.
 */
export function loadAllRuns(): Run[] {
  const runsDir = path.join(RESULTS_DIR, "runs");
  if (!fs.existsSync(runsDir)) return [];

  const runs: Run[] = [];

  for (const runId of fs.readdirSync(runsDir)) {
    try {
      const run = loadRun(path.join(runsDir, runId));
      if (run) runs.push(run);
    } catch (err: unknown) {
      // Skip corrupted run data, but leave a trace so the gap is explainable.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`[data] Skipping unreadable run "${runId}": ${reason}`);
    }
  }

  // run_id comes from unvalidated JSON, so it may be absent or not a string;
  // coerce it so that one malformed meta.json cannot make the sort throw.
  return runs.sort((a, b) =>
    String(a.meta.run_id ?? "").localeCompare(String(b.meta.run_id ?? ""))
  );
}

/**
 * Reads `<RESULTS_DIR>/runs/<runId>/transcript.jsonl` and returns its
 * non-blank lines as raw strings (the lines are not parsed).
 *
 * @param runId name of the run's directory under `<RESULTS_DIR>/runs`.
 * @returns the non-blank lines, or an empty array when the file is missing.
 */
export function loadTranscript(runId: string): string[] {
  const transcriptPath = path.join(
    RESULTS_DIR,
    "runs",
    runId,
    "transcript.jsonl"
  );
  if (!fs.existsSync(transcriptPath)) return [];
  return fs
    .readFileSync(transcriptPath, "utf-8")
    .split("\n")
    .filter((line) => line.trim());
}

/**
 * Collects, for every axis in AXIS_NAMES, the sorted set of distinct values
 * that the given runs have for that axis.
 *
 * Values are stringified with `String(...)`, so a run whose meta lacks an
 * axis contributes the literal string "undefined" for it.
 */
export function getAxisValues(runs: Run[]): Record<AxisName, string[]> {
  const result: Record<string, string[]> = {};
  for (const axis of AXIS_NAMES) {
    const distinct = new Set<string>();
    for (const run of runs) {
      distinct.add(String(run.meta[axis]));
    }
    result[axis] = Array.from(distinct).sort();
  }
  return result as Record<AxisName, string[]>;
}

/** Returns the sorted list of distinct `meta.task` values across `runs`. */
export function getTaskNames(runs: Run[]): string[] {
  return Array.from(new Set(runs.map((r) => r.meta.task))).sort();
}

/** Summary statistics over a group of runs, as produced by aggregateRuns. */
export interface AggregateStats {
  /** Number of runs in the group. */
  count: number;
  /** Mean of `eval_results.score` over runs that have one, else null. */
  avg_score: number | null;
  /** Mean of `claude_output.total_cost_usd` over runs that have one, else null. */
  avg_cost: number | null;
  /** Mean of `meta.wall_time_seconds` over runs that have one, else null. */
  avg_wall_time: number | null;
  /** Mean of `claude_output.num_turns` over runs that have one, else null. */
  avg_turns: number | null;
  /** Fraction of ALL runs whose functional eval passed; null for no runs. */
  pass_rate: number | null;
}

/**
 * Computes summary statistics for a group of runs.
 *
 * Each average is taken only over the runs that actually carry a finite
 * numeric value for that metric and is null when none do. `pass_rate`, by
 * contrast, is divided by the total number of runs, so a run without eval
 * results counts as a non-pass.
 */
export function aggregateRuns(runs: Run[]): AggregateStats {
  if (runs.length === 0) {
    return {
      count: 0,
      avg_score: null,
      avg_cost: null,
      avg_wall_time: null,
      avg_turns: null,
      pass_rate: null,
    };
  }

  // The inputs originate from unvalidated JSON, so require a real finite
  // number rather than merely "not null": a string would otherwise be
  // concatenated into the sum.
  const isFiniteNumber = (v: unknown): v is number =>
    typeof v === "number" && Number.isFinite(v);

  const avg = (arr: number[]) =>
    arr.length > 0 ? arr.reduce((a, b) => a + b, 0) / arr.length : null;

  const scores = runs.map((r) => r.eval_results?.score).filter(isFiniteNumber);
  const costs = runs
    .map((r) => r.claude_output?.total_cost_usd)
    .filter(isFiniteNumber);
  const wallTimes = runs
    .map((r) => r.meta.wall_time_seconds)
    .filter(isFiniteNumber);
  const turns = runs
    .map((r) => r.claude_output?.num_turns)
    .filter(isFiniteNumber);

  const passes = runs.filter(
    (r) => r.eval_results?.functional?.pass === true
  ).length;

  return {
    count: runs.length,
    avg_score: avg(scores),
    avg_cost: avg(costs),
    avg_wall_time: avg(wallTimes),
    avg_turns: avg(turns),
    // runs.length is non-zero here: the empty case returned above.
    pass_rate: passes / runs.length,
  };
}

/**
 * Trim a Run for serialization into an Astro island that only needs
 * summary-level fields. Drops per-test details, full SonarQube payloads,
 * code_analysis/transcript_analysis payloads, etc. -- the bulk of
 * eval_results.json content that bloats the index page HTML.
 *
 * Fields kept are the union of everything the index page islands
 * (Charts, Grid, TopBottomConfigs, StatisticalPowerCard) plus
 * analysis.groupIntoCells actually read.
 *
 * The input run is not modified, but the returned object shares the same
 * `meta` reference rather than holding a copy.
 */
export function projectRunForIndex(run: Run): Run {
  // Widened to a loose record because several of the fields read below are
  // not declared on the EvalResults interface.
  const er = run.eval_results as Record<string, any> | null;
  const slimEval: EvalResults | null = er
    ? ({
        score: er.score ?? null,
        functional: er.functional
          ? { pass: er.functional.pass, score: er.functional.score }
          : undefined,
        // `checks` is emptied on purpose: only pass/score are kept.
        structural: er.structural
          ? { pass: er.structural.pass, score: er.structural.score, checks: [] }
          : undefined,
        quality: er.quality ? { score: er.quality.score } : undefined,
        // Non-interface fields the analysis layer reads via `as any`.
        ...(er.gameplay_bot ? { gameplay_bot: { score: er.gameplay_bot.score } } : {}),
        ...(er.code_analysis ? { code_analysis: { score: er.code_analysis.score } } : {}),
        ...(er.transcript_analysis
          ? { transcript_analysis: { score: er.transcript_analysis.score } }
          : {}),
        ...(er.sonarqube ? { sonarqube: { score: er.sonarqube.score } } : {}),
      } as EvalResults)
    : null;

  const slimOutput: ClaudeOutput | null = run.claude_output
    ? {
        total_cost_usd: run.claude_output.total_cost_usd,
        num_turns: run.claude_output.num_turns,
      }
    : null;

  return {
    meta: run.meta,
    eval_results: slimEval,
    claude_output: slimOutput,
    has_transcript: run.has_transcript,
  };
}