Project runs before serializing into index-page islands - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 3b81eb9246542dee665795aeb510ae2ced79f03b
parent 0af972817d114910874b95bc4ec84298b7511e40
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu, 16 Apr 2026 16:32:06 +0200

Project runs before serializing into index-page islands

index.astro passes the full Run[] to 4 client:load islands
(StatisticalPowerCard, Charts, TopBottomConfigs, Grid). Astro serializes
each island's props independently, so the full eval_results payload
(gameplay bot report with per-test details, SonarQube details,
code_analysis, transcript_analysis) was embedded four times, once per
island -- ~34 MB of HTML on a 510-run dataset.

Add projectRunForIndex() in data.ts that returns a Run-shaped object
containing only the fields these islands and analysis.groupIntoCells
actually read (score summaries, functional.pass, cost, num_turns). Call
it once in index.astro and pass the slim array to all four islands.

dist/index.html: 34 MB -> 5.9 MB raw, 3.1 MB -> 217 KB gzipped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/lib/data.ts  | 45 +++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/index.astro  | 17 ++++++++++++-----

2 files changed, 57 insertions(+), 5 deletions(-)
diff --git a/dashboard/src/lib/data.ts b/dashboard/src/lib/data.ts
@@ -183,3 +183,48 @@ export function aggregateRuns(runs: Run[]): AggregateStats {
     pass_rate: runs.length > 0 ? passes / runs.length : null,
   };
 }
+
+/**
+ * Trim a Run down to the summary fields the index-page islands (Charts,
+ * Grid, TopBottomConfigs, StatisticalPowerCard) and analysis.groupIntoCells
+ * read, so bulky eval_results.json details are not serialized into the HTML.
+ *
+ * Kept: meta (same object, not copied), has_transcript, the top-level
+ * score, pass/score of functional and structural (checks emptied), the
+ * score of quality, gameplay_bot, code_analysis, transcript_analysis and
+ * sonarqube, plus total_cost_usd and num_turns from claude_output.
+ */
+export function projectRunForIndex(run: Run): Run {
+  const er = run.eval_results as Record<string, any> | null;
+  const slimEval: EvalResults | null = er
+    ? {
+        score: er.score ?? null,
+        functional: er.functional
+          ? { pass: er.functional.pass, score: er.functional.score }
+          : undefined,
+        structural: er.structural
+          ? { pass: er.structural.pass, score: er.structural.score, checks: [] }
+          : undefined,
+        quality: er.quality ? { score: er.quality.score } : undefined,
+        // Non-interface fields the analysis layer reads via `as any`.
+        ...(er.gameplay_bot ? { gameplay_bot: { score: er.gameplay_bot.score } } : {}),
+        ...(er.code_analysis ? { code_analysis: { score: er.code_analysis.score } } : {}),
+        ...(er.transcript_analysis ? { transcript_analysis: { score: er.transcript_analysis.score } } : {}),
+        ...(er.sonarqube ? { sonarqube: { score: er.sonarqube.score } } : {}),
+      } as EvalResults
+    : null;
+
+  const slimOutput: ClaudeOutput | null = run.claude_output
+    ? {
+        total_cost_usd: run.claude_output.total_cost_usd,
+        num_turns: run.claude_output.num_turns,
+      }
+    : null;
+
+  return {
+    meta: run.meta,
+    eval_results: slimEval,
+    claude_output: slimOutput,
+    has_transcript: run.has_transcript,
+  };
+}
diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro
@@ -1,6 +1,6 @@
 ---
 import Base from "../layouts/Base.astro";
-import { loadAllRuns, getAxisValues, getTaskNames } from "../lib/data";
+import { loadAllRuns, getAxisValues, getTaskNames, projectRunForIndex } from "../lib/data";
 import type { Run } from "../lib/types";
 import Grid from "../components/Grid";
 import Charts from "../components/Charts";
@@ -11,6 +11,13 @@ const runs = loadAllRuns();
 const axisValues = getAxisValues(runs);
 const tasks = getTaskNames(runs);
 
+// Each client:load island below serializes its props independently into the
+// HTML. Without projection, the full eval_results payload (gameplay bot
+// reports, SonarQube details, etc.) gets embedded 4x at ~10KB/run. Projecting
+// down to the union of fields these islands actually read drops per-run size
+// ~50x.
+const runsForIndex = runs.map(projectRunForIndex);
+
 // Compute per-task cell-based stats
 interface TaskSummary {
   task: string;
@@ -104,15 +111,15 @@ const totalCells = new Set(runs.map(r => r.meta.cell_id)).size;
     </div>
   ))}
 
-  <StatisticalPowerCard client:load runs={runs} />
+  <StatisticalPowerCard client:load runs={runsForIndex} />
 
   <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
-    <Charts client:load runs={runs} />
-    <TopBottomConfigs client:load runs={runs} />
+    <Charts client:load runs={runsForIndex} />
+    <TopBottomConfigs client:load runs={runsForIndex} />
   </div>
 
   <div style="margin-top: 32px;">
     <h2 style="margin-bottom: 16px;">All Cells</h2>
-    <Grid client:load runs={runs} axisValues={axisValues} tasks={tasks} />
+    <Grid client:load runs={runsForIndex} axisValues={axisValues} tasks={tasks} />
   </div>
 </Base>

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/lib/data.ts	\|	45	+++++++++++++++++++++++++++++++++++++++++++++
M	dashboard/src/pages/index.astro	\|	17	++++++++++++-----