loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 3b81eb9246542dee665795aeb510ae2ced79f03b
parent 0af972817d114910874b95bc4ec84298b7511e40
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Thu, 16 Apr 2026 16:32:06 +0200

Project runs before serializing into index-page islands

index.astro passes the full Run[] to 4 client:load islands
(StatisticalPowerCard, Charts, TopBottomConfigs, Grid). Astro serializes
each island's props independently, so the full eval_results payload
(gameplay bot report with per-test details, SonarQube details,
code_analysis, transcript_analysis) was embedded four times, once per
island -- ~34 MB of HTML on a 510-run dataset.

Add projectRunForIndex() in data.ts that returns a Run-shaped object
containing only the fields these islands and analysis.groupIntoCells
actually read (score summaries, functional.pass, cost, num_turns). Call
it once in index.astro and pass the slim array to all four islands.

dist/index.html: 34 MB -> 5.9 MB raw, 3.1 MB -> 217 KB gzipped.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/lib/data.ts       | 45 +++++++++++++++++++++++++++++++++++++++++++++
M dashboard/src/pages/index.astro | 17 ++++++++++++-----
2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/dashboard/src/lib/data.ts b/dashboard/src/lib/data.ts
@@ -183,3 +183,48 @@ export function aggregateRuns(runs: Run[]): AggregateStats {
     pass_rate: runs.length > 0 ? passes / runs.length : null,
   };
 }
+
+/**
+ * Trim a Run for serialization into an Astro island that only needs
+ * summary-level fields. Drops per-test details, full SonarQube payloads,
+ * code_analysis/transcript_analysis payloads, etc. -- the bulk of
+ * eval_results.json content that bloats the index page HTML.
+ *
+ * Fields kept are the union of everything the index page islands
+ * (Charts, Grid, TopBottomConfigs, StatisticalPowerCard) plus
+ * analysis.groupIntoCells actually read.
+ */
+export function projectRunForIndex(run: Run): Run {
+  const er = run.eval_results as Record<string, any> | null;
+  const slimEval: EvalResults | null = er
+    ? {
+        score: er.score ?? null,
+        functional: er.functional
+          ? { pass: er.functional.pass, score: er.functional.score }
+          : undefined,
+        structural: er.structural
+          ? { pass: er.structural.pass, score: er.structural.score, checks: [] }
+          : undefined,
+        quality: er.quality ? { score: er.quality.score } : undefined,
+        // Non-interface fields the analysis layer reads via `as any`.
+        ...(er.gameplay_bot ? { gameplay_bot: { score: er.gameplay_bot.score } } : {}),
+        ...(er.code_analysis ? { code_analysis: { score: er.code_analysis.score } } : {}),
+        ...(er.transcript_analysis ? { transcript_analysis: { score: er.transcript_analysis.score } } : {}),
+        ...(er.sonarqube ? { sonarqube: { score: er.sonarqube.score } } : {}),
+      } as EvalResults
+    : null;
+
+  const slimOutput: ClaudeOutput | null = run.claude_output
+    ? {
+        total_cost_usd: run.claude_output.total_cost_usd,
+        num_turns: run.claude_output.num_turns,
+      }
+    : null;
+
+  return {
+    meta: run.meta,
+    eval_results: slimEval,
+    claude_output: slimOutput,
+    has_transcript: run.has_transcript,
+  };
+}
diff --git a/dashboard/src/pages/index.astro b/dashboard/src/pages/index.astro
@@ -1,6 +1,6 @@
 ---
 import Base from "../layouts/Base.astro";
-import { loadAllRuns, getAxisValues, getTaskNames } from "../lib/data";
+import { loadAllRuns, getAxisValues, getTaskNames, projectRunForIndex } from "../lib/data";
 import type { Run } from "../lib/types";
 import Grid from "../components/Grid";
 import Charts from "../components/Charts";
@@ -11,6 +11,13 @@ const runs = loadAllRuns();
 const axisValues = getAxisValues(runs);
 const tasks = getTaskNames(runs);
 
+// Each client:load island below serializes its props independently into the
+// HTML. Without projection, the full eval_results payload (gameplay bot
+// reports, SonarQube details, etc.) gets embedded 4x at ~10KB/run. Projecting
+// down to the union of fields these islands actually read drops per-run size
+// ~50x.
+const runsForIndex = runs.map(projectRunForIndex);
+
 // Compute per-task cell-based stats
 interface TaskSummary {
   task: string;
@@ -104,15 +111,15 @@ const totalCells = new Set(runs.map(r => r.meta.cell_id)).size;
     </div>
   ))}
 
-  <StatisticalPowerCard client:load runs={runs} />
+  <StatisticalPowerCard client:load runs={runsForIndex} />
 
   <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
-    <Charts client:load runs={runs} />
-    <TopBottomConfigs client:load runs={runs} />
+    <Charts client:load runs={runsForIndex} />
+    <TopBottomConfigs client:load runs={runsForIndex} />
   </div>
 
   <div style="margin-top: 32px;">
     <h2 style="margin-bottom: 16px;">All Cells</h2>
-    <Grid client:load runs={runs} axisValues={axisValues} tasks={tasks} />
+    <Grid client:load runs={runsForIndex} axisValues={axisValues} tasks={tasks} />
   </div>
 </Base>

Impressum · Datenschutz