loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 64445a2b95d17f200a03c19d19ed7294be1a9b89
parent b2013ff82112d8a2dcc2ae66b5079b774ac1e47b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri,  3 Apr 2026 19:36:47 +0200

UI improvements: readable run IDs, run detail layout, config pills

- Grid: run IDs show only task + active config values, not the full string
- Run detail: stats bar at top, config as pills with inactive values
  greyed out and active highlighted white, transcript left + scores right,
  full-width layout, exit codes explained (0=Success, 124=Timeout, etc.)
- Transcript viewer: 80vh height instead of fixed 600px
- Harness: saves claude_version in run metadata

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Grid.tsx | 20 ++++++++++++++++----
M dashboard/src/components/RunDetail.tsx | 356 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
M dashboard/src/components/TranscriptViewer.tsx | 4 ++--
M dashboard/src/pages/run/[id].astro | 36 ++++++++++++++++++++++++++----------
M harness/run.py | 11 +++++++++++
5 files changed, 343 insertions(+), 84 deletions(-)

diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -25,6 +25,20 @@ function formatCost(cost: number | null | undefined): string { return "$" + cost.toFixed(2); } +function formatRunId(runId: string): string { + const parts = runId.split("_run"); + const runNum = parts.length > 1 ? `#${parts[parts.length - 1]}` : ""; + const segments = parts[0].split("_"); + const task = segments[0]; + // Only show non-default/interesting config values + const config = segments + .slice(1) + .filter((s) => !s.includes("=off") && !s.includes("=none")) + .map((s) => s.split("=")[1]) + .join(" "); + return `${task} ${config} ${runNum}`.trim(); +} + function formatTime(seconds: number | null | undefined): string { if (seconds === null || seconds === undefined) return "-"; if (seconds < 60) return seconds + "s"; @@ -86,10 +100,8 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { filteredRuns.map((run) => ( <tr key={run.meta.run_id}> <td> - <a href={`/run/${run.meta.run_id}`}> - {run.meta.run_id.length > 40 - ? run.meta.run_id.slice(0, 40) + "..." 
- : run.meta.run_id} + <a href={`/run/${run.meta.run_id}`} style={{ fontSize: "0.75rem" }}> + {formatRunId(run.meta.run_id)} </a> </td> <td>{run.meta.task}</td> diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx @@ -1,16 +1,152 @@ -import type { Run } from "../lib/data"; +import type { Run, AxisName } from "../lib/data"; import TranscriptViewer from "./TranscriptViewer"; interface RunDetailProps { run: Run; transcriptLines: string[]; + axisValues: Record<AxisName, string[]>; } -function ScoreBar({ label, score }: { label: string; score: number | null | undefined }) { +const EXIT_CODES: Record<number, string> = { + 0: "Success", + 1: "Error", + 2: "Misuse of shell command", + 124: "Timeout (exceeded time limit)", + 125: "Command failed to execute", + 126: "Command not executable", + 127: "Command not found", + 130: "Interrupted (SIGINT)", + 137: "Killed (SIGKILL / OOM)", + 143: "Terminated (SIGTERM)", +}; + +const AXIS_CONFIG: Array<{ + key: string; + label: string; +}> = [ + { key: "model", label: "Model" }, + { key: "effort", label: "Effort" }, + { key: "prompt_style", label: "Prompt" }, + { key: "language", label: "Language" }, + { key: "human_language", label: "Human Lang" }, + { key: "tool_read", label: "Read" }, + { key: "tool_write", label: "Write" }, + { key: "tool_edit", label: "Edit" }, + { key: "tool_glob", label: "Glob" }, + { key: "tool_grep", label: "Grep" }, + { key: "linter", label: "Linter" }, + { key: "playwright", label: "Playwright" }, + { key: "context_file", label: "Context" }, + { key: "sub_agents", label: "Sub-agents" }, + { key: "web_search", label: "Web Search" }, + { key: "max_budget", label: "Budget" }, +]; + +function ConfigPills({ + axisKey, + label, + activeValue, + allValues, +}: { + axisKey: string; + label: string; + activeValue: string; + allValues: string[]; +}) { + return ( + <div style={{ display: "flex", alignItems: "center", gap: "8px", marginBottom: "6px" }}> + <div + style={{ + 
width: "90px", + fontSize: "0.7rem", + color: "var(--text-muted)", + textTransform: "uppercase", + letterSpacing: "0.03em", + textAlign: "right", + flexShrink: 0, + }} + > + {label} + </div> + <div style={{ display: "flex", gap: "4px", flexWrap: "wrap" }}> + {allValues.map((val) => ( + <span + key={val} + style={{ + padding: "2px 8px", + borderRadius: "4px", + fontSize: "0.7rem", + fontFamily: "var(--font-mono)", + background: + val === activeValue + ? "rgba(255, 255, 255, 0.1)" + : "transparent", + color: + val === activeValue ? "#fff" : "rgba(255, 255, 255, 0.2)", + border: + val === activeValue + ? "1px solid rgba(255, 255, 255, 0.3)" + : "1px solid rgba(255, 255, 255, 0.05)", + }} + > + {val} + </span> + ))} + </div> + </div> + ); +} + +function ExitCodeBadge({ code }: { code: number | undefined }) { + if (code === undefined || code === null) + return <span style={{ color: "var(--text-muted)" }}>?</span>; + + const label = EXIT_CODES[code] || `Exit ${code}`; + const isOk = code === 0; + + return ( + <div> + <span + style={{ + fontFamily: "var(--font-mono)", + fontWeight: 700, + fontSize: "1.75rem", + color: isOk ? "var(--green)" : "var(--red)", + }} + > + {code} + </span> + <div + style={{ + fontSize: "0.7rem", + color: isOk ? 
"var(--green)" : "var(--red)", + opacity: 0.8, + }} + > + {label} + </div> + </div> + ); +} + +function ScoreBar({ + label, + score, +}: { + label: string; + score: number | null | undefined; +}) { if (score === null || score === undefined) { return ( <div style={{ marginBottom: "8px" }}> - <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.8rem", marginBottom: "4px" }}> + <div + style={{ + display: "flex", + justifyContent: "space-between", + fontSize: "0.8rem", + marginBottom: "4px", + }} + > <span>{label}</span> <span style={{ color: "var(--text-muted)" }}>N/A</span> </div> @@ -19,61 +155,61 @@ function ScoreBar({ label, score }: { label: string; score: number | null | unde } const pct = Math.round(score * 100); - const color = pct >= 70 ? "var(--green)" : pct >= 40 ? "var(--yellow)" : "var(--red)"; + const color = + pct >= 70 ? "var(--green)" : pct >= 40 ? "var(--yellow)" : "var(--red)"; return ( <div style={{ marginBottom: "8px" }}> - <div style={{ display: "flex", justifyContent: "space-between", fontSize: "0.8rem", marginBottom: "4px" }}> + <div + style={{ + display: "flex", + justifyContent: "space-between", + fontSize: "0.8rem", + marginBottom: "4px", + }} + > <span>{label}</span> - <span style={{ fontFamily: "var(--font-mono)", fontWeight: 600, color }}>{pct}%</span> + <span + style={{ fontFamily: "var(--font-mono)", fontWeight: 600, color }} + > + {pct}% + </span> </div> - <div style={{ background: "var(--bg)", borderRadius: "4px", height: "6px", overflow: "hidden" }}> - <div style={{ width: `${pct}%`, height: "100%", background: color, borderRadius: "4px" }} /> + <div + style={{ + background: "var(--bg)", + borderRadius: "4px", + height: "6px", + overflow: "hidden", + }} + > + <div + style={{ + width: `${pct}%`, + height: "100%", + background: color, + borderRadius: "4px", + }} + /> </div> </div> ); } -export default function RunDetail({ run, transcriptLines }: RunDetailProps) { +export default function RunDetail({ + run, + 
transcriptLines, + axisValues, +}: RunDetailProps) { const { meta, eval_results, claude_output } = run; return ( <div style={{ display: "flex", flexDirection: "column", gap: "24px" }}> - {/* Config */} - <div className="card"> - <h3 style={{ marginBottom: "12px" }}>Configuration</h3> - <div style={{ display: "grid", gridTemplateColumns: "repeat(auto-fill, minmax(160px, 1fr))", gap: "12px" }}> - {[ - ["Task", meta.task], - ["Model", meta.model], - ["Effort", meta.effort], - ["Prompt", meta.prompt_style], - ["Language", meta.language], - ["Human Lang", meta.human_language], - ["Linter", meta.linter], - ["Playwright", meta.playwright], - ["Context", meta.context_file], - ["Sub-agents", meta.sub_agents], - ["Web Search", meta.web_search], - ["Budget", `$${meta.max_budget_usd}`], - ].map(([label, value]) => ( - <div key={label as string}> - <div style={{ fontSize: "0.7rem", color: "var(--text-muted)", textTransform: "uppercase" }}> - {label} - </div> - <div style={{ fontFamily: "var(--font-mono)", fontSize: "0.875rem" }}>{value}</div> - </div> - ))} - </div> - </div> - - {/* Metrics */} + {/* Top bar: stats */} <div className="stats-grid"> <div className="stat-card"> - <div className="stat-value" style={{ color: meta.exit_code === 0 ? "var(--green)" : "var(--red)" }}> - {meta.exit_code === 0 ? "OK" : `Exit ${meta.exit_code ?? "?"}`} - </div> - <div className="stat-label">Status</div> + <ExitCodeBadge code={meta.exit_code} /> + <div className="stat-label">Exit Code</div> </div> <div className="stat-card"> <div className="stat-value"> @@ -87,7 +223,9 @@ export default function RunDetail({ run, transcriptLines }: RunDetailProps) { </div> <div className="stat-card"> <div className="stat-value"> - {claude_output?.total_cost_usd != null ? `$${claude_output.total_cost_usd.toFixed(2)}` : "-"} + {claude_output?.total_cost_usd != null + ? 
`$${claude_output.total_cost_usd.toFixed(2)}` + : "-"} </div> <div className="stat-label">Cost</div> </div> @@ -98,42 +236,124 @@ export default function RunDetail({ run, transcriptLines }: RunDetailProps) { <div className="stat-card"> <div className="stat-value"> {claude_output?.usage - ? ((claude_output.usage.input_tokens ?? 0) + (claude_output.usage.output_tokens ?? 0)).toLocaleString() + ? ( + (claude_output.usage.input_tokens ?? 0) + + (claude_output.usage.output_tokens ?? 0) + ).toLocaleString() : "-"} </div> <div className="stat-label">Total Tokens</div> </div> </div> - {/* Scores */} - {eval_results && ( - <div className="card"> - <h3 style={{ marginBottom: "16px" }}>Evaluation Scores</h3> - <ScoreBar label="Overall" score={eval_results.score} /> - <ScoreBar label="Structural" score={eval_results.structural?.score} /> - <ScoreBar label="Functional" score={eval_results.functional?.score} /> - <ScoreBar label="Quality" score={eval_results.quality?.score} /> - - {/* Check details */} - {eval_results.structural?.checks && ( - <div style={{ marginTop: "16px" }}> - <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}> + {/* Config pills */} + <div className="card"> + <h3 style={{ marginBottom: "12px" }}>Configuration</h3> + {AXIS_CONFIG.map(({ key, label }) => { + const active = String( + (meta as Record<string, unknown>)[key] ?? 
"" + ); + const all = (axisValues as Record<string, string[]>)[key] || [ + active, + ]; + if (!active) return null; + return ( + <ConfigPills + key={key} + axisKey={key} + label={label} + activeValue={active} + allValues={all} + /> + ); + })} + </div> + + {/* Two-column: transcript left, scores right */} + <div + style={{ + display: "grid", + gridTemplateColumns: "1fr 360px", + gap: "24px", + alignItems: "start", + }} + > + {/* Transcript */} + <TranscriptViewer lines={transcriptLines} /> + + {/* Scores sidebar */} + <div + style={{ + display: "flex", + flexDirection: "column", + gap: "16px", + position: "sticky", + top: "16px", + }} + > + {eval_results && ( + <div className="card"> + <h3 style={{ marginBottom: "16px" }}>Scores</h3> + <ScoreBar label="Overall" score={eval_results.score} /> + <ScoreBar + label="Structural" + score={eval_results.structural?.score} + /> + <ScoreBar + label="Functional" + score={eval_results.functional?.score} + /> + <ScoreBar label="Quality" score={eval_results.quality?.score} /> + </div> + )} + + {eval_results?.structural?.checks && ( + <div className="card"> + <h4 + style={{ + fontSize: "0.8rem", + color: "var(--text-muted)", + marginBottom: "8px", + }} + > Structural Checks </h4> - {eval_results.structural.checks.map((check, i) => ( - <div key={i} style={{ display: "flex", gap: "8px", fontSize: "0.8rem", marginBottom: "4px" }}> - <span>{check.pass ? "+" : "-"}</span> - <span>{check.name}</span> - <span style={{ color: "var(--text-muted)" }}>{check.detail}</span> - </div> - ))} + {eval_results.structural.checks.map( + ( + check: { pass: boolean; name: string; detail: string }, + i: number + ) => ( + <div + key={i} + style={{ + display: "flex", + gap: "8px", + fontSize: "0.75rem", + marginBottom: "4px", + alignItems: "baseline", + }} + > + <span + style={{ + color: check.pass ? "var(--green)" : "var(--red)", + flexShrink: 0, + }} + > + {check.pass ? 
"+" : "-"} + </span> + <span style={{ fontFamily: "var(--font-mono)" }}> + {check.name} + </span> + <span style={{ color: "var(--text-muted)" }}> + {check.detail} + </span> + </div> + ) + )} </div> )} </div> - )} - - {/* Transcript */} - <TranscriptViewer lines={transcriptLines} /> + </div> </div> ); } diff --git a/dashboard/src/components/TranscriptViewer.tsx b/dashboard/src/components/TranscriptViewer.tsx @@ -164,8 +164,8 @@ export default function TranscriptViewer({ lines }: TranscriptViewerProps) { .filter(Boolean); return ( - <div className="card" style={{ maxHeight: "600px", overflow: "auto" }}> - <h3 style={{ marginBottom: "16px", position: "sticky", top: 0, background: "var(--bg-card)", paddingBottom: "8px" }}> + <div className="card" style={{ maxHeight: "80vh", overflow: "auto" }}> + <h3 style={{ marginBottom: "16px", position: "sticky", top: 0, background: "var(--bg-card)", paddingBottom: "8px", zIndex: 1 }}> Transcript ({events.length} events) </h3> {events.map((event, i) => renderEvent(event, i))} diff --git a/dashboard/src/pages/run/[id].astro b/dashboard/src/pages/run/[id].astro @@ -1,30 +1,46 @@ --- import Base from "../../layouts/Base.astro"; -import { loadAllRuns, loadTranscript } from "../../lib/data"; +import { loadAllRuns, loadTranscript, getAxisValues } from "../../lib/data"; import RunDetail from "../../components/RunDetail"; +const allRuns = loadAllRuns(); +const axisValues = getAxisValues(allRuns); + export function getStaticPaths() { const runs = loadAllRuns(); + const axisValues = getAxisValues(runs); return runs.map((run) => ({ params: { id: run.meta.run_id }, - props: { run }, + props: { run, axisValues }, })); } -const { run } = Astro.props; +const { run, axisValues: av } = Astro.props; const transcriptLines = loadTranscript(run.meta.run_id); + +// Format run ID for display +const parts = run.meta.run_id.split("_run"); +const runNum = parts.length > 1 ? 
`Run #${parts[parts.length - 1]}` : ""; --- -<Base title={`Run: ${run.meta.run_id}`}> +<Base title={`${run.meta.task} - ${run.meta.model} - ${run.meta.prompt_style}`}> <div style="margin-bottom: 16px;"> <a href="/" style="font-size: 0.875rem;">Back to Grid</a> </div> - <h1 style="margin-bottom: 4px; font-size: 1.25rem; font-family: var(--font-mono);"> - {run.meta.run_id} - </h1> - <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.875rem;"> - Run #{run.meta.run_number} - {run.meta.completed_at || "in progress"} + <div style="display: flex; align-items: baseline; gap: 12px; margin-bottom: 4px;"> + <h1 style="font-size: 1.5rem;">{run.meta.task}</h1> + <span style="color: var(--text-muted); font-size: 0.875rem;">{runNum}</span> + </div> + <p style="color: var(--text-muted); margin-bottom: 24px; font-size: 0.8rem;"> + {run.meta.completed_at || "in progress"} </p> - <RunDetail client:load run={run} transcriptLines={transcriptLines} /> + <RunDetail client:load run={run} transcriptLines={transcriptLines} axisValues={av} /> </Base> + +<style> + :global(.container) { + max-width: none !important; + padding: 0 32px !important; + } +</style> diff --git a/harness/run.py b/harness/run.py @@ -293,6 +293,16 @@ def main(): print("ERROR: claude CLI not found in PATH.") sys.exit(1) + # Capture claude version for metadata + claude_version = "unknown" + try: + result = subprocess.run( + ["claude", "--version"], capture_output=True, text=True, timeout=5 + ) + claude_version = result.stdout.strip() or result.stderr.strip() + except Exception: + pass + print("=" * 40) print("Loop Benchmarking Harness") print("=" * 40) @@ -354,6 +364,7 @@ def main(): **cell, "run_id": run_id, "run_number": run_num, + "claude_version": claude_version, "started_at": datetime.now(timezone.utc).isoformat(), } (run_dir / "meta.json").write_text(json.dumps(meta, indent=2))

Impressum · Datenschutz