loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit dea4bd6e2cb1f3b32bb6b65cc2c87e920e57078d
parent 5e6a1d04950a4355252486add7a6f766dd9cb1de
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 06:31:18 +0200

Fix quality scoring, add budget/timeout indicators

Quality scoring:
- The performance object was missing its .pass field, so it always
  scored as failed, capping the quality score at 2/3 (67%). It now
  sets pass=true/false correctly.
- All runs need re-eval to get corrected quality scores.

Budget/timeout indicators:
- Run detail: yellow warning banner when cost >= 95% of budget or the
  exit code is 124 (timeout); the banner reads "Results may be incomplete."
- Grid table: yellow ! icon on cells where any run hit limits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Grid.tsx      |  9 ++++++++-
M dashboard/src/components/RunDetail.tsx | 21 +++++++++++++++++++++
M tasks/tetris/eval/quality.sh           |  4 ++--
3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx @@ -238,7 +238,14 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) { ))} </div> </td> - <td>{g.meta.task}</td> + <td> + {g.meta.task} + {g.runs.some(r => { + const cost = r.claude_output?.total_cost_usd ?? 0; + const budget = r.meta.max_budget_usd ?? 0; + return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124; + }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>} + </td> <td><span className="badge badge-neutral">{g.meta.model}</span></td> <td>{g.meta.effort}</td> <td>{g.meta.prompt_style}</td> diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx @@ -144,6 +144,27 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon {/* Stats */} <div className="card" style={{ padding: "16px" }}> <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Metrics</h3> + {(() => { + const cost = claude_output?.total_cost_usd ?? 0; + const budget = meta.max_budget_usd ?? 
0; + const hitBudget = budget > 0 && cost >= budget * 0.95; + const hitTimeout = meta.exit_code === 124; + if (hitBudget || hitTimeout) { + return ( + <div style={{ + background: "rgba(234, 179, 8, 0.1)", border: "1px solid var(--yellow)", + padding: "6px 10px", marginBottom: "12px", fontSize: "0.7rem", + color: "var(--yellow)", textTransform: "uppercase", letterSpacing: "0.5px", + display: "flex", gap: "8px", alignItems: "center", + }}> + {hitBudget && <span>Budget limit reached (${cost.toFixed(2)} / ${budget.toFixed(2)})</span>} + {hitTimeout && <span>Timeout (exceeded time limit)</span>} + <span style={{ color: "var(--text-muted)" }}>Results may be incomplete</span> + </div> + ); + } + return null; + })()} <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "12px" }}> <div> <ExitCodeBadge code={meta.exit_code} /> diff --git a/tasks/tetris/eval/quality.sh b/tasks/tetris/eval/quality.sh @@ -75,10 +75,10 @@ total_size=${total_size:-0} if [[ "$total_size" -gt 524288 ]]; then results=$(echo "$results" | jq --argjson s "$total_size" \ - '. + {performance: {bundle_size_bytes: $s, size_under_512kb: false}}') + '. + {performance: {pass: false, bundle_size_bytes: $s, size_under_512kb: false}}') else results=$(echo "$results" | jq --argjson s "$total_size" \ - '. + {performance: {bundle_size_bytes: $s, size_under_512kb: true}}') + '. + {performance: {pass: true, bundle_size_bytes: $s, size_under_512kb: true}}') fi # --- Compute aggregate quality score ---

Impressum · Datenschutz