Fix quality scoring, add budget/timeout indicators - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit dea4bd6e2cb1f3b32bb6b65cc2c87e920e57078d
parent 5e6a1d04950a4355252486add7a6f766dd9cb1de
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon,  6 Apr 2026 06:31:18 +0200

Fix quality scoring, add budget/timeout indicators

Quality scoring:
- The performance object was missing its .pass field, so that check always
  scored as failed (capping quality at 2/3 = 67%). quality.sh now sets pass=true/false from the bundle-size check.
- All runs need re-eval to get corrected quality scores.

Budget/timeout indicators:
- Run detail: shows a yellow warning banner when cost >= 95% of budget or
  the exit code is 124 (timeout), with the note "Results may be incomplete."
- Grid table: yellow ! icon on cells where any run hit limits.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/Grid.tsx  | 9 ++++++++-
M dashboard/src/components/RunDetail.tsx  | 21 +++++++++++++++++++++
M tasks/tetris/eval/quality.sh  | 4 ++--

3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx
@@ -238,7 +238,14 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
                         ))}
                       </div>
                     </td>
-                    <td>{g.meta.task}</td>
+                    <td>
+                      {g.meta.task}
+                      {g.runs.some(r => {
+                        const cost = r.claude_output?.total_cost_usd ?? 0;
+                        const budget = r.meta.max_budget_usd ?? 0;
+                        return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124;
+                      }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>}
+                    </td>
                     <td><span className="badge badge-neutral">{g.meta.model}</span></td>
                     <td>{g.meta.effort}</td>
                     <td>{g.meta.prompt_style}</td>
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -144,6 +144,27 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
         {/* Stats */}
         <div className="card" style={{ padding: "16px" }}>
           <h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Metrics</h3>
+          {(() => {
+            const cost = claude_output?.total_cost_usd ?? 0;
+            const budget = meta.max_budget_usd ?? 0;
+            const hitBudget = budget > 0 && cost >= budget * 0.95;
+            const hitTimeout = meta.exit_code === 124;
+            if (hitBudget || hitTimeout) {
+              return (
+                <div style={{
+                  background: "rgba(234, 179, 8, 0.1)", border: "1px solid var(--yellow)",
+                  padding: "6px 10px", marginBottom: "12px", fontSize: "0.7rem",
+                  color: "var(--yellow)", textTransform: "uppercase", letterSpacing: "0.5px",
+                  display: "flex", gap: "8px", alignItems: "center",
+                }}>
+                  {hitBudget && <span>Budget limit reached (${cost.toFixed(2)} / ${budget.toFixed(2)})</span>}
+                  {hitTimeout && <span>Timeout (exceeded time limit)</span>}
+                  <span style={{ color: "var(--text-muted)" }}>Results may be incomplete</span>
+                </div>
+              );
+            }
+            return null;
+          })()}
           <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "12px" }}>
             <div>
               <ExitCodeBadge code={meta.exit_code} />
diff --git a/tasks/tetris/eval/quality.sh b/tasks/tetris/eval/quality.sh
@@ -75,10 +75,10 @@ total_size=${total_size:-0}
 
 if [[ "$total_size" -gt 524288 ]]; then
   results=$(echo "$results" | jq --argjson s "$total_size" \
-    '. + {performance: {bundle_size_bytes: $s, size_under_512kb: false}}')
+    '. + {performance: {pass: false, bundle_size_bytes: $s, size_under_512kb: false}}')
 else
   results=$(echo "$results" | jq --argjson s "$total_size" \
-    '. + {performance: {bundle_size_bytes: $s, size_under_512kb: true}}')
+    '. + {performance: {pass: true, bundle_size_bytes: $s, size_under_512kb: true}}')
 fi
 
 # --- Compute aggregate quality score ---

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README

M	dashboard/src/components/Grid.tsx	\|	9	++++++++-
M	dashboard/src/components/RunDetail.tsx	\|	21	+++++++++++++++++++++
M	tasks/tetris/eval/quality.sh	\|	4	++--