commit dea4bd6e2cb1f3b32bb6b65cc2c87e920e57078d
parent 5e6a1d04950a4355252486add7a6f766dd9cb1de
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Mon, 6 Apr 2026 06:31:18 +0200
Fix quality scoring, add budget/timeout indicators
Quality scoring:
- The performance object was missing its .pass field, so the performance
check always scored as failed and quality was capped at 2/3 (67%).
quality.sh now sets pass=true/false from the 512 KB bundle-size check.
- All runs need re-eval to get corrected quality scores.
Budget/timeout indicators:
- Run detail: shows a yellow warning banner when cost >= 95% of budget or
the exit code is 124 (timeout), with the note "Results may be incomplete."
- Grid table: shows a yellow "!" icon in the task cell of any row where at least one run hit the budget or time limit.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/dashboard/src/components/Grid.tsx b/dashboard/src/components/Grid.tsx
@@ -238,7 +238,14 @@ export default function Grid({ runs, axisValues, tasks }: GridProps) {
))}
</div>
</td>
- <td>{g.meta.task}</td>
+ <td>
+ {g.meta.task}
+ {g.runs.some(r => {
+ const cost = r.claude_output?.total_cost_usd ?? 0;
+ const budget = r.meta.max_budget_usd ?? 0;
+ return (budget > 0 && cost >= budget * 0.95) || r.meta.exit_code === 124;
+ }) && <span style={{ color: "var(--yellow)", marginLeft: "4px", fontSize: "0.65rem" }} title="Budget or time limit reached">!</span>}
+ </td>
<td><span className="badge badge-neutral">{g.meta.model}</span></td>
<td>{g.meta.effort}</td>
<td>{g.meta.prompt_style}</td>
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -144,6 +144,27 @@ export default function RunDetail({ run, transcriptLines, axisValues, contextCon
{/* Stats */}
<div className="card" style={{ padding: "16px" }}>
<h3 style={{ marginBottom: "12px", fontSize: "0.85rem" }}>Metrics</h3>
+ {(() => {
+ const cost = claude_output?.total_cost_usd ?? 0;
+ const budget = meta.max_budget_usd ?? 0;
+ const hitBudget = budget > 0 && cost >= budget * 0.95;
+ const hitTimeout = meta.exit_code === 124;
+ if (hitBudget || hitTimeout) {
+ return (
+ <div style={{
+ background: "rgba(234, 179, 8, 0.1)", border: "1px solid var(--yellow)",
+ padding: "6px 10px", marginBottom: "12px", fontSize: "0.7rem",
+ color: "var(--yellow)", textTransform: "uppercase", letterSpacing: "0.5px",
+ display: "flex", gap: "8px", alignItems: "center",
+ }}>
+ {hitBudget && <span>Budget limit reached (${cost.toFixed(2)} / ${budget.toFixed(2)})</span>}
+ {hitTimeout && <span>Timeout (exceeded time limit)</span>}
+ <span style={{ color: "var(--text-muted)" }}>Results may be incomplete</span>
+ </div>
+ );
+ }
+ return null;
+ })()}
<div style={{ display: "grid", gridTemplateColumns: "1fr 1fr", gap: "12px" }}>
<div>
<ExitCodeBadge code={meta.exit_code} />
diff --git a/tasks/tetris/eval/quality.sh b/tasks/tetris/eval/quality.sh
@@ -75,10 +75,10 @@ total_size=${total_size:-0}
if [[ "$total_size" -gt 524288 ]]; then
results=$(echo "$results" | jq --argjson s "$total_size" \
- '. + {performance: {bundle_size_bytes: $s, size_under_512kb: false}}')
+ '. + {performance: {pass: false, bundle_size_bytes: $s, size_under_512kb: false}}')
else
results=$(echo "$results" | jq --argjson s "$total_size" \
- '. + {performance: {bundle_size_bytes: $s, size_under_512kb: true}}')
+ '. + {performance: {pass: true, bundle_size_bytes: $s, size_under_512kb: true}}')
fi
# --- Compute aggregate quality score ---