commit 94409286cf8c72a0a629e21c349d23425e0739ad
parent ee245799e717394d2faaa0060b8b1b0a24fed503
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Sat, 4 Apr 2026 08:26:56 +0200
Add re-eval command, show all eval dimensions in run detail UI
reeval.py:
- Re-runs eval scripts against existing workspace archives
- Supports -j N for parallel execution
- Uses artifact directories or extracts from .tar.gz
- Usage: python3 harness/reeval.py -j 4
Run detail page now shows:
- 7 score bars (overall, structural, functional, quality, code analysis,
transcript, gameplay bot)
- Code analysis card: files, LOC, deps, function length, nesting depth,
naming consistency, comments ratio, separation of concerns, duplication
- Agent behavior card: tool call breakdown, wasted turns, productivity
ratio, self-testing, error count
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 227 insertions(+), 16 deletions(-)
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -3,6 +3,15 @@ import TranscriptViewer from "./TranscriptViewer";
const REPO_URL = "https://git.statagroup.com/research/loop-benchmarking";
+/**
+ * One label/value row inside a detail card: muted label on the left,
+ * monospace value on the right.
+ *
+ * Callers read eval payloads through untyped casts, so a field may be absent
+ * at runtime even though a value appears to be passed. A null/undefined value
+ * renders as "?" rather than the literal text "undefined" / "null".
+ */
+function Stat({ label, value }: { label: string; value?: string | number | boolean | null }) {
+  // `== null` deliberately matches both null and undefined; 0, "" and false still render.
+  const text = value == null ? "?" : String(value);
+  return (
+    <div style={{ display: "flex", justifyContent: "space-between" }}>
+      <span style={{ color: "var(--text-muted)" }}>{label}</span>
+      <span style={{ fontFamily: "var(--font-mono)" }}>{text}</span>
+    </div>
+  );
+}
+
interface RunDetailProps {
run: Run;
transcriptLines: string[];
@@ -193,7 +202,7 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai
})}
</div>
- {/* Scores + checks */}
+ {/* Scores */}
<div className="card" style={{ padding: "16px" }}>
<h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Evaluation</h3>
{eval_results && (
@@ -202,26 +211,82 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai
<ScoreBar label="Structural" score={eval_results.structural?.score} />
<ScoreBar label="Functional" score={eval_results.functional?.score} />
<ScoreBar label="Quality" score={eval_results.quality?.score} />
+ <ScoreBar label="Code Analysis" score={(eval_results as Record<string, any>).code_analysis?.score} />
+ <ScoreBar label="Transcript" score={(eval_results as Record<string, any>).transcript_analysis?.score} />
+ <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
</>
)}
- {eval_results?.structural?.checks && (
- <div style={{ marginTop: "10px", borderTop: "1px solid var(--border)", paddingTop: "8px" }}>
- <div style={{ fontSize: "0.7rem", color: "var(--text-muted)", marginBottom: "4px" }}>Checks</div>
- {eval_results.structural.checks.map(
- (check: { pass: boolean; name: string; detail: string }, i: number) => (
- <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}>
- <span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}>
- {check.pass ? "+" : "-"}
- </span>
- <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span>
- </div>
- )
- )}
- </div>
- )}
</div>
</div>
+ {/* Detail cards row */}
+ <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr 1fr", gap: "16px" }}>
+ {/* Structural checks */}
+ {eval_results?.structural?.checks && (
+ <div className="card" style={{ padding: "16px" }}>
+ <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Structural Checks</h4>
+ {eval_results.structural.checks.map(
+ (check: { pass: boolean; name: string; detail: string }, i: number) => (
+ <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}>
+ <span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}>
+ {check.pass ? "+" : "-"}
+ </span>
+ <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span>
+ </div>
+ )
+ )}
+ </div>
+ )}
+
+ {/* Code analysis details */}
+ {(eval_results as Record<string, any>)?.code_analysis && !(eval_results as Record<string, any>).code_analysis.error && (
+ <div className="card" style={{ padding: "16px" }}>
+ <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Code Analysis</h4>
+ {(() => {
+ const ca = (eval_results as Record<string, any>).code_analysis;
+ return (
+ <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}>
+ <Stat label="Files" value={`${ca.files?.code ?? "?"} code, ${ca.files?.unnecessary ?? 0} unnecessary`} />
+ <Stat label="Lines of code" value={ca.lines_of_code} />
+ <Stat label="Dependencies" value={ca.dependencies?.total ?? 0} />
+ <Stat label="Complexity" value={ca.complexity} />
+ <Stat label="Functions" value={`${ca.function_length?.count ?? "?"} (avg ${ca.function_length?.average ?? "?"} lines, max ${ca.function_length?.max ?? "?"})`} />
+ <Stat label="Max nesting" value={`${ca.max_nesting_depth} levels`} />
+ <Stat label="Naming" value={`${ca.naming?.dominant_style} (${ca.naming?.consistency_pct}% consistent)`} />
+ <Stat label="Comments" value={`${ca.comments?.ratio_pct ?? 0}% of source`} />
+ <Stat label="Separation" value={ca.separation_of_concerns?.verdict} />
+ <Stat label="Console.logs" value={ca.console_logs} />
+ <Stat label="Duplication" value={`${ca.duplication_percentage ?? 0}%`} />
+ {ca.html_validation && <Stat label="HTML valid" value={ca.html_validation.valid ? "yes" : `no (${ca.html_validation.errors} errors)`} />}
+ </div>
+ );
+ })()}
+ </div>
+ )}
+
+ {/* Transcript analysis details */}
+ {(eval_results as Record<string, any>)?.transcript_analysis && !(eval_results as Record<string, any>).transcript_analysis.error && (
+ <div className="card" style={{ padding: "16px" }}>
+ <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Agent Behavior</h4>
+ {(() => {
+ const ta = (eval_results as Record<string, any>).transcript_analysis;
+ return (
+ <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}>
+ <Stat label="Tool calls" value={ta.tool_calls?.total ?? 0} />
+ <Stat label="Bash" value={ta.tool_calls?.bash ?? 0} />
+ <Stat label="Write/Edit" value={`${ta.tool_calls?.write ?? 0} / ${ta.tool_calls?.edit ?? 0}`} />
+ <Stat label="Wasted turns" value={`${ta.wasted_turns?.total ?? 0} (${ta.wasted_turns?.docs ?? 0} docs, ${ta.wasted_turns?.ascii_art ?? 0} ascii, ${ta.wasted_turns?.server_starts ?? 0} server)`} />
+ <Stat label="Errors hit" value={ta.errors_encountered ?? 0} />
+ <Stat label="Productivity" value={`${((ta.productivity_ratio ?? 0) * 100).toFixed(0)}%`} />
+ <Stat label="Self-tested" value={ta.self_tested ? "yes" : "no"} />
+ <Stat label="Thinking blocks" value={ta.thinking_blocks ?? 0} />
+ </div>
+ );
+ })()}
+ </div>
+ )}
+ </div>
+
{/* Bottom: transcript + artifact preview */}
<div style={{
display: "grid",
diff --git a/harness/reeval.py b/harness/reeval.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""Re-evaluate existing runs with current eval scripts.
+
+Uses each run's existing artifact directory when one is present, otherwise
+extracts its workspace archive; runs all eval scripts against that workspace
+and overwrites eval_results.json. Does NOT re-run claude.
+
+Usage:
+ python3 reeval.py [results_dir] [-j N]
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+# Directory containing this script, and its parent (used as the project root
+# for locating results/, tasks/ and dashboard/).
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_DIR = SCRIPT_DIR.parent
+
+# Put <script dir>/lib at the front of the import path before importing `run`.
+# NOTE(review): presumably `run` lives in lib/ or next to this script -- confirm.
+sys.path.insert(0, str(SCRIPT_DIR / "lib"))
+from run import evaluate, run_eval_script, safe_parse_json
+
+
+def reeval_single(run_dir: Path, project_dir: Path) -> str:
+ """Re-evaluate a single run."""
+ meta_path = run_dir / "meta.json"
+ archive_path = run_dir / "workspace.tar.gz"
+
+ if not meta_path.exists():
+ return "skip_no_meta"
+
+ meta = json.loads(meta_path.read_text())
+ task = meta.get("task", "")
+ language = meta.get("language", "typescript")
+ run_id = meta.get("run_id", run_dir.name)
+ task_dir = project_dir / "tasks" / task
+
+ if not task_dir.exists():
+ print(f" SKIP {run_id}: task '{task}' not found")
+ return "skip_no_task"
+
+ # Extract workspace from archive if available
+ workspace = None
+ artifact_dir = project_dir / "dashboard" / "public" / "artifacts" / run_dir.name
+
+ if artifact_dir.exists() and any(artifact_dir.iterdir()):
+ # Use existing artifact directory as workspace
+ workspace = artifact_dir
+ elif archive_path.exists():
+ # Extract from archive to temp dir
+ workspace = Path(tempfile.mkdtemp(prefix="reeval-"))
+ try:
+ with tarfile.open(archive_path, "r:gz") as tar:
+ tar.extractall(workspace, filter="data")
+ # The archive has a subdirectory, find it
+ subdirs = [d for d in workspace.iterdir() if d.is_dir()]
+ if subdirs:
+ workspace = subdirs[0]
+ except Exception as e:
+ print(f" ERROR {run_id}: failed to extract archive: {e}")
+ shutil.rmtree(workspace, ignore_errors=True)
+ return "error"
+
+ if workspace is None:
+ print(f" SKIP {run_id}: no workspace or archive")
+ return "skip_no_workspace"
+
+ print(f" EVAL {run_id} ({task}, {meta.get('model', '?')})")
+
+ # Build cell dict from meta
+ cell = dict(meta)
+
+ # Run evaluation
+ evaluate(task_dir, workspace, cell, run_dir)
+
+ # Clean up temp workspace (but not artifact dirs)
+ if not str(workspace).startswith(str(project_dir / "dashboard")):
+ shutil.rmtree(workspace, ignore_errors=True)
+
+ return "completed"
+
+
+def main():
+ args = sys.argv[1:]
+ parallel = 1
+ results_dir = PROJECT_DIR / "results"
+
+ i = 0
+ while i < len(args):
+ if args[i] == "-j" and i + 1 < len(args):
+ parallel = int(args[i + 1])
+ i += 2
+ else:
+ results_dir = Path(args[i])
+ i += 1
+
+ runs_dir = results_dir / "runs"
+ if not runs_dir.exists():
+ print("No runs directory found.")
+ return
+
+ run_dirs = sorted([d for d in runs_dir.iterdir() if d.is_dir()])
+ print(f"Re-evaluating {len(run_dirs)} runs (parallel={parallel})")
+ print()
+
+ completed = 0
+ skipped = 0
+ errors = 0
+
+ if parallel <= 1:
+ for run_dir in run_dirs:
+ result = reeval_single(run_dir, PROJECT_DIR)
+ if result == "completed":
+ completed += 1
+ elif result.startswith("skip"):
+ skipped += 1
+ else:
+ errors += 1
+ else:
+ with ThreadPoolExecutor(max_workers=parallel) as executor:
+ futures = {
+ executor.submit(reeval_single, rd, PROJECT_DIR): rd
+ for rd in run_dirs
+ }
+ for future in as_completed(futures):
+ try:
+ result = future.result()
+ except Exception as e:
+ print(f" ERROR: {e}")
+ result = "error"
+ if result == "completed":
+ completed += 1
+ elif result.startswith("skip"):
+ skipped += 1
+ else:
+ errors += 1
+
+ print()
+ print(f"Done. Completed: {completed} | Skipped: {skipped} | Errors: {errors}")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+ main()