loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 94409286cf8c72a0a629e21c349d23425e0739ad
parent ee245799e717394d2faaa0060b8b1b0a24fed503
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:26:56 +0200

Add re-eval command, show all eval dimensions in run detail UI

reeval.py:
- Re-runs eval scripts against existing workspace archives
- Supports -j N for parallel execution
- Uses artifact directories or extracts from .tar.gz
- Usage: python3 harness/reeval.py -j 4

Run detail page now shows:
- 7 score bars (overall, structural, functional, quality, code analysis,
  transcript, gameplay bot)
- Code analysis card: files, LOC, deps, function length, nesting depth,
  naming consistency, comments ratio, separation of concerns, duplication
- Agent behavior card: tool call breakdown, wasted turns, productivity
  ratio, self-testing, error count

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/RunDetail.tsx | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
A harness/reeval.py | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 227 insertions(+), 16 deletions(-)

diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx @@ -3,6 +3,15 @@ import TranscriptViewer from "./TranscriptViewer"; const REPO_URL = "https://git.statagroup.com/research/loop-benchmarking"; +function Stat({ label, value }: { label: string; value: string | number | boolean }) { + return ( + <div style={{ display: "flex", justifyContent: "space-between" }}> + <span style={{ color: "var(--text-muted)" }}>{label}</span> + <span style={{ fontFamily: "var(--font-mono)" }}>{String(value)}</span> + </div> + ); +} + interface RunDetailProps { run: Run; transcriptLines: string[]; @@ -193,7 +202,7 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai })} </div> - {/* Scores + checks */} + {/* Scores */} <div className="card" style={{ padding: "16px" }}> <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Evaluation</h3> {eval_results && ( @@ -202,26 +211,82 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai <ScoreBar label="Structural" score={eval_results.structural?.score} /> <ScoreBar label="Functional" score={eval_results.functional?.score} /> <ScoreBar label="Quality" score={eval_results.quality?.score} /> + <ScoreBar label="Code Analysis" score={(eval_results as Record<string, any>).code_analysis?.score} /> + <ScoreBar label="Transcript" score={(eval_results as Record<string, any>).transcript_analysis?.score} /> + <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} /> </> )} - {eval_results?.structural?.checks && ( - <div style={{ marginTop: "10px", borderTop: "1px solid var(--border)", paddingTop: "8px" }}> - <div style={{ fontSize: "0.7rem", color: "var(--text-muted)", marginBottom: "4px" }}>Checks</div> - {eval_results.structural.checks.map( - (check: { pass: boolean; name: string; detail: string }, i: number) => ( - <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}> - 
<span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}> - {check.pass ? "+" : "-"} - </span> - <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span> - </div> - ) - )} - </div> - )} </div> </div> + {/* Detail cards row */} + <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr 1fr", gap: "16px" }}> + {/* Structural checks */} + {eval_results?.structural?.checks && ( + <div className="card" style={{ padding: "16px" }}> + <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Structural Checks</h4> + {eval_results.structural.checks.map( + (check: { pass: boolean; name: string; detail: string }, i: number) => ( + <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}> + <span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}> + {check.pass ? "+" : "-"} + </span> + <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span> + </div> + ) + )} + </div> + )} + + {/* Code analysis details */} + {(eval_results as Record<string, any>)?.code_analysis && !(eval_results as Record<string, any>).code_analysis.error && ( + <div className="card" style={{ padding: "16px" }}> + <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Code Analysis</h4> + {(() => { + const ca = (eval_results as Record<string, any>).code_analysis; + return ( + <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}> + <Stat label="Files" value={`${ca.files?.code ?? "?"} code, ${ca.files?.unnecessary ?? 0} unnecessary`} /> + <Stat label="Lines of code" value={ca.lines_of_code} /> + <Stat label="Dependencies" value={ca.dependencies?.total ?? 0} /> + <Stat label="Complexity" value={ca.complexity} /> + <Stat label="Functions" value={`${ca.function_length?.count ?? "?"} (avg ${ca.function_length?.average ?? "?"} lines, max ${ca.function_length?.max ?? 
"?"})`} /> + <Stat label="Max nesting" value={`${ca.max_nesting_depth} levels`} /> + <Stat label="Naming" value={`${ca.naming?.dominant_style} (${ca.naming?.consistency_pct}% consistent)`} /> + <Stat label="Comments" value={`${ca.comments?.ratio_pct ?? 0}% of source`} /> + <Stat label="Separation" value={ca.separation_of_concerns?.verdict} /> + <Stat label="Console.logs" value={ca.console_logs} /> + <Stat label="Duplication" value={`${ca.duplication_percentage ?? 0}%`} /> + {ca.html_validation && <Stat label="HTML valid" value={ca.html_validation.valid ? "yes" : `no (${ca.html_validation.errors} errors)`} />} + </div> + ); + })()} + </div> + )} + + {/* Transcript analysis details */} + {(eval_results as Record<string, any>)?.transcript_analysis && !(eval_results as Record<string, any>).transcript_analysis.error && ( + <div className="card" style={{ padding: "16px" }}> + <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Agent Behavior</h4> + {(() => { + const ta = (eval_results as Record<string, any>).transcript_analysis; + return ( + <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}> + <Stat label="Tool calls" value={ta.tool_calls?.total ?? 0} /> + <Stat label="Bash" value={ta.tool_calls?.bash ?? 0} /> + <Stat label="Write/Edit" value={`${ta.tool_calls?.write ?? 0} / ${ta.tool_calls?.edit ?? 0}`} /> + <Stat label="Wasted turns" value={`${ta.wasted_turns?.total ?? 0} (${ta.wasted_turns?.docs ?? 0} docs, ${ta.wasted_turns?.ascii_art ?? 0} ascii, ${ta.wasted_turns?.server_starts ?? 0} server)`} /> + <Stat label="Errors hit" value={ta.errors_encountered ?? 0} /> + <Stat label="Productivity" value={`${((ta.productivity_ratio ?? 0) * 100).toFixed(0)}%`} /> + <Stat label="Self-tested" value={ta.self_tested ? "yes" : "no"} /> + <Stat label="Thinking blocks" value={ta.thinking_blocks ?? 
0} /> + </div> + ); + })()} + </div> + )} + </div> + {/* Bottom: transcript + artifact preview */} <div style={{ display: "grid", diff --git a/harness/reeval.py b/harness/reeval.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""Re-evaluate existing runs with current eval scripts. + +Extracts workspace archives, runs all eval scripts against them, +and overwrites eval_results.json. Does NOT re-run claude. + +Usage: + python3 reeval.py [results_dir] [-j N] +""" + +import json +import shutil +import subprocess +import sys +import tarfile +import tempfile +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_DIR = SCRIPT_DIR.parent + +sys.path.insert(0, str(SCRIPT_DIR / "lib")) +from run import evaluate, run_eval_script, safe_parse_json + + +def reeval_single(run_dir: Path, project_dir: Path) -> str: + """Re-evaluate a single run.""" + meta_path = run_dir / "meta.json" + archive_path = run_dir / "workspace.tar.gz" + + if not meta_path.exists(): + return "skip_no_meta" + + meta = json.loads(meta_path.read_text()) + task = meta.get("task", "") + language = meta.get("language", "typescript") + run_id = meta.get("run_id", run_dir.name) + task_dir = project_dir / "tasks" / task + + if not task_dir.exists(): + print(f" SKIP {run_id}: task '{task}' not found") + return "skip_no_task" + + # Extract workspace from archive if available + workspace = None + artifact_dir = project_dir / "dashboard" / "public" / "artifacts" / run_dir.name + + if artifact_dir.exists() and any(artifact_dir.iterdir()): + # Use existing artifact directory as workspace + workspace = artifact_dir + elif archive_path.exists(): + # Extract from archive to temp dir + workspace = Path(tempfile.mkdtemp(prefix="reeval-")) + try: + with tarfile.open(archive_path, "r:gz") as tar: + tar.extractall(workspace, filter="data") + # The archive has a subdirectory, find it + subdirs = [d for d in workspace.iterdir() if 
d.is_dir()] + if subdirs: + workspace = subdirs[0] + except Exception as e: + print(f" ERROR {run_id}: failed to extract archive: {e}") + shutil.rmtree(workspace, ignore_errors=True) + return "error" + + if workspace is None: + print(f" SKIP {run_id}: no workspace or archive") + return "skip_no_workspace" + + print(f" EVAL {run_id} ({task}, {meta.get('model', '?')})") + + # Build cell dict from meta + cell = dict(meta) + + # Run evaluation + evaluate(task_dir, workspace, cell, run_dir) + + # Clean up temp workspace (but not artifact dirs) + if not str(workspace).startswith(str(project_dir / "dashboard")): + shutil.rmtree(workspace, ignore_errors=True) + + return "completed" + + +def main(): + args = sys.argv[1:] + parallel = 1 + results_dir = PROJECT_DIR / "results" + + i = 0 + while i < len(args): + if args[i] == "-j" and i + 1 < len(args): + parallel = int(args[i + 1]) + i += 2 + else: + results_dir = Path(args[i]) + i += 1 + + runs_dir = results_dir / "runs" + if not runs_dir.exists(): + print("No runs directory found.") + return + + run_dirs = sorted([d for d in runs_dir.iterdir() if d.is_dir()]) + print(f"Re-evaluating {len(run_dirs)} runs (parallel={parallel})") + print() + + completed = 0 + skipped = 0 + errors = 0 + + if parallel <= 1: + for run_dir in run_dirs: + result = reeval_single(run_dir, PROJECT_DIR) + if result == "completed": + completed += 1 + elif result.startswith("skip"): + skipped += 1 + else: + errors += 1 + else: + with ThreadPoolExecutor(max_workers=parallel) as executor: + futures = { + executor.submit(reeval_single, rd, PROJECT_DIR): rd + for rd in run_dirs + } + for future in as_completed(futures): + try: + result = future.result() + except Exception as e: + print(f" ERROR: {e}") + result = "error" + if result == "completed": + completed += 1 + elif result.startswith("skip"): + skipped += 1 + else: + errors += 1 + + print() + print(f"Done. 
Completed: {completed} | Skipped: {skipped} | Errors: {errors}") + + +if __name__ == "__main__": + main()

Impressum · Datenschutz