Add re-eval command, show all eval dimensions in run detail UI - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 94409286cf8c72a0a629e21c349d23425e0739ad
parent ee245799e717394d2faaa0060b8b1b0a24fed503
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Sat,  4 Apr 2026 08:26:56 +0200

Add re-eval command, show all eval dimensions in run detail UI

reeval.py:
- Re-runs eval scripts against existing workspace archives
- Supports -j N for parallel execution
- Uses artifact directories or extracts from .tar.gz
- Usage: python3 harness/reeval.py -j 4

Run detail page now shows:
- 7 score bars (overall, structural, functional, quality, code analysis,
  transcript, gameplay bot)
- Code analysis card: files, LOC, deps, function length, nesting depth,
  naming consistency, comments ratio, separation of concerns, duplication
- Agent behavior card: tool call breakdown, wasted turns, productivity
  ratio, self-testing, error count

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M dashboard/src/components/RunDetail.tsx  | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
A harness/reeval.py  | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 227 insertions(+), 16 deletions(-)
diff --git a/dashboard/src/components/RunDetail.tsx b/dashboard/src/components/RunDetail.tsx
@@ -3,6 +3,15 @@ import TranscriptViewer from "./TranscriptViewer";
 
 const REPO_URL = "https://git.statagroup.com/research/loop-benchmarking";
 
+/**
+ * One label/value row inside a detail card: muted label on the left,
+ * monospace value on the right.
+ *
+ * `value` is stringified for display. The detail cards read the eval payload
+ * through `Record<string, any>` casts, so a metric can be absent at runtime
+ * even though the call type-checks; a missing (`null`/`undefined`) value is
+ * rendered as "?" rather than the literal text "undefined". `false` and `0`
+ * are real values and are shown as-is.
+ */
+function Stat({ label, value }: { label: string; value?: string | number | boolean | null }) {
+  return (
+    <div style={{ display: "flex", justifyContent: "space-between" }}>
+      <span style={{ color: "var(--text-muted)" }}>{label}</span>
+      <span style={{ fontFamily: "var(--font-mono)" }}>{String(value ?? "?")}</span>
+    </div>
+  );
+}
+
 interface RunDetailProps {
   run: Run;
   transcriptLines: string[];
@@ -193,7 +202,7 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai
           })}
         </div>
 
-        {/* Scores + checks */}
+        {/* Scores */}
         <div className="card" style={{ padding: "16px" }}>
           <h3 style={{ marginBottom: "10px", fontSize: "0.85rem" }}>Evaluation</h3>
           {eval_results && (
@@ -202,26 +211,82 @@ export default function RunDetail({ run, transcriptLines, axisValues }: RunDetai
               <ScoreBar label="Structural" score={eval_results.structural?.score} />
               <ScoreBar label="Functional" score={eval_results.functional?.score} />
               <ScoreBar label="Quality" score={eval_results.quality?.score} />
+              <ScoreBar label="Code Analysis" score={(eval_results as Record<string, any>).code_analysis?.score} />
+              <ScoreBar label="Transcript" score={(eval_results as Record<string, any>).transcript_analysis?.score} />
+              <ScoreBar label="Gameplay Bot" score={(eval_results as Record<string, any>).gameplay_bot?.score} />
             </>
           )}
-          {eval_results?.structural?.checks && (
-            <div style={{ marginTop: "10px", borderTop: "1px solid var(--border)", paddingTop: "8px" }}>
-              <div style={{ fontSize: "0.7rem", color: "var(--text-muted)", marginBottom: "4px" }}>Checks</div>
-              {eval_results.structural.checks.map(
-                (check: { pass: boolean; name: string; detail: string }, i: number) => (
-                  <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}>
-                    <span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}>
-                      {check.pass ? "+" : "-"}
-                    </span>
-                    <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span>
-                  </div>
-                )
-              )}
-            </div>
-          )}
         </div>
       </div>
 
+      {/* Detail cards row */}
+      <div style={{ display: "grid", gridTemplateColumns: "1fr 1fr 1fr", gap: "16px" }}>
+        {/* Structural checks */}
+        {eval_results?.structural?.checks && (
+          <div className="card" style={{ padding: "16px" }}>
+            <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Structural Checks</h4>
+            {eval_results.structural.checks.map(
+              (check: { pass: boolean; name: string; detail: string }, i: number) => (
+                <div key={i} style={{ display: "flex", gap: "6px", fontSize: "0.7rem", marginBottom: "2px" }}>
+                  <span style={{ color: check.pass ? "var(--green)" : "var(--red)", flexShrink: 0 }}>
+                    {check.pass ? "+" : "-"}
+                  </span>
+                  <span style={{ fontFamily: "var(--font-mono)" }}>{check.name}</span>
+                </div>
+              )
+            )}
+          </div>
+        )}
+
+        {/* Code analysis details */}
+        {(eval_results as Record<string, any>)?.code_analysis && !(eval_results as Record<string, any>).code_analysis.error && (
+          <div className="card" style={{ padding: "16px" }}>
+            <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Code Analysis</h4>
+            {(() => {
+              const ca = (eval_results as Record<string, any>).code_analysis;
+              return (
+                <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}>
+                  <Stat label="Files" value={`${ca.files?.code ?? "?"} code, ${ca.files?.unnecessary ?? 0} unnecessary`} />
+                  <Stat label="Lines of code" value={ca.lines_of_code} />
+                  <Stat label="Dependencies" value={ca.dependencies?.total ?? 0} />
+                  <Stat label="Complexity" value={ca.complexity} />
+                  <Stat label="Functions" value={`${ca.function_length?.count ?? "?"} (avg ${ca.function_length?.average ?? "?"}  lines, max ${ca.function_length?.max ?? "?"})`} />
+                  <Stat label="Max nesting" value={`${ca.max_nesting_depth} levels`} />
+                  <Stat label="Naming" value={`${ca.naming?.dominant_style} (${ca.naming?.consistency_pct}% consistent)`} />
+                  <Stat label="Comments" value={`${ca.comments?.ratio_pct ?? 0}% of source`} />
+                  <Stat label="Separation" value={ca.separation_of_concerns?.verdict} />
+                  <Stat label="Console.logs" value={ca.console_logs} />
+                  <Stat label="Duplication" value={`${ca.duplication_percentage ?? 0}%`} />
+                  {ca.html_validation && <Stat label="HTML valid" value={ca.html_validation.valid ? "yes" : `no (${ca.html_validation.errors} errors)`} />}
+                </div>
+              );
+            })()}
+          </div>
+        )}
+
+        {/* Transcript analysis details */}
+        {(eval_results as Record<string, any>)?.transcript_analysis && !(eval_results as Record<string, any>).transcript_analysis.error && (
+          <div className="card" style={{ padding: "16px" }}>
+            <h4 style={{ fontSize: "0.8rem", color: "var(--text-muted)", marginBottom: "8px" }}>Agent Behavior</h4>
+            {(() => {
+              const ta = (eval_results as Record<string, any>).transcript_analysis;
+              return (
+                <div style={{ fontSize: "0.7rem", display: "flex", flexDirection: "column", gap: "3px" }}>
+                  <Stat label="Tool calls" value={ta.tool_calls?.total ?? 0} />
+                  <Stat label="Bash" value={ta.tool_calls?.bash ?? 0} />
+                  <Stat label="Write/Edit" value={`${ta.tool_calls?.write ?? 0} / ${ta.tool_calls?.edit ?? 0}`} />
+                  <Stat label="Wasted turns" value={`${ta.wasted_turns?.total ?? 0} (${ta.wasted_turns?.docs ?? 0} docs, ${ta.wasted_turns?.ascii_art ?? 0} ascii, ${ta.wasted_turns?.server_starts ?? 0} server)`} />
+                  <Stat label="Errors hit" value={ta.errors_encountered ?? 0} />
+                  <Stat label="Productivity" value={`${((ta.productivity_ratio ?? 0) * 100).toFixed(0)}%`} />
+                  <Stat label="Self-tested" value={ta.self_tested ? "yes" : "no"} />
+                  <Stat label="Thinking blocks" value={ta.thinking_blocks ?? 0} />
+                </div>
+              );
+            })()}
+          </div>
+        )}
+      </div>
+
       {/* Bottom: transcript + artifact preview */}
       <div style={{
         display: "grid",
diff --git a/harness/reeval.py b/harness/reeval.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+"""Re-evaluate existing runs with current eval scripts.
+
+Extracts workspace archives, runs all eval scripts against them,
+and overwrites eval_results.json. Does NOT re-run claude.
+
+Usage:
+    python3 reeval.py [results_dir] [-j N]
+"""
+
+import json
+import shutil
+import subprocess
+import sys
+import tarfile
+import tempfile
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent  # directory containing this script
+PROJECT_DIR = SCRIPT_DIR.parent  # parent of the script directory; used as the project root
+
+sys.path.insert(0, str(SCRIPT_DIR / "lib"))  # put <script dir>/lib first on the import path
+from run import evaluate, run_eval_script, safe_parse_json  # NOTE(review): only `evaluate` is referenced in this file
+
+
+def reeval_single(run_dir: Path, project_dir: Path) -> str:
+    """Re-evaluate a single run."""
+    meta_path = run_dir / "meta.json"
+    archive_path = run_dir / "workspace.tar.gz"
+
+    if not meta_path.exists():
+        return "skip_no_meta"
+
+    meta = json.loads(meta_path.read_text())
+    task = meta.get("task", "")
+    language = meta.get("language", "typescript")
+    run_id = meta.get("run_id", run_dir.name)
+    task_dir = project_dir / "tasks" / task
+
+    if not task_dir.exists():
+        print(f"  SKIP {run_id}: task '{task}' not found")
+        return "skip_no_task"
+
+    # Extract workspace from archive if available
+    workspace = None
+    artifact_dir = project_dir / "dashboard" / "public" / "artifacts" / run_dir.name
+
+    if artifact_dir.exists() and any(artifact_dir.iterdir()):
+        # Use existing artifact directory as workspace
+        workspace = artifact_dir
+    elif archive_path.exists():
+        # Extract from archive to temp dir
+        workspace = Path(tempfile.mkdtemp(prefix="reeval-"))
+        try:
+            with tarfile.open(archive_path, "r:gz") as tar:
+                tar.extractall(workspace, filter="data")
+            # The archive has a subdirectory, find it
+            subdirs = [d for d in workspace.iterdir() if d.is_dir()]
+            if subdirs:
+                workspace = subdirs[0]
+        except Exception as e:
+            print(f"  ERROR {run_id}: failed to extract archive: {e}")
+            shutil.rmtree(workspace, ignore_errors=True)
+            return "error"
+
+    if workspace is None:
+        print(f"  SKIP {run_id}: no workspace or archive")
+        return "skip_no_workspace"
+
+    print(f"  EVAL {run_id} ({task}, {meta.get('model', '?')})")
+
+    # Build cell dict from meta
+    cell = dict(meta)
+
+    # Run evaluation
+    evaluate(task_dir, workspace, cell, run_dir)
+
+    # Clean up temp workspace (but not artifact dirs)
+    if not str(workspace).startswith(str(project_dir / "dashboard")):
+        shutil.rmtree(workspace, ignore_errors=True)
+
+    return "completed"
+
+
+def main():
+    args = sys.argv[1:]
+    parallel = 1
+    results_dir = PROJECT_DIR / "results"
+
+    i = 0
+    while i < len(args):
+        if args[i] == "-j" and i + 1 < len(args):
+            parallel = int(args[i + 1])
+            i += 2
+        else:
+            results_dir = Path(args[i])
+            i += 1
+
+    runs_dir = results_dir / "runs"
+    if not runs_dir.exists():
+        print("No runs directory found.")
+        return
+
+    run_dirs = sorted([d for d in runs_dir.iterdir() if d.is_dir()])
+    print(f"Re-evaluating {len(run_dirs)} runs (parallel={parallel})")
+    print()
+
+    completed = 0
+    skipped = 0
+    errors = 0
+
+    if parallel <= 1:
+        for run_dir in run_dirs:
+            result = reeval_single(run_dir, PROJECT_DIR)
+            if result == "completed":
+                completed += 1
+            elif result.startswith("skip"):
+                skipped += 1
+            else:
+                errors += 1
+    else:
+        with ThreadPoolExecutor(max_workers=parallel) as executor:
+            futures = {
+                executor.submit(reeval_single, rd, PROJECT_DIR): rd
+                for rd in run_dirs
+            }
+            for future in as_completed(futures):
+                try:
+                    result = future.result()
+                except Exception as e:
+                    print(f"  ERROR: {e}")
+                    result = "error"
+                if result == "completed":
+                    completed += 1
+                elif result.startswith("skip"):
+                    skipped += 1
+                else:
+                    errors += 1
+
+    print()
+    print(f"Done. Completed: {completed} | Skipped: {skipped} | Errors: {errors}")
+
+
+if __name__ == "__main__":
+    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README

M	dashboard/src/components/RunDetail.tsx	\|	97	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
A	harness/reeval.py	\|	146	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++