Add analyze-and-push.py for quick analysis without re-eval - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

commit 1ed25ff4935f30adfa97e97c30cb9efb07ae85be
parent bd7899ed2c94b13b438823d3895834c2eb5979c4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 06:41:27 +0200

Add analyze-and-push.py for quick analysis without re-eval

clean-and-reeval.py re-runs all evals (hours).
analyze-and-push.py just rebuilds index, runs analysis, commits, pushes (seconds).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A harness/analyze-and-push.py  | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 121 insertions(+), 0 deletions(-)
diff --git a/harness/analyze-and-push.py b/harness/analyze-and-push.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""Run analysis on existing results and push. No re-evaluation.
+
+Usage:
+    python3 harness/analyze-and-push.py [--no-push]
+"""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# This script is harness/analyze-and-push.py, so two levels up from the
+# resolved file path is the repository root.
+PROJECT_DIR = Path(__file__).resolve().parent.parent
+# Everything this script reads and writes lives under <repo>/results.
+RESULTS_DIR = PROJECT_DIR / "results"
+
+# Put harness/lib first on the import path; presumably this is where the
+# `experiment_design` module imported by run_analysis() lives — TODO confirm.
+sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))
+
+
+def rebuild_index():
+    """Rebuild results/index.jsonl from remaining runs."""
+    index_path = RESULTS_DIR / "index.jsonl"
+    runs_dir = RESULTS_DIR / "runs"
+    count = 0
+    with open(index_path, "w") as f:
+        for run_dir in sorted(runs_dir.iterdir()):
+            meta_path = run_dir / "meta.json"
+            eval_path = run_dir / "eval_results.json"
+            if meta_path.exists() and eval_path.exists():
+                meta = json.loads(meta_path.read_text())
+                entry = {
+                    "run_id": meta.get("run_id", run_dir.name),
+                    "task": meta.get("task"),
+                    "model": meta.get("model"),
+                    "cell_id": meta.get("cell_id"),
+                    "short_id": meta.get("short_id"),
+                    "short_cell_id": meta.get("short_cell_id"),
+                    "completed_at": meta.get("completed_at"),
+                }
+                f.write(json.dumps(entry) + "\n")
+                count += 1
+    return count
+
+
+def run_analysis():
+    """Run main effects analysis for all metrics."""
+    from experiment_design import analyze_main_effects
+
+    analysis_dir = RESULTS_DIR / "analysis"
+    analysis_dir.mkdir(exist_ok=True)
+
+    metrics = [
+        "score", "cost", "turns", "wall_time",
+        "gameplay", "sonarqube", "code_quality",
+        "structural", "transcript", "build_quality",
+    ]
+    for metric in metrics:
+        try:
+            effects = analyze_main_effects(str(RESULTS_DIR), metric)
+            (analysis_dir / f"main_effects_{metric}.json").write_text(
+                json.dumps(effects, indent=2)
+            )
+        except Exception as e:
+            print(f"  Error on {metric}: {e}")
+    print(f"  Analysis updated for {len(metrics)} metrics")
+
+
+def main():
+    do_push = "--no-push" not in sys.argv
+
+    print("1. Rebuilding index...")
+    count = rebuild_index()
+    print(f"   {count} valid runs indexed")
+
+    # Count by model
+    runs_dir = RESULTS_DIR / "runs"
+    models: dict[str, int] = {}
+    for run_dir in runs_dir.iterdir():
+        if run_dir.is_dir():
+            meta_path = run_dir / "meta.json"
+            if meta_path.exists():
+                meta = json.loads(meta_path.read_text())
+                model = meta.get("actual_model", meta.get("model", "?"))
+                provider = meta.get("provider", "anthropic")
+                key = f"{model} ({provider})" if provider != "anthropic" else model
+                models[key] = models.get(key, 0) + 1
+    for model, n in sorted(models.items()):
+        print(f"   {model}: {n}")
+
+    print("\n2. Running analysis...")
+    run_analysis()
+
+    print("\n3. Committing...")
+    subprocess.run(
+        ["git", "add", "-A", "results/", "artifacts/"],
+        cwd=str(PROJECT_DIR), capture_output=True,
+    )
+
+    total = sum(models.values())
+    msg = f"Analyze and push {total} runs"
+    subprocess.run(
+        ["git", "commit", "-m", msg],
+        cwd=str(PROJECT_DIR), capture_output=True,
+    )
+
+    if do_push:
+        result = subprocess.run(
+            ["git", "push"],
+            cwd=str(PROJECT_DIR), capture_output=True, text=True,
+        )
+        if result.returncode == 0:
+            print("   Pushed.")
+        else:
+            print(f"   Push failed: {result.stderr.strip()}")
+    else:
+        print("   Committed locally (--no-push)")
+
+    print("\nDone.")
+
+
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log | Files | Refs | README