commit 1ed25ff4935f30adfa97e97c30cb9efb07ae85be
parent bd7899ed2c94b13b438823d3895834c2eb5979c4
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 7 Apr 2026 06:41:27 +0200
Add analyze-and-push.py for quick analysis without re-eval
clean-and-reeval.py re-runs all evals (hours).
analyze-and-push.py just rebuilds index, runs analysis, commits, pushes (seconds).
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 121 insertions(+), 0 deletions(-)
diff --git a/harness/analyze-and-push.py b/harness/analyze-and-push.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""Run analysis on existing results and push. No re-evaluation.
+
+Usage:
+ python3 harness/analyze-and-push.py [--no-push]
+"""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# Project root: the parent of the directory that contains this script.
+PROJECT_DIR = Path(__file__).resolve().parent.parent
+# Everything this script reads and writes (runs/, index.jsonl, analysis/) lives here.
+RESULTS_DIR = PROJECT_DIR / "results"
+
+# Put harness/lib first on the import path; presumably this is where the
+# experiment_design module imported by run_analysis() lives -- TODO confirm.
+sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))
+
+
+def rebuild_index():
+ """Rebuild results/index.jsonl from remaining runs."""
+ index_path = RESULTS_DIR / "index.jsonl"
+ runs_dir = RESULTS_DIR / "runs"
+ count = 0
+ with open(index_path, "w") as f:
+ for run_dir in sorted(runs_dir.iterdir()):
+ meta_path = run_dir / "meta.json"
+ eval_path = run_dir / "eval_results.json"
+ if meta_path.exists() and eval_path.exists():
+ meta = json.loads(meta_path.read_text())
+ entry = {
+ "run_id": meta.get("run_id", run_dir.name),
+ "task": meta.get("task"),
+ "model": meta.get("model"),
+ "cell_id": meta.get("cell_id"),
+ "short_id": meta.get("short_id"),
+ "short_cell_id": meta.get("short_cell_id"),
+ "completed_at": meta.get("completed_at"),
+ }
+ f.write(json.dumps(entry) + "\n")
+ count += 1
+ return count
+
+
+def run_analysis():
+ """Run main effects analysis for all metrics."""
+ from experiment_design import analyze_main_effects
+
+ analysis_dir = RESULTS_DIR / "analysis"
+ analysis_dir.mkdir(exist_ok=True)
+
+ metrics = [
+ "score", "cost", "turns", "wall_time",
+ "gameplay", "sonarqube", "code_quality",
+ "structural", "transcript", "build_quality",
+ ]
+ for metric in metrics:
+ try:
+ effects = analyze_main_effects(str(RESULTS_DIR), metric)
+ (analysis_dir / f"main_effects_{metric}.json").write_text(
+ json.dumps(effects, indent=2)
+ )
+ except Exception as e:
+ print(f" Error on {metric}: {e}")
+ print(f" Analysis updated for {len(metrics)} metrics")
+
+
+def main():
+ do_push = "--no-push" not in sys.argv
+
+ print("1. Rebuilding index...")
+ count = rebuild_index()
+ print(f" {count} valid runs indexed")
+
+ # Count by model
+ runs_dir = RESULTS_DIR / "runs"
+ models: dict[str, int] = {}
+ for run_dir in runs_dir.iterdir():
+ if run_dir.is_dir():
+ meta_path = run_dir / "meta.json"
+ if meta_path.exists():
+ meta = json.loads(meta_path.read_text())
+ model = meta.get("actual_model", meta.get("model", "?"))
+ provider = meta.get("provider", "anthropic")
+ key = f"{model} ({provider})" if provider != "anthropic" else model
+ models[key] = models.get(key, 0) + 1
+ for model, n in sorted(models.items()):
+ print(f" {model}: {n}")
+
+ print("\n2. Running analysis...")
+ run_analysis()
+
+ print("\n3. Committing...")
+ subprocess.run(
+ ["git", "add", "-A", "results/", "artifacts/"],
+ cwd=str(PROJECT_DIR), capture_output=True,
+ )
+
+ total = sum(models.values())
+ msg = f"Analyze and push {total} runs"
+ subprocess.run(
+ ["git", "commit", "-m", msg],
+ cwd=str(PROJECT_DIR), capture_output=True,
+ )
+
+ if do_push:
+ result = subprocess.run(
+ ["git", "push"],
+ cwd=str(PROJECT_DIR), capture_output=True, text=True,
+ )
+ if result.returncode == 0:
+ print(" Pushed.")
+ else:
+ print(f" Push failed: {result.stderr.strip()}")
+ else:
+ print(" Committed locally (--no-push)")
+
+ print("\nDone.")
+
+
+# Run the pipeline only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+ main()