analyze-and-push.py - loop-benchmarking - Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.

analyze-and-push.py (3785B)
      1 #!/usr/bin/env python3
      2 """Run analysis on existing results and push. No re-evaluation.
      3 
      4 Usage:
      5     python3 harness/analyze-and-push.py [--no-push]
      6 """
      7 
      8 import json
      9 import subprocess
     10 import sys
     11 from pathlib import Path
     12 
     13 PROJECT_DIR = Path(__file__).resolve().parent.parent
     14 RESULTS_DIR = PROJECT_DIR / "results"
     15 
     16 sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))
     17 
     18 
     19 def rebuild_index():
     20     """Rebuild results/index.jsonl from remaining runs."""
     21     index_path = RESULTS_DIR / "index.jsonl"
     22     runs_dir = RESULTS_DIR / "runs"
     23     count = 0
     24     with open(index_path, "w") as f:
     25         for run_dir in sorted(runs_dir.iterdir()):
     26             meta_path = run_dir / "meta.json"
     27             eval_path = run_dir / "eval_results.json"
     28             if meta_path.exists() and eval_path.exists():
     29                 meta = json.loads(meta_path.read_text())
     30                 entry = {
     31                     "run_id": meta.get("run_id", run_dir.name),
     32                     "task": meta.get("task"),
     33                     "model": meta.get("model"),
     34                     "cell_id": meta.get("cell_id"),
     35                     "short_id": meta.get("short_id"),
     36                     "short_cell_id": meta.get("short_cell_id"),
     37                     "completed_at": meta.get("completed_at"),
     38                 }
     39                 f.write(json.dumps(entry) + "\n")
     40                 count += 1
     41     return count
     42 
     43 
     44 def run_analysis():
     45     """Run main effects analysis for all metrics."""
     46     from experiment_design import analyze_main_effects
     47 
     48     analysis_dir = RESULTS_DIR / "analysis"
     49     analysis_dir.mkdir(exist_ok=True)
     50 
     51     metrics = [
     52         "score", "cost", "turns", "wall_time",
     53         "gameplay", "sonarqube", "code_quality",
     54         "structural", "transcript", "build_quality",
     55     ]
     56     for metric in metrics:
     57         try:
     58             effects = analyze_main_effects(str(RESULTS_DIR), metric)
     59             (analysis_dir / f"main_effects_{metric}.json").write_text(
     60                 json.dumps(effects, indent=2)
     61             )
     62         except Exception as e:
     63             print(f"  Error on {metric}: {e}")
     64     print(f"  Analysis updated for {len(metrics)} metrics")
     65 
     66 
     67 def main():
     68     do_push = "--no-push" not in sys.argv
     69 
     70     print("1. Rebuilding index...")
     71     count = rebuild_index()
     72     print(f"   {count} valid runs indexed")
     73 
     74     # Count by model
     75     runs_dir = RESULTS_DIR / "runs"
     76     models: dict[str, int] = {}
     77     for run_dir in runs_dir.iterdir():
     78         if run_dir.is_dir():
     79             meta_path = run_dir / "meta.json"
     80             if meta_path.exists():
     81                 meta = json.loads(meta_path.read_text())
     82                 model = meta.get("actual_model", meta.get("model", "?"))
     83                 provider = meta.get("provider", "anthropic")
     84                 key = f"{model} ({provider})" if provider != "anthropic" else model
     85                 models[key] = models.get(key, 0) + 1
     86     for model, n in sorted(models.items()):
     87         print(f"   {model}: {n}")
     88 
     89     print("\n2. Running analysis...")
     90     run_analysis()
     91 
     92     print("\n3. Committing...")
     93     subprocess.run(
     94         ["git", "add", "-A", "results/", "artifacts/"],
     95         cwd=str(PROJECT_DIR), capture_output=True,
     96     )
     97 
     98     total = sum(models.values())
     99     msg = f"Analyze and push {total} runs"
    100     subprocess.run(
    101         ["git", "commit", "-m", msg],
    102         cwd=str(PROJECT_DIR), capture_output=True,
    103     )
    104 
    105     if do_push:
    106         result = subprocess.run(
    107             ["git", "push"],
    108             cwd=str(PROJECT_DIR), capture_output=True, text=True,
    109         )
    110         if result.returncode == 0:
    111             print("   Pushed.")
    112         else:
    113             print(f"   Push failed: {result.stderr.strip()}")
    114     else:
    115         print("   Committed locally (--no-push)")
    116 
    117     print("\nDone.")
    118 
    119 
    120 if __name__ == "__main__":
    121     main()
	loop-benchmarking Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
	git clone https://git.shiptheloop.com/loop-benchmarking.git
	Log \| Files \| Refs \| README