analyze-and-push.py (3785B)
#!/usr/bin/env python3
"""Run analysis on existing results and push. No re-evaluation.

Usage:
    python3 harness/analyze-and-push.py [--no-push]
"""

import json
import subprocess
import sys
from pathlib import Path
from typing import Dict, Optional

PROJECT_DIR = Path(__file__).resolve().parent.parent
RESULTS_DIR = PROJECT_DIR / "results"

# Makes the project-local ``experiment_design`` module importable
# (it is imported lazily inside run_analysis).
sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))

# meta.json keys copied verbatim into each index entry (besides "run_id").
_INDEX_FIELDS = (
    "task",
    "model",
    "cell_id",
    "short_id",
    "short_cell_id",
    "completed_at",
)


def _load_meta(run_dir: Path, quiet: bool = False) -> Optional[dict]:
    """Return the parsed ``meta.json`` of *run_dir*, or None if unusable.

    A missing file is silently treated as "no metadata". A file that cannot
    be read, is not valid JSON, or does not hold a JSON object is skipped
    with a printed note (unless *quiet*) so one bad run cannot abort the
    whole script.
    """
    meta_path = run_dir / "meta.json"
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return None
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        if not quiet:
            print(f" Skipping {run_dir.name}: unreadable meta.json ({exc})")
        return None
    if not isinstance(meta, dict):
        if not quiet:
            print(f" Skipping {run_dir.name}: meta.json is not a JSON object")
        return None
    return meta


def _run_dirs(results_dir: Path) -> list:
    """Return the sorted run directories under ``<results_dir>/runs``."""
    runs_dir = results_dir / "runs"
    if not runs_dir.is_dir():
        return []
    return sorted(path for path in runs_dir.iterdir() if path.is_dir())


def rebuild_index(results_dir: Optional[Path] = None) -> int:
    """Rebuild ``index.jsonl`` from the runs that are still present.

    A run is indexed when its directory contains both a readable
    ``meta.json`` and an ``eval_results.json``. All entries are collected
    before the index file is opened, so a failure while scanning never
    leaves a truncated index behind.

    Args:
        results_dir: Results root containing ``runs/``; defaults to the
            module-level ``RESULTS_DIR``.

    Returns:
        The number of runs written to the index.
    """
    root = RESULTS_DIR if results_dir is None else Path(results_dir)

    lines = []
    for run_dir in _run_dirs(root):
        if not (run_dir / "eval_results.json").exists():
            continue
        meta = _load_meta(run_dir)
        if meta is None:
            continue
        entry = {"run_id": meta.get("run_id", run_dir.name)}
        entry.update((field, meta.get(field)) for field in _INDEX_FIELDS)
        lines.append(json.dumps(entry) + "\n")

    (root / "index.jsonl").write_text("".join(lines), encoding="utf-8")
    return len(lines)


def count_runs_by_model(results_dir: Optional[Path] = None) -> Dict[str, int]:
    """Count runs that have a ``meta.json``, grouped by model label.

    The label is ``actual_model`` (falling back to ``model``, then ``"?"``);
    runs whose ``provider`` is not ``"anthropic"`` get the provider appended
    in parentheses.

    Args:
        results_dir: Results root containing ``runs/``; defaults to the
            module-level ``RESULTS_DIR``.

    Returns:
        Mapping of model label to number of runs.
    """
    root = RESULTS_DIR if results_dir is None else Path(results_dir)

    counts: Dict[str, int] = {}
    for run_dir in _run_dirs(root):
        # quiet: rebuild_index already reported unreadable metadata.
        meta = _load_meta(run_dir, quiet=True)
        if meta is None:
            continue
        model = meta.get("actual_model", meta.get("model", "?"))
        provider = meta.get("provider", "anthropic")
        # str() keeps keys sortable even if meta.json stores a null model.
        label = str(model) if provider == "anthropic" else f"{model} ({provider})"
        counts[label] = counts.get(label, 0) + 1
    return counts


def run_analysis():
    """Run main effects analysis for all metrics.

    Writes one ``main_effects_<metric>.json`` per metric into
    ``RESULTS_DIR / "analysis"``. A failure on one metric is reported and
    does not stop the remaining metrics; the final summary line states how
    many metrics were actually written.
    """
    from experiment_design import analyze_main_effects

    analysis_dir = RESULTS_DIR / "analysis"
    analysis_dir.mkdir(parents=True, exist_ok=True)

    metrics = [
        "score", "cost", "turns", "wall_time",
        "gameplay", "sonarqube", "code_quality",
        "structural", "transcript", "build_quality",
    ]
    updated = 0
    for metric in metrics:
        try:
            effects = analyze_main_effects(str(RESULTS_DIR), metric)
            (analysis_dir / f"main_effects_{metric}.json").write_text(
                json.dumps(effects, indent=2)
            )
        except Exception as e:
            # Deliberate best-effort: keep going with the other metrics.
            print(f" Error on {metric}: {e}")
        else:
            updated += 1

    if updated == len(metrics):
        print(f" Analysis updated for {len(metrics)} metrics")
    else:
        failed = len(metrics) - updated
        print(
            f" Analysis updated for {updated} of {len(metrics)} metrics"
            f" ({failed} failed)"
        )


def _git(*args: str) -> "subprocess.CompletedProcess":
    """Run ``git <args>`` in the project directory, capturing text output."""
    return subprocess.run(
        ["git", *args],
        cwd=str(PROJECT_DIR), capture_output=True, text=True,
    )


def _git_message(result: "subprocess.CompletedProcess") -> str:
    """Return the most useful one-line description of a git result."""
    # ``git commit`` reports "nothing to commit" on stdout, real errors on
    # stderr, so fall back from one to the other.
    text = (result.stderr or "").strip() or (result.stdout or "").strip()
    return text.splitlines()[-1] if text else f"exit code {result.returncode}"


def main():
    """Rebuild the index, run the analysis, then commit and push results.

    Pass ``--no-push`` on the command line to stop after the local commit.
    """
    do_push = "--no-push" not in sys.argv

    print("1. Rebuilding index...")
    count = rebuild_index()
    print(f" {count} valid runs indexed")

    # Count by model
    models = count_runs_by_model()
    for model, n in sorted(models.items()):
        print(f" {model}: {n}")

    print("\n2. Running analysis...")
    run_analysis()

    print("\n3. Committing...")
    # git aborts the whole ``add`` (staging nothing) when any pathspec does
    # not match, so only pass the directories that actually exist.
    paths = [
        f"{name}/" for name in ("results", "artifacts")
        if (PROJECT_DIR / name).exists()
    ]
    if paths:
        added = _git("add", "-A", "--", *paths)
        if added.returncode != 0:
            print(f" git add failed: {_git_message(added)}")

    total = sum(models.values())
    msg = f"Analyze and push {total} runs"
    committed = _git("commit", "-m", msg)
    if committed.returncode != 0:
        print(f" Nothing committed: {_git_message(committed)}")

    if do_push:
        # Still attempted after a no-op commit, matching earlier behavior:
        # it may publish commits left unpushed by a previous run.
        result = _git("push")
        if result.returncode == 0:
            print(" Pushed.")
        else:
            print(f" Push failed: {result.stderr.strip()}")
    elif committed.returncode == 0:
        print(" Committed locally (--no-push)")

    print("\nDone.")


if __name__ == "__main__":
    main()