loop-benchmarking

Controlled experiments across agentic coding configurations. Same task, one variable, what actually works.
git clone https://git.shiptheloop.com/loop-benchmarking.git
Log | Files | Refs | README

commit 1ed25ff4935f30adfa97e97c30cb9efb07ae85be
parent bd7899ed2c94b13b438823d3895834c2eb5979c4
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue,  7 Apr 2026 06:41:27 +0200

Add analyze-and-push.py for quick analysis without re-eval

clean-and-reeval.py re-runs all evals (hours).
analyze-and-push.py just rebuilds index, runs analysis, commits, pushes (seconds).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
A harness/analyze-and-push.py | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 121 insertions(+), 0 deletions(-)

#!/usr/bin/env python3
# File: harness/analyze-and-push.py (new file in this commit)
"""Run analysis on existing results and push. No re-evaluation.

Usage:
    python3 harness/analyze-and-push.py [--no-push]
"""

import json
import subprocess
import sys
from collections import Counter
from pathlib import Path
from typing import Optional

PROJECT_DIR = Path(__file__).resolve().parent.parent
RESULTS_DIR = PROJECT_DIR / "results"

# Make harness/lib importable so run_analysis() can import experiment_design.
sys.path.insert(0, str(PROJECT_DIR / "harness" / "lib"))

# meta.json fields copied as-is into every index entry ("run_id" is handled
# separately because it falls back to the run directory name).
_INDEX_FIELDS = (
    "task",
    "model",
    "cell_id",
    "short_id",
    "short_cell_id",
    "completed_at",
)


def rebuild_index(results_dir: Optional[Path] = None) -> int:
    """Rebuild ``index.jsonl`` from the runs currently on disk.

    A run is indexed only when its directory contains both ``meta.json`` and
    ``eval_results.json``. Runs are written in sorted directory order, one
    JSON object per line.

    Args:
        results_dir: Results root containing ``runs/``. Defaults to the
            module-level ``RESULTS_DIR``.

    Returns:
        The number of runs written to the index.

    Raises:
        FileNotFoundError: If ``<results_dir>/runs`` does not exist.
        json.JSONDecodeError: If a ``meta.json`` is not valid JSON.
    """
    root = RESULTS_DIR if results_dir is None else Path(results_dir)
    runs_dir = root / "runs"

    # Collect every entry before touching the index file, so that a failure
    # while reading a run never leaves a truncated index behind.
    lines = []
    for run_dir in sorted(runs_dir.iterdir()):
        meta_path = run_dir / "meta.json"
        eval_path = run_dir / "eval_results.json"
        if not (meta_path.exists() and eval_path.exists()):
            continue
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
        entry = {"run_id": meta.get("run_id", run_dir.name)}
        entry.update((field, meta.get(field)) for field in _INDEX_FIELDS)
        lines.append(json.dumps(entry) + "\n")

    with open(root / "index.jsonl", "w", encoding="utf-8") as f:
        f.writelines(lines)
    return len(lines)


def run_analysis() -> None:
    """Run main effects analysis for all metrics.

    Writes one ``main_effects_<metric>.json`` file per metric under
    ``results/analysis``. A failure on one metric is reported and does not
    stop the remaining metrics; the summary line reflects how many metrics
    were actually written.
    """
    # Imported lazily: the module lives in harness/lib, which is only on
    # sys.path because of the insert at module import time.
    from experiment_design import analyze_main_effects

    analysis_dir = RESULTS_DIR / "analysis"
    analysis_dir.mkdir(exist_ok=True)

    metrics = [
        "score", "cost", "turns", "wall_time",
        "gameplay", "sonarqube", "code_quality",
        "structural", "transcript", "build_quality",
    ]
    failed = []
    for metric in metrics:
        try:
            effects = analyze_main_effects(str(RESULTS_DIR), metric)
            (analysis_dir / f"main_effects_{metric}.json").write_text(
                json.dumps(effects, indent=2)
            )
        except Exception as e:  # best-effort: keep going with other metrics
            failed.append(metric)
            print(f" Error on {metric}: {e}")

    if failed:
        updated = len(metrics) - len(failed)
        print(
            f" Analysis updated for {updated} of {len(metrics)} metrics "
            f"(failed: {', '.join(failed)})"
        )
    else:
        print(f" Analysis updated for {len(metrics)} metrics")


def _count_runs_by_model(runs_dir: Path) -> Counter:
    """Count run directories that have a meta.json, keyed by model label."""
    counts = Counter()
    for run_dir in runs_dir.iterdir():
        meta_path = run_dir / "meta.json"
        if not (run_dir.is_dir() and meta_path.exists()):
            continue
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
        model = meta.get("actual_model", meta.get("model", "?"))
        provider = meta.get("provider", "anthropic")
        # Only non-default providers are spelled out in the label.
        key = model if provider == "anthropic" else f"{model} ({provider})"
        counts[key] += 1
    return counts


def _git(*args: str) -> subprocess.CompletedProcess:
    """Run a git command in the project directory, capturing its output."""
    return subprocess.run(
        ["git", *args],
        cwd=str(PROJECT_DIR), capture_output=True, text=True,
    )


def main() -> int:
    """Rebuild the index, run the analysis, commit the results and push.

    Pass ``--no-push`` on the command line to stop after the local commit.

    Returns:
        0 on success, 1 if a git step failed.
    """
    do_push = "--no-push" not in sys.argv

    print("1. Rebuilding index...")
    count = rebuild_index()
    print(f" {count} valid runs indexed")

    # Count by model
    models = _count_runs_by_model(RESULTS_DIR / "runs")
    for model, n in sorted(models.items()):
        print(f" {model}: {n}")

    print("\n2. Running analysis...")
    run_analysis()

    print("\n3. Committing...")
    # `git add` aborts without staging anything when any pathspec matches no
    # files, so only pass the directories that actually exist.
    paths = [
        f"{name}/" for name in ("results", "artifacts")
        if (PROJECT_DIR / name).exists()
    ]
    if not paths:
        print(" Nothing to stage.")
        return 1
    added = _git("add", "-A", "--", *paths)
    if added.returncode != 0:
        print(f" git add failed: {added.stderr.strip()}")
        return 1

    # `git diff --cached --quiet` exits 0 when nothing is staged and 1 when
    # there are staged changes; anything else is an error.
    staged = _git("diff", "--cached", "--quiet")
    if staged.returncode not in (0, 1):
        print(f" git diff failed: {staged.stderr.strip()}")
        return 1

    committed = False
    if staged.returncode == 0:
        print(" Nothing to commit.")
    else:
        total = sum(models.values())
        msg = f"Analyze and push {total} runs"
        commit = _git("commit", "-m", msg)
        if commit.returncode != 0:
            detail = commit.stderr.strip() or commit.stdout.strip()
            print(f" Commit failed: {detail}")
            return 1
        committed = True

    if do_push:
        # Push even when nothing new was committed, so that commits left
        # behind by an earlier --no-push run still go out.
        result = _git("push")
        if result.returncode != 0:
            print(f" Push failed: {result.stderr.strip()}")
            return 1
        print(" Pushed.")
    elif committed:
        print(" Committed locally (--no-push)")

    print("\nDone.")
    return 0


if __name__ == "__main__":
    sys.exit(main())

Impressum · Datenschutz