ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit 56999c964a569d3e4507b6766777b41e11f76fdd
parent 5ad6af87a22aa18f92dac25f1979ae94c66367bb
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 14 Apr 2026 23:11:14 +0200

calibration: pairwise weight fitting against labeled anchors

Scaffolding for learning per-category rubric weights from a small set
of hand-labeled anchor papers. Keeps uniform flat-question averaging
as the default behavior; opts into learned weights only when
scripts/calibration/weights.json exists.

Files:
- scripts/calibration/anchors.yaml: seed set of 8 anchors (Wakefield at
  0-15, Attention/BERT/ReAct/AlphaCode/ARC at 60-90, meta papers
  Show Your Work / Deep RL that Matters at 80-92). Comments mark
  candidates to add; aim for 15+ anchors before trusting weights.
- scripts/calibration/fit-weights.py: scipy L-BFGS-B fit of per-category
  weights (bounded to [0, 5]). The loss pulls each anchor toward the
  midpoint of its band, with L2 regularization toward uniform weights
  and a pairwise ordering hinge. Prints learned weights, per-anchor
  predicted scores, and a pair separation check; writes weights.json.
- scripts/build-explorer-data.py: compute_overall_score accepts an
  optional category_weights argument. load_category_weights reads
  scripts/calibration/weights.json if present, and build() passes the
  result through.

First fit with 8 seed anchors separates Wakefield (7.5) from Attention
(74.7) by 67 points - was 7 points with uniform weights. But the
optimizer zeros several categories at that anchor count, a classic
overfit signal. Add 7-15 more anchors before shipping weights.json.

weights.json is intentionally not committed in this PR; treat it as a
deliverable Brian generates after labeling enough anchors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M scripts/build-explorer-data.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
A scripts/calibration/anchors.yaml | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A scripts/calibration/fit-weights.py | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 431 insertions(+), 8 deletions(-)

diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -75,22 +75,60 @@ def compute_category_score(category_data): return passed / applicable -def compute_overall_score(checklist): - applicable = 0 - passed = 0 +def compute_overall_score(checklist, category_weights=None): + """Overall rubric score for a paper. + + Default (category_weights=None): flat per-question average. A category + with more applicable questions naturally contributes more. Equivalent to + the pre-calibration behavior. + + With category_weights: weighted mean of per-category pass rates. Each + category contributes w_c * (passed_c / applicable_c). Categories with + zero applicable questions drop out cleanly. Learned weights come from + scripts/calibration/fit-weights.py. + """ + if category_weights is None: + applicable = 0 + passed = 0 + for cat_name, cat_data in checklist.items(): + if not isinstance(cat_data, dict): + continue + for q_name, q_data in cat_data.items(): + if not isinstance(q_data, dict) or "applies" not in q_data: + continue + if q_data["applies"]: + applicable += 1 + if q_data.get("answer", False): + passed += 1 + if applicable == 0: + return None + return passed / applicable + + # Weighted-per-category mean + num = 0.0 + den = 0.0 + any_applicable = False for cat_name, cat_data in checklist.items(): if not isinstance(cat_data, dict): continue + cat_app = 0 + cat_pas = 0 for q_name, q_data in cat_data.items(): if not isinstance(q_data, dict) or "applies" not in q_data: continue if q_data["applies"]: - applicable += 1 + cat_app += 1 if q_data.get("answer", False): - passed += 1 - if applicable == 0: + cat_pas += 1 + if cat_app == 0: + continue + any_applicable = True + w = category_weights.get(cat_name, 1.0) + num += w * (cat_pas / cat_app) + den += w + if not any_applicable or den == 0: return None - return passed / applicable + return num / den def flatten_checklist(checklist): @@ -224,9 +262,25 @@ def safe_median(scores): return round(s[len(s) // 2], 
1) +def load_category_weights(): + """Load learned weights from scripts/calibration/weights.json if present. + Falls back to None (uniform flat-question averaging) when absent.""" + path = Path(__file__).resolve().parent / "calibration" / "weights.json" + if not path.exists(): + return None + with open(path) as f: + data = json.load(f) + return data.get("weights") + + def build(): registry = load_registry() citation_data = load_citation_graph() + category_weights = load_category_weights() + if category_weights: + print(f"Using learned category weights ({len(category_weights)} categories)") + else: + print("No calibration/weights.json; using uniform per-question weights") # Accumulators papers_full = [] @@ -291,7 +345,7 @@ def build(): is_benchmark_paper = "benchmark-eval" in reg_tags is_calibration = is_reference or is_benchmark_paper - overall = compute_overall_score(checklist) + overall = compute_overall_score(checklist, category_weights) if overall is None: continue diff --git a/scripts/calibration/anchors.yaml b/scripts/calibration/anchors.yaml @@ -0,0 +1,118 @@ +# Calibration anchor set for rubric weight learning. +# +# Each anchor = a paper ID + a target score band [low, high] + a rationale. +# Bands are RANGES, not exact scores. Label what you believe is true; let the +# optimizer fit weights that respect those beliefs. +# +# Guidelines for labeling: +# - 0-20 "bad": methodologically broken (fraud, unsupported causal claims, +# trivial sample, industry overview with zero rigor) +# - 20-40 "weak": real but underpowered or overclaimed +# - 40-60 "typical": median-of-field. Rigor varies; nothing disqualifying +# - 60-80 "good": clearly rigorous, transparent, reproducible +# - 80-95 "excellent": landmark methodology papers, meta-analyses, tight +# design + full artifact release +# +# Aim for 15-25 anchors spread across the range. Too few and the optimizer +# overfits. Too many of one band and you push everything toward the middle. 
+# +# Run fit-weights.py after edits. Commits weights.json next to this file. + +anchors: + + # ===================================================================== + # Known-bad (0-20): disqualifying flaws visible from the paper itself + # ===================================================================== + - id: wakefield-ileal-lymphoid-1998 + band: [0, 15] + rationale: Retracted MMR fraud. Causal claim from N=12 case series, no + control, selection bias, undisclosed COI, no mechanism, no prior + plausibility. Should be a score floor. + + # Candidates pulled from the bottom of the score distribution. You pick + # which are genuinely "known-bad" (vs just thin/industry pieces). Keep + # those you're sure about; delete or move the rest. + # - id: ai-driven-software-engineering-2023 + # band: [0, 15] + # rationale: Short opinion/overview piece with no empirical content. + # - id: precedentbased-professional-role-2025 + # band: [0, 20] + # rationale: TODO your read + # - id: attacking-llms-ai-2025 + # band: [0, 20] + # rationale: TODO your read + + # ===================================================================== + # Known-good (70-90): rigorous, landmark, or methodology reference + # ===================================================================== + - id: attention-is-all-you-need-2017 + band: [70, 85] + rationale: Foundational transformer architecture. Clear methods, clear + contribution, extensive ablations (given era). Currently scored 52.8 + which conflates "methodology reporting" with "limited artifact + practice of its era" - should clearly outrank Wakefield. + + - id: bert-pretraining-deep-2018 + band: [70, 85] + rationale: Landmark pre-training paper. Careful ablations, public model, + reproducible. Currently 55.0. + + - id: deep-rl-matters-2018 + band: [80, 92] + rationale: Rigorous meta-analysis of RL reproducibility problems. Sets the + standard for methodology critique. 
Currently 91.2 (keep as high-band + anchor, the rubric already treats it well). + + - id: show-your-work-2019 + band: [80, 92] + rationale: Improved Reporting of Experimental Results. The paper advocating + for rigor is itself high-rigor. Currently 91.4. + + - id: alphacode-competition-level-2022 + band: [75, 90] + rationale: Thorough evaluation, clear methodology, DeepMind scale of + reporting. Currently 85.7. + + - id: arc-measure-intelligence-2019 + band: [70, 85] + rationale: Chollet's conceptual landmark on what intelligence measurement + requires. Currently 64.7. + + - id: react-synergizing-reasoning-2022 + band: [60, 80] + rationale: Enabled the modern agentic era. Methodologically solid for + its contribution type. Currently 48.2. + + # ===================================================================== + # Middling (40-60): typical papers at the corpus median + # ===================================================================== + # TODO: hand-pick 3-5 papers you consider representative of the middle + # of the field. Picking median-scoring papers as middling anchors helps + # the optimizer avoid collapsing everything to extremes. + # + # Example candidates (you confirm they're genuinely typical): + # - id: codebert-pretrained-model-2020 # currently 52.5 + # band: [45, 60] + # - id: toolformer-language-models-2023 + # band: [40, 60] + +# ===================================================================== +# Pairwise ordering constraints (soft, in addition to bands) +# ===================================================================== +# After fitting, these pairs should hold. Optimizer applies a hinge +# penalty if they don't. Fill in as you add anchors. +pairs: + - [wakefield-ileal-lymphoid-1998, attention-is-all-you-need-2017] + - [wakefield-ileal-lymphoid-1998, bert-pretraining-deep-2018] + - [wakefield-ileal-lymphoid-1998, deep-rl-matters-2018] + - [wakefield-ileal-lymphoid-1998, show-your-work-2019] + +# Optimization settings. 
Leave defaults unless you know why you're changing. +settings: + level: category # "category" (14 params) or "question" (~60 params) + min_weight: 0.0 + max_weight: 5.0 + l2_reg: 0.1 # Penalty against deviating from uniform weights. + pair_margin: 20.0 # Desired separation between pairs (in score pts). + pair_penalty: 2.0 # Weight on pair ordering violations vs band fit. + seed: 42 diff --git a/scripts/calibration/fit-weights.py b/scripts/calibration/fit-weights.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +Fit per-category weights for the rubric so that a small set of labeled +anchor papers score within their target bands. + +Inputs: + scripts/calibration/anchors.yaml - Hand-labeled anchor set + papers/<id>/scan.json - Scan data for each anchor paper + +Output: + scripts/calibration/weights.json - Learned weights + metadata + +Usage: + python3 scripts/calibration/fit-weights.py + +Apply via build-explorer-data.py: + If scripts/calibration/weights.json exists, compute_overall_score and + compute_category_score use its per-category weights. Otherwise they fall + back to uniform weights (current behavior). +""" + +import json +import sys +from pathlib import Path + +try: + import yaml +except ImportError: + sys.stderr.write( + "pyyaml not installed. Run: pip install pyyaml\n" + "Or port anchors.yaml to JSON and adjust this loader.\n" + ) + sys.exit(1) + +try: + from scipy.optimize import minimize + import numpy as np +except ImportError: + sys.stderr.write( + "scipy and numpy required. Run: pip install scipy numpy\n" + ) + sys.exit(1) + + +ROOT = Path(__file__).resolve().parent.parent.parent +PAPERS_DIR = ROOT / "papers" +ANCHORS_PATH = Path(__file__).resolve().parent / "anchors.yaml" +OUT_PATH = Path(__file__).resolve().parent / "weights.json" + + +# Same 14 categories as build-explorer-data.py ALL_CATEGORIES order. 
+CATEGORIES = [ + "artifacts", + "statistical_methodology", + "evaluation_design", + "claims_and_evidence", + "setup_transparency", + "limitations_and_scope", + "data_integrity", + "conflicts_of_interest", + "contamination", + "human_studies", + "cost_and_practicality", + "experimental_rigor", + "data_leakage", + "survey_methodology", +] + + +def load_anchors(): + with open(ANCHORS_PATH) as f: + data = yaml.safe_load(f) + return data + + +def load_scan(paper_id): + path = PAPERS_DIR / paper_id / "scan.json" + if not path.exists(): + return None + with open(path) as f: + return json.load(f) + + +def category_counts(checklist): + """Per-category (applicable, passed) counts.""" + result = {} + for cat in CATEGORIES: + data = checklist.get(cat, {}) + app = 0 + pas = 0 + if isinstance(data, dict): + for q in data.values(): + if isinstance(q, dict) and q.get("applies"): + app += 1 + if q.get("answer"): + pas += 1 + result[cat] = (app, pas) + return result + + +def score_with_weights(counts, weights): + """Weighted-mean category score. 
Categories with zero applicable questions + drop out cleanly (no zero-fill bias).""" + num = 0.0 + den = 0.0 + for cat, w in zip(CATEGORIES, weights): + app, pas = counts[cat] + if app == 0: + continue + cat_rate = pas / app + num += w * cat_rate + den += w + if den == 0: + return 0.0 + return (num / den) * 100.0 + + +def loss(weights, anchors_data, pairs_data, settings): + total = 0.0 + pair_margin = settings.get("pair_margin", 20.0) + pair_penalty = settings.get("pair_penalty", 2.0) + l2 = settings.get("l2_reg", 0.1) + + # Band fit loss + scores_by_id = {} + for anchor in anchors_data: + pid = anchor["id"] + band = anchor["band"] + counts = anchor["_counts"] + s = score_with_weights(counts, weights) + scores_by_id[pid] = s + lo, hi = band + target = (lo + hi) / 2 + # Quadratic toward target + total += (s - target) ** 2 + # Hinge at band boundaries for soft enforcement + if s < lo: + total += (lo - s) ** 2 * 0.5 + if s > hi: + total += (s - hi) ** 2 * 0.5 + + # Pair ordering loss: first id should score < second id by >= pair_margin + for lo_id, hi_id in pairs_data: + if lo_id not in scores_by_id or hi_id not in scores_by_id: + continue + gap = scores_by_id[hi_id] - scores_by_id[lo_id] + if gap < pair_margin: + total += pair_penalty * (pair_margin - gap) ** 2 + + # L2 regularization toward uniform weights (reference is 1.0 per category) + for w in weights: + total += l2 * (w - 1.0) ** 2 + + return total + + +def main(): + data = load_anchors() + settings = data.get("settings", {}) + anchors = data.get("anchors", []) + pairs = data.get("pairs", []) + + if len(anchors) < 5: + sys.stderr.write( + f"WARNING: only {len(anchors)} anchors labeled. 
Add more to " + f"anchors.yaml before trusting the fit (aim for 15+).\n" + ) + + # Attach scan data to each anchor + for anchor in anchors: + pid = anchor["id"] + scan = load_scan(pid) + if scan is None: + sys.stderr.write(f"SKIP: no scan.json for {pid}\n") + anchor["_skip"] = True + continue + anchor["_counts"] = category_counts(scan.get("checklist", {})) + + anchors = [a for a in anchors if not a.get("_skip")] + if not anchors: + sys.stderr.write("No usable anchors.\n") + sys.exit(1) + + np.random.seed(settings.get("seed", 42)) + x0 = np.ones(len(CATEGORIES)) # Start at uniform weights + + result = minimize( + loss, + x0, + args=(anchors, pairs, settings), + method="L-BFGS-B", + bounds=[(settings.get("min_weight", 0.0), settings.get("max_weight", 5.0))] * len(CATEGORIES), + options={"maxiter": 500}, + ) + + weights = result.x.tolist() + weight_map = dict(zip(CATEGORIES, [round(w, 4) for w in weights])) + + # Report + print("=" * 70) + print("LEARNED WEIGHTS") + print("=" * 70) + for cat, w in weight_map.items(): + bar = "#" * int(w * 10) + print(f" {cat:<28} {w:>6.3f} {bar}") + print() + + print("=" * 70) + print("ANCHOR SCORES (predicted after fit)") + print("=" * 70) + for anchor in anchors: + pid = anchor["id"] + band = anchor["band"] + pred = score_with_weights(anchor["_counts"], weights) + in_band = "OK" if band[0] <= pred <= band[1] else "**" + print(f" {in_band} {pred:>6.1f} target {band[0]:>3}-{band[1]:<3} {pid}") + print() + + print("=" * 70) + print("PAIR ORDERING CHECK") + print("=" * 70) + scores_by_id = { + a["id"]: score_with_weights(a["_counts"], weights) for a in anchors + } + for lo_id, hi_id in pairs: + if lo_id not in scores_by_id or hi_id not in scores_by_id: + print(f" SKIP {lo_id} < {hi_id} (missing scan)") + continue + gap = scores_by_id[hi_id] - scores_by_id[lo_id] + ok = "OK" if gap >= settings.get("pair_margin", 20.0) else "**" + print( + f" {ok} {scores_by_id[lo_id]:>5.1f} < {scores_by_id[hi_id]:<5.1f} " + f"(gap {gap:+.1f}) {lo_id} < 
{hi_id}" + ) + print() + + out = { + "weights": weight_map, + "n_anchors": len(anchors), + "n_pairs": len(pairs), + "loss": float(result.fun), + "converged": bool(result.success), + "settings": settings, + } + with open(OUT_PATH, "w") as f: + json.dump(out, f, indent=2) + print(f"Wrote {OUT_PATH}") + + +if __name__ == "__main__": + main()

Impressum · Datenschutz