calibration: pairwise weight fitting against labeled anchors - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit 56999c964a569d3e4507b6766777b41e11f76fdd
parent 5ad6af87a22aa18f92dac25f1979ae94c66367bb
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 14 Apr 2026 23:11:14 +0200

calibration: pairwise weight fitting against labeled anchors

Scaffolding for learning per-category rubric weights from a small set
of hand-labeled anchor papers. Keeps uniform flat-question averaging
as the default behavior; opts into learned weights only when
scripts/calibration/weights.json exists.

Files:
- scripts/calibration/anchors.yaml: seed set of 8 anchors (Wakefield at
  0-15, ReAct at 60-80, Attention/BERT/AlphaCode/ARC at 70-90, meta
  papers Show Your Work / Deep RL that Matters at 80-92). Comments
  mark candidates to add; aim for 15+ anchors before trusting weights.
- scripts/calibration/fit-weights.py: scipy L-BFGS-B fit over
  per-category weights [0-5] with L2 regularization toward uniform and
  a pairwise ordering hinge. Prints per-anchor predicted scores + pair
  separation check, writes weights.json.
- build-explorer-data.py: compute_overall_score accepts optional
  category_weights. load_category_weights reads the JSON if present.

First fit with 8 seed anchors separates Wakefield (7.5) from Attention
(74.7) by 67 points - was 7 points with uniform weights. But the
optimizer zeros several categories at that anchor count, a classic
overfit signal. Add 7-15 more anchors before shipping weights.json.

weights.json is intentionally not committed in this PR; treat it as a
deliverable Brian generates after labeling enough anchors.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M scripts/build-explorer-data.py  | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
A scripts/calibration/anchors.yaml  | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A scripts/calibration/fit-weights.py  | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 431 insertions(+), 8 deletions(-)
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -75,22 +75,60 @@ def compute_category_score(category_data):
     return passed / applicable
 
 
-def compute_overall_score(checklist):
-    applicable = 0
-    passed = 0
+def compute_overall_score(checklist, category_weights=None):
+    """Overall rubric score for a paper.
+
+    Default (category_weights=None): flat per-question average. A category
+    with more applicable questions naturally contributes more. Equivalent to
+    the pre-calibration behavior.
+
+    With category_weights: weighted mean of per-category pass rates, i.e.
+    sum(w_c * passed_c / applicable_c) / sum(w_c). Categories with zero
+    applicable questions drop out; categories missing from the mapping get
+    weight 1.0. Learned weights come from scripts/calibration/fit-weights.py.
+    """
+    if category_weights is None:
+        applicable = 0
+        passed = 0
+        for cat_name, cat_data in checklist.items():
+            if not isinstance(cat_data, dict):
+                continue
+            for q_name, q_data in cat_data.items():
+                if not isinstance(q_data, dict) or "applies" not in q_data:
+                    continue
+                if q_data["applies"]:
+                    applicable += 1
+                    if q_data.get("answer", False):
+                        passed += 1
+        if applicable == 0:
+            return None
+        return passed / applicable
+
+    # Weighted-per-category mean
+    num = 0.0
+    den = 0.0
+    any_applicable = False
     for cat_name, cat_data in checklist.items():
         if not isinstance(cat_data, dict):
             continue
+        cat_app = 0
+        cat_pas = 0
         for q_name, q_data in cat_data.items():
             if not isinstance(q_data, dict) or "applies" not in q_data:
                 continue
             if q_data["applies"]:
-                applicable += 1
+                cat_app += 1
                 if q_data.get("answer", False):
-                    passed += 1
-    if applicable == 0:
+                    cat_pas += 1
+        if cat_app == 0:
+            continue
+        any_applicable = True
+        w = category_weights.get(cat_name, 1.0)
+        num += w * (cat_pas / cat_app)
+        den += w
+    if not any_applicable or den == 0:
         return None
-    return passed / applicable
+    return num / den
 
 
 def flatten_checklist(checklist):
@@ -224,9 +262,25 @@ def safe_median(scores):
     return round(s[len(s) // 2], 1)
 
 
+def load_category_weights():
+    """Load learned weights from scripts/calibration/weights.json if present.
+    Falls back to None (uniform flat-question averaging) when absent."""
+    path = Path(__file__).resolve().parent / "calibration" / "weights.json"
+    if not path.exists():
+        return None
+    with open(path) as f:
+        data = json.load(f)
+    return data.get("weights")
+
+
 def build():
     registry = load_registry()
     citation_data = load_citation_graph()
+    category_weights = load_category_weights()
+    if category_weights:
+        print(f"Using learned category weights ({len(category_weights)} categories)")
+    else:
+        print("No calibration/weights.json; using uniform per-question weights")
 
     # Accumulators
     papers_full = []
@@ -291,7 +345,7 @@ def build():
         is_benchmark_paper = "benchmark-eval" in reg_tags
         is_calibration = is_reference or is_benchmark_paper
 
-        overall = compute_overall_score(checklist)
+        overall = compute_overall_score(checklist, category_weights)
         if overall is None:
             continue
 
diff --git a/scripts/calibration/anchors.yaml b/scripts/calibration/anchors.yaml
@@ -0,0 +1,118 @@
+# Calibration anchor set for rubric weight learning.
+#
+# Each anchor = a paper ID + a target score band [low, high] + a rationale.
+# Bands are RANGES, not exact scores. Label what you believe is true; let the
+# optimizer fit weights that respect those beliefs.
+#
+# Guidelines for labeling:
+#   - 0-20   "bad": methodologically broken (fraud, unsupported causal claims,
+#                   trivial sample, industry overview with zero rigor)
+#   - 20-40  "weak": real but underpowered or overclaimed
+#   - 40-60  "typical": median-of-field. Rigor varies; nothing disqualifying
+#   - 60-80  "good": clearly rigorous, transparent, reproducible
+#   - 80-95  "excellent": landmark methodology papers, meta-analyses, tight
+#                         design + full artifact release
+#
+# Aim for 15-25 anchors spread across the range. Too few and the optimizer
+# overfits. Too many of one band and you push everything toward the middle.
+#
+# Run fit-weights.py after edits. It writes weights.json next to this file.
+
+anchors:
+
+  # =====================================================================
+  # Known-bad (0-20): disqualifying flaws visible from the paper itself
+  # =====================================================================
+  - id: wakefield-ileal-lymphoid-1998
+    band: [0, 15]
+    rationale: Retracted MMR fraud. Causal claim from N=12 case series, no
+      control, selection bias, undisclosed COI, no mechanism, no prior
+      plausibility. Should be a score floor.
+
+  # Candidates pulled from the bottom of the score distribution. You pick
+  # which are genuinely "known-bad" (vs just thin/industry pieces). Keep
+  # those you're sure about; delete or move the rest.
+  # - id: ai-driven-software-engineering-2023
+  #   band: [0, 15]
+  #   rationale: Short opinion/overview piece with no empirical content.
+  # - id: precedentbased-professional-role-2025
+  #   band: [0, 20]
+  #   rationale: TODO your read
+  # - id: attacking-llms-ai-2025
+  #   band: [0, 20]
+  #   rationale: TODO your read
+
+  # =====================================================================
+  # Known-good (60-92): rigorous, landmark, or methodology reference
+  # =====================================================================
+  - id: attention-is-all-you-need-2017
+    band: [70, 85]
+    rationale: Foundational transformer architecture. Clear methods, clear
+      contribution, extensive ablations (given era). Currently scored 52.8
+      which conflates "methodology reporting" with "limited artifact
+      practice of its era" - should clearly outrank Wakefield.
+
+  - id: bert-pretraining-deep-2018
+    band: [70, 85]
+    rationale: Landmark pre-training paper. Careful ablations, public model,
+      reproducible. Currently 55.0.
+
+  - id: deep-rl-matters-2018
+    band: [80, 92]
+    rationale: Rigorous meta-analysis of RL reproducibility problems. Sets the
+      standard for methodology critique. Currently 91.2 (keep as high-band
+      anchor, the rubric already treats it well).
+
+  - id: show-your-work-2019
+    band: [80, 92]
+    rationale: Improved Reporting of Experimental Results. The paper advocating
+      for rigor is itself high-rigor. Currently 91.4.
+
+  - id: alphacode-competition-level-2022
+    band: [75, 90]
+    rationale: Thorough evaluation, clear methodology, DeepMind scale of
+      reporting. Currently 85.7.
+
+  - id: arc-measure-intelligence-2019
+    band: [70, 85]
+    rationale: Chollet's conceptual landmark on what intelligence measurement
+      requires. Currently 64.7.
+
+  - id: react-synergizing-reasoning-2022
+    band: [60, 80]
+    rationale: Enabled the modern agentic era. Methodologically solid for
+      its contribution type. Currently 48.2.
+
+  # =====================================================================
+  # Middling (40-60): typical papers at the corpus median
+  # =====================================================================
+  # TODO: hand-pick 3-5 papers you consider representative of the middle
+  # of the field. Picking median-scoring papers as middling anchors helps
+  # the optimizer avoid collapsing everything to extremes.
+  #
+  # Example candidates (you confirm they're genuinely typical):
+  # - id: codebert-pretrained-model-2020    # currently 52.5
+  #   band: [45, 60]
+  # - id: toolformer-language-models-2023
+  #   band: [40, 60]
+
+# =====================================================================
+# Pairwise ordering constraints (soft, in addition to bands)
+# =====================================================================
+# Each pair is [lower, higher]: after fitting, the first should score at least
+# pair_margin below the second, else a hinge penalty applies. Fill in as you add anchors.
+pairs:
+  - [wakefield-ileal-lymphoid-1998, attention-is-all-you-need-2017]
+  - [wakefield-ileal-lymphoid-1998, bert-pretraining-deep-2018]
+  - [wakefield-ileal-lymphoid-1998, deep-rl-matters-2018]
+  - [wakefield-ileal-lymphoid-1998, show-your-work-2019]
+
+# Optimization settings. Leave defaults unless you know why you're changing.
+settings:
+  level: category          # "category" (14 params). "question" (~60 params) is not implemented: fit-weights.py ignores this key.
+  min_weight: 0.0
+  max_weight: 5.0
+  l2_reg: 0.1              # Penalty against deviating from uniform weights.
+  pair_margin: 20.0        # Desired separation between pairs (in score pts).
+  pair_penalty: 2.0        # Weight on pair ordering violations vs band fit.
+  seed: 42
diff --git a/scripts/calibration/fit-weights.py b/scripts/calibration/fit-weights.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Fit per-category weights for the rubric so that a small set of labeled
+anchor papers score within their target bands.
+
+Inputs:
+  scripts/calibration/anchors.yaml     - Hand-labeled anchor set
+  papers/<id>/scan.json                - Scan data for each anchor paper
+
+Output:
+  scripts/calibration/weights.json     - Learned weights + metadata
+
+Usage:
+  python3 scripts/calibration/fit-weights.py
+
+Apply via build-explorer-data.py:
+  If scripts/calibration/weights.json exists, compute_overall_score uses
+  its per-category weights. Otherwise it falls back to the flat
+  per-question average (current behavior).
+"""
+
+import json
+import sys
+from pathlib import Path
+
+try:
+    import yaml
+except ImportError:
+    sys.stderr.write(
+        "pyyaml not installed. Run: pip install pyyaml\n"
+        "Or port anchors.yaml to JSON and adjust this loader.\n"
+    )
+    sys.exit(1)
+
+try:
+    from scipy.optimize import minimize
+    import numpy as np
+except ImportError:
+    sys.stderr.write(
+        "scipy and numpy required. Run: pip install scipy numpy\n"
+    )
+    sys.exit(1)
+
+
+ROOT = Path(__file__).resolve().parent.parent.parent
+PAPERS_DIR = ROOT / "papers"
+ANCHORS_PATH = Path(__file__).resolve().parent / "anchors.yaml"
+OUT_PATH = Path(__file__).resolve().parent / "weights.json"
+
+
+# Same 14 categories as build-explorer-data.py ALL_CATEGORIES order.
+CATEGORIES = [
+    "artifacts",
+    "statistical_methodology",
+    "evaluation_design",
+    "claims_and_evidence",
+    "setup_transparency",
+    "limitations_and_scope",
+    "data_integrity",
+    "conflicts_of_interest",
+    "contamination",
+    "human_studies",
+    "cost_and_practicality",
+    "experimental_rigor",
+    "data_leakage",
+    "survey_methodology",
+]
+
+
+def load_anchors():
+    with open(ANCHORS_PATH) as f:
+        data = yaml.safe_load(f)
+    return data
+
+
+def load_scan(paper_id):
+    path = PAPERS_DIR / paper_id / "scan.json"
+    if not path.exists():
+        return None
+    with open(path) as f:
+        return json.load(f)
+
+
+def category_counts(checklist):
+    """Per-category (applicable, passed) counts."""
+    result = {}
+    for cat in CATEGORIES:
+        data = checklist.get(cat, {})
+        app = 0
+        pas = 0
+        if isinstance(data, dict):
+            for q in data.values():
+                if isinstance(q, dict) and q.get("applies"):
+                    app += 1
+                    if q.get("answer"):
+                        pas += 1
+        result[cat] = (app, pas)
+    return result
+
+
+def score_with_weights(counts, weights):
+    """Weighted-mean category score. Categories with zero applicable questions
+    drop out cleanly (no zero-fill bias)."""
+    num = 0.0
+    den = 0.0
+    for cat, w in zip(CATEGORIES, weights):
+        app, pas = counts[cat]
+        if app == 0:
+            continue
+        cat_rate = pas / app
+        num += w * cat_rate
+        den += w
+    if den == 0:
+        return 0.0
+    return (num / den) * 100.0
+
+
+def loss(weights, anchors_data, pairs_data, settings):
+    total = 0.0
+    pair_margin = settings.get("pair_margin", 20.0)
+    pair_penalty = settings.get("pair_penalty", 2.0)
+    l2 = settings.get("l2_reg", 0.1)
+
+    # Band fit loss
+    scores_by_id = {}
+    for anchor in anchors_data:
+        pid = anchor["id"]
+        band = anchor["band"]
+        counts = anchor["_counts"]
+        s = score_with_weights(counts, weights)
+        scores_by_id[pid] = s
+        lo, hi = band
+        target = (lo + hi) / 2
+        # Quadratic toward target
+        total += (s - target) ** 2
+        # Hinge at band boundaries for soft enforcement
+        if s < lo:
+            total += (lo - s) ** 2 * 0.5
+        if s > hi:
+            total += (s - hi) ** 2 * 0.5
+
+    # Pair ordering loss: first id should score < second id by >= pair_margin
+    for lo_id, hi_id in pairs_data:
+        if lo_id not in scores_by_id or hi_id not in scores_by_id:
+            continue
+        gap = scores_by_id[hi_id] - scores_by_id[lo_id]
+        if gap < pair_margin:
+            total += pair_penalty * (pair_margin - gap) ** 2
+
+    # L2 regularization toward uniform weights (reference is 1.0 per category)
+    for w in weights:
+        total += l2 * (w - 1.0) ** 2
+
+    return total
+
+
+def main():
+    data = load_anchors()
+    settings = data.get("settings", {})
+    anchors = data.get("anchors", [])
+    pairs = data.get("pairs", [])
+
+    if len(anchors) < 5:
+        sys.stderr.write(
+            f"WARNING: only {len(anchors)} anchors labeled. Add more to "
+            f"anchors.yaml before trusting the fit (aim for 15+).\n"
+        )
+
+    # Attach scan data to each anchor
+    for anchor in anchors:
+        pid = anchor["id"]
+        scan = load_scan(pid)
+        if scan is None:
+            sys.stderr.write(f"SKIP: no scan.json for {pid}\n")
+            anchor["_skip"] = True
+            continue
+        anchor["_counts"] = category_counts(scan.get("checklist", {}))
+
+    anchors = [a for a in anchors if not a.get("_skip")]
+    if not anchors:
+        sys.stderr.write("No usable anchors.\n")
+        sys.exit(1)
+
+    np.random.seed(settings.get("seed", 42))
+    x0 = np.ones(len(CATEGORIES))  # Start at uniform weights
+
+    result = minimize(
+        loss,
+        x0,
+        args=(anchors, pairs, settings),
+        method="L-BFGS-B",
+        bounds=[(settings.get("min_weight", 0.0), settings.get("max_weight", 5.0))] * len(CATEGORIES),
+        options={"maxiter": 500},
+    )
+
+    weights = result.x.tolist()
+    weight_map = dict(zip(CATEGORIES, [round(w, 4) for w in weights]))
+
+    # Report
+    print("=" * 70)
+    print("LEARNED WEIGHTS")
+    print("=" * 70)
+    for cat, w in weight_map.items():
+        bar = "#" * int(w * 10)
+        print(f"  {cat:<28} {w:>6.3f}  {bar}")
+    print()
+
+    print("=" * 70)
+    print("ANCHOR SCORES (predicted after fit)")
+    print("=" * 70)
+    for anchor in anchors:
+        pid = anchor["id"]
+        band = anchor["band"]
+        pred = score_with_weights(anchor["_counts"], weights)
+        in_band = "OK" if band[0] <= pred <= band[1] else "**"
+        print(f"  {in_band} {pred:>6.1f}  target {band[0]:>3}-{band[1]:<3}  {pid}")
+    print()
+
+    print("=" * 70)
+    print("PAIR ORDERING CHECK")
+    print("=" * 70)
+    scores_by_id = {
+        a["id"]: score_with_weights(a["_counts"], weights) for a in anchors
+    }
+    for lo_id, hi_id in pairs:
+        if lo_id not in scores_by_id or hi_id not in scores_by_id:
+            print(f"  SKIP  {lo_id} < {hi_id}  (missing scan)")
+            continue
+        gap = scores_by_id[hi_id] - scores_by_id[lo_id]
+        ok = "OK" if gap >= settings.get("pair_margin", 20.0) else "**"
+        print(
+            f"  {ok} {scores_by_id[lo_id]:>5.1f} < {scores_by_id[hi_id]:<5.1f} "
+            f"(gap {gap:+.1f}) {lo_id}  <  {hi_id}"
+        )
+    print()
+
+    out = {
+        "weights": weight_map,
+        "n_anchors": len(anchors),
+        "n_pairs": len(pairs),
+        "loss": float(result.fun),
+        "converged": bool(result.success),
+        "settings": settings,
+    }
+    with open(OUT_PATH, "w") as f:
+        json.dump(out, f, indent=2)
+    print(f"Wrote {OUT_PATH}")
+
+
+if __name__ == "__main__":
+    main()

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs

M	scripts/build-explorer-data.py	\|	70	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
A	scripts/calibration/anchors.yaml	\|	118	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	scripts/calibration/fit-weights.py	\|	251	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++