commit 56999c964a569d3e4507b6766777b41e11f76fdd
parent 5ad6af87a22aa18f92dac25f1979ae94c66367bb
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 14 Apr 2026 23:11:14 +0200
calibration: pairwise weight fitting against labeled anchors
Scaffolding for learning per-category rubric weights from a small set
of hand-labeled anchor papers. Keeps uniform flat-question averaging
as the default behavior; opts into learned weights only when
scripts/calibration/weights.json exists.
Files:
- scripts/calibration/anchors.yaml: seed set of 8 anchors (Wakefield at
0-15, Attention/BERT/ReAct/AlphaCode/ARC at 60-90, meta papers
Show Your Work / Deep RL that Matters at 80-92). Comments mark
candidates to add; aim for 15+ anchors before trusting weights.
- scripts/calibration/fit-weights.py: scipy L-BFGS-B fit over
per-category weights [0-5] with L2 regularization toward uniform and
a pairwise ordering hinge. Prints per-anchor predicted scores + pair
separation check, writes weights.json.
- build-explorer-data.py: compute_overall_score accepts optional
category_weights. load_category_weights reads the JSON if present.
First fit with 8 seed anchors separates Wakefield (7.5) from Attention
(74.7) by 67 points - was 7 points with uniform weights. But the
optimizer zeros several categories at that anchor count, a classic
overfit signal. Add 7-15 more anchors before shipping weights.json.
weights.json is intentionally not committed in this PR; treat it as a
deliverable Brian generates after labeling enough anchors.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
3 files changed, 431 insertions(+), 8 deletions(-)
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -75,22 +75,60 @@ def compute_category_score(category_data):
return passed / applicable
-def compute_overall_score(checklist):
- applicable = 0
- passed = 0
def compute_overall_score(checklist, category_weights=None):
    """Overall rubric score for a paper, as a pass-rate fraction (not a percent).

    Without weights (``category_weights`` is None or an empty mapping): flat
    per-question average, i.e. passed / applicable over every applicable
    question. A category with more applicable questions naturally
    contributes more. Equivalent to the pre-calibration behavior. An empty
    mapping is treated like None so the result matches what build() reports
    ("uniform per-question weights") when no usable weights were loaded.

    With weights: weighted mean of per-category pass rates. Each category
    contributes w_c * (passed_c / applicable_c); categories missing from the
    mapping get weight 1.0, and categories with zero applicable questions
    drop out of both numerator and denominator. Learned weights come from
    scripts/calibration/fit-weights.py.

    Args:
        checklist: mapping of category name -> mapping of question name ->
            question dict. Non-dict categories, non-dict questions and
            questions without an "applies" key are ignored.
        category_weights: optional mapping of category name -> weight.

    Returns:
        The score as a float, or None when no question applies or when the
        weights of all applicable categories sum to zero. build() skips
        papers whose score is None.
    """
    # Single pass: collect (name, applicable, passed) for every category
    # that has at least one applicable question.
    per_category = []
    for cat_name, cat_data in checklist.items():
        if not isinstance(cat_data, dict):
            continue
        applicable = 0
        passed = 0
        for q_data in cat_data.values():
            if not isinstance(q_data, dict) or "applies" not in q_data:
                continue
            if q_data["applies"]:
                applicable += 1
                if q_data.get("answer", False):
                    passed += 1
        if applicable:
            per_category.append((cat_name, applicable, passed))

    if not per_category:
        return None

    if not category_weights:
        # Flat per-question average across all categories.
        total_applicable = sum(app for _, app, _ in per_category)
        total_passed = sum(pas for _, _, pas in per_category)
        return total_passed / total_applicable

    # Weighted mean of per-category pass rates.
    num = 0.0
    den = 0.0
    for cat_name, applicable, passed in per_category:
        w = category_weights.get(cat_name, 1.0)
        num += w * (passed / applicable)
        den += w
    if den == 0:
        # Every applicable category carries zero weight: score is undefined.
        return None
    return num / den
def flatten_checklist(checklist):
@@ -224,9 +262,25 @@ def safe_median(scores):
return round(s[len(s) // 2], 1)
def load_category_weights():
    """Load learned weights from calibration/weights.json next to this script.

    Returns:
        The value stored under the "weights" key (a mapping of category
        name -> weight), or None when the file does not exist or has no
        "weights" key. None means: use uniform flat-question averaging.

    Raises:
        ValueError: if the file exists but its top level, or its "weights"
            entry, is not a JSON object. Failing here gives a clearer error
            than an AttributeError deep inside scoring.
    """
    path = Path(__file__).resolve().parent / "calibration" / "weights.json"
    try:
        # Open directly instead of checking exists() first, so a file that
        # disappears between the check and the open cannot crash the build.
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    if not isinstance(data, dict):
        raise ValueError(f"{path}: expected a JSON object at top level")
    weights = data.get("weights")
    if weights is not None and not isinstance(weights, dict):
        raise ValueError(f"{path}: 'weights' must be a JSON object")
    return weights
+
+
def build():
registry = load_registry()
citation_data = load_citation_graph()
+ category_weights = load_category_weights()
+ if category_weights:
+ print(f"Using learned category weights ({len(category_weights)} categories)")
+ else:
+ print("No calibration/weights.json; using uniform per-question weights")
# Accumulators
papers_full = []
@@ -291,7 +345,7 @@ def build():
is_benchmark_paper = "benchmark-eval" in reg_tags
is_calibration = is_reference or is_benchmark_paper
- overall = compute_overall_score(checklist)
+ overall = compute_overall_score(checklist, category_weights)
if overall is None:
continue
diff --git a/scripts/calibration/anchors.yaml b/scripts/calibration/anchors.yaml
@@ -0,0 +1,118 @@
+# Calibration anchor set for rubric weight learning.
+#
+# Each anchor = a paper ID + a target score band [low, high] + a rationale.
+# Bands are RANGES, not exact scores. Label what you believe is true; let the
+# optimizer fit weights that respect those beliefs.
+#
+# Guidelines for labeling:
+# - 0-20 "bad": methodologically broken (fraud, unsupported causal claims,
+# trivial sample, industry overview with zero rigor)
+# - 20-40 "weak": real but underpowered or overclaimed
+# - 40-60 "typical": median-of-field. Rigor varies; nothing disqualifying
+# - 60-80 "good": clearly rigorous, transparent, reproducible
+# - 80-95 "excellent": landmark methodology papers, meta-analyses, tight
+# design + full artifact release
+#
+# Aim for 15-25 anchors spread across the range. Too few and the optimizer
+# overfits. Too many of one band and you push everything toward the middle.
+#
+# Run fit-weights.py after edits; it writes weights.json next to this file.
+
+anchors:
+
+ # =====================================================================
+ # Known-bad (0-20): disqualifying flaws visible from the paper itself
+ # =====================================================================
+ - id: wakefield-ileal-lymphoid-1998
+ band: [0, 15]
+ rationale: Retracted MMR fraud. Causal claim from N=12 case series, no
+ control, selection bias, undisclosed COI, no mechanism, no prior
+ plausibility. Should be a score floor.
+
+ # Candidates pulled from the bottom of the score distribution. You pick
+ # which are genuinely "known-bad" (vs just thin/industry pieces). Keep
+ # those you're sure about; delete or move the rest.
+ # - id: ai-driven-software-engineering-2023
+ # band: [0, 15]
+ # rationale: Short opinion/overview piece with no empirical content.
+ # - id: precedentbased-professional-role-2025
+ # band: [0, 20]
+ # rationale: TODO your read
+ # - id: attacking-llms-ai-2025
+ # band: [0, 20]
+ # rationale: TODO your read
+
+ # =====================================================================
+ # Known-good (60-92): rigorous, landmark, or methodology reference
+ # =====================================================================
+ - id: attention-is-all-you-need-2017
+ band: [70, 85]
+ rationale: Foundational transformer architecture. Clear methods, clear
+ contribution, extensive ablations (given era). Currently scored 52.8
+ which conflates "methodology reporting" with "limited artifact
+ practice of its era" - should clearly outrank Wakefield.
+
+ - id: bert-pretraining-deep-2018
+ band: [70, 85]
+ rationale: Landmark pre-training paper. Careful ablations, public model,
+ reproducible. Currently 55.0.
+
+ - id: deep-rl-matters-2018
+ band: [80, 92]
+ rationale: Rigorous meta-analysis of RL reproducibility problems. Sets the
+ standard for methodology critique. Currently 91.2 (keep as high-band
+ anchor, the rubric already treats it well).
+
+ - id: show-your-work-2019
+ band: [80, 92]
+ rationale: Improved Reporting of Experimental Results. The paper advocating
+ for rigor is itself high-rigor. Currently 91.4.
+
+ - id: alphacode-competition-level-2022
+ band: [75, 90]
+ rationale: Thorough evaluation, clear methodology, DeepMind scale of
+ reporting. Currently 85.7.
+
+ - id: arc-measure-intelligence-2019
+ band: [70, 85]
+ rationale: Chollet's conceptual landmark on what intelligence measurement
+ requires. Currently 64.7.
+
+ - id: react-synergizing-reasoning-2022
+ band: [60, 80]
+ rationale: Enabled the modern agentic era. Methodologically solid for
+ its contribution type. Currently 48.2.
+
+ # =====================================================================
+ # Middling (40-60): typical papers at the corpus median
+ # =====================================================================
+ # TODO: hand-pick 3-5 papers you consider representative of the middle
+ # of the field. Picking median-scoring papers as middling anchors helps
+ # the optimizer avoid collapsing everything to extremes.
+ #
+ # Example candidates (you confirm they're genuinely typical):
+ # - id: codebert-pretrained-model-2020 # currently 52.5
+ # band: [45, 60]
+ # - id: toolformer-language-models-2023
+ # band: [40, 60]
+
+# =====================================================================
+# Pairwise ordering constraints (soft, in addition to bands)
+# =====================================================================
+# Each pair is [lower, higher]: after fitting, the first paper should score
+# at least pair_margin points below the second. The optimizer applies a
+# squared hinge penalty when it doesn't. Fill in as you add anchors.
+pairs:
+ - [wakefield-ileal-lymphoid-1998, attention-is-all-you-need-2017]
+ - [wakefield-ileal-lymphoid-1998, bert-pretraining-deep-2018]
+ - [wakefield-ileal-lymphoid-1998, deep-rl-matters-2018]
+ - [wakefield-ileal-lymphoid-1998, show-your-work-2019]
+
+# Optimization settings. Leave defaults unless you know why you're changing.
+settings:
+ level: category # Only "category" (14 params) is implemented; fit-weights.py does not read this key yet. "question" (~60 params) is a planned option.
+ min_weight: 0.0
+ max_weight: 5.0
+ l2_reg: 0.1 # Penalty against deviating from uniform weights.
+ pair_margin: 20.0 # Desired separation between pairs (in score pts).
+ pair_penalty: 2.0 # Weight on pair ordering violations vs band fit.
+ seed: 42
diff --git a/scripts/calibration/fit-weights.py b/scripts/calibration/fit-weights.py
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Fit per-category weights for the rubric so that a small set of labeled
+anchor papers score within their target bands.
+
+Inputs:
+ scripts/calibration/anchors.yaml - Hand-labeled anchor set
+ papers/<id>/scan.json - Scan data for each anchor paper
+
+Output:
+ scripts/calibration/weights.json - Learned weights + metadata
+
+Usage:
+ python3 scripts/calibration/fit-weights.py
+
+Apply via build-explorer-data.py:
+ If scripts/calibration/weights.json exists, compute_overall_score uses
+ its per-category weights. Otherwise it falls back to the flat
+ per-question average (current behavior).
+"""
+
+import json
+import sys
+from pathlib import Path
+
+try:
+ import yaml
+except ImportError:
+ sys.stderr.write(
+ "pyyaml not installed. Run: pip install pyyaml\n"
+ "Or port anchors.yaml to JSON and adjust this loader.\n"
+ )
+ sys.exit(1)
+
+try:
+ from scipy.optimize import minimize
+ import numpy as np
+except ImportError:
+ sys.stderr.write(
+ "scipy and numpy required. Run: pip install scipy numpy\n"
+ )
+ sys.exit(1)
+
+
+ROOT = Path(__file__).resolve().parent.parent.parent
+PAPERS_DIR = ROOT / "papers"
+ANCHORS_PATH = Path(__file__).resolve().parent / "anchors.yaml"
+OUT_PATH = Path(__file__).resolve().parent / "weights.json"
+
+
+# Same 14 categories as build-explorer-data.py ALL_CATEGORIES order.
+CATEGORIES = [
+ "artifacts",
+ "statistical_methodology",
+ "evaluation_design",
+ "claims_and_evidence",
+ "setup_transparency",
+ "limitations_and_scope",
+ "data_integrity",
+ "conflicts_of_interest",
+ "contamination",
+ "human_studies",
+ "cost_and_practicality",
+ "experimental_rigor",
+ "data_leakage",
+ "survey_methodology",
+]
+
+
def load_anchors():
    """Parse anchors.yaml (ANCHORS_PATH) and return its top-level content.

    Returns:
        Whatever yaml.safe_load produces for the file, or an empty dict when
        the file is empty or contains only comments (safe_load returns None
        in that case, which would otherwise break callers that use .get()).
    """
    # Explicit encoding so rationale text with non-ASCII characters parses
    # the same regardless of the platform's default locale.
    with open(ANCHORS_PATH, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    return data or {}
+
+
def load_scan(paper_id):
    """Load papers/<paper_id>/scan.json.

    Args:
        paper_id: directory name of the paper under PAPERS_DIR.

    Returns:
        The parsed JSON content, or None when the file does not exist.
    """
    path = PAPERS_DIR / paper_id / "scan.json"
    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return None
+
+
def category_counts(checklist):
    """Tally applicable and passed questions for every entry of CATEGORIES.

    Returns a dict mapping each category name to an ``(applicable, passed)``
    tuple. A category that is absent from ``checklist`` or is not a dict
    yields ``(0, 0)``. A question counts as applicable when it is a dict with
    a truthy "applies" value, and as passed when it additionally has a
    truthy "answer" value.
    """
    tallies = {}
    for name in CATEGORIES:
        section = checklist.get(name, {})
        questions = section.values() if isinstance(section, dict) else ()
        applicable = [
            q for q in questions if isinstance(q, dict) and q.get("applies")
        ]
        n_passed = sum(1 for q in applicable if q.get("answer"))
        tallies[name] = (len(applicable), n_passed)
    return tallies
+
+
def score_with_weights(counts, weights):
    """Weighted mean of per-category pass rates, scaled to 0-100.

    ``weights`` is positional and aligned with CATEGORIES; ``counts`` maps
    each category to ``(applicable, passed)``. Categories with no applicable
    questions are left out of both the numerator and the denominator, so
    they do not pull the score toward zero. Returns 0.0 when the weights of
    the remaining categories sum to zero.
    """
    weighted_sum = 0.0
    weight_total = 0.0
    for name, weight in zip(CATEGORIES, weights):
        applicable, passed = counts[name]
        if applicable:
            weighted_sum += weight * (passed / applicable)
            weight_total += weight
    if weight_total == 0:
        return 0.0
    return 100.0 * (weighted_sum / weight_total)
+
+
def loss(weights, anchors_data, pairs_data, settings):
    """Objective for the weight fit: band fit + pair ordering + L2 prior.

    Terms, accumulated in this order:
      1. Per anchor: squared distance of the predicted score from the band
         midpoint, plus half the squared overshoot when the score falls
         outside [low, high].
      2. Per pair [lower, higher]: ``pair_penalty`` times the squared
         shortfall when the higher paper leads by less than ``pair_margin``.
         Pairs naming an id that is not among the anchors are ignored.
      3. Per weight: ``l2_reg`` times the squared deviation from 1.0
         (uniform weights).

    Each anchor must carry "id", "band" and "_counts". Missing settings fall
    back to pair_margin=20.0, pair_penalty=2.0, l2_reg=0.1.
    """
    margin = settings.get("pair_margin", 20.0)
    pair_weight = settings.get("pair_penalty", 2.0)
    reg_strength = settings.get("l2_reg", 0.1)

    total = 0.0
    predicted = {}

    # 1. Band fit.
    for entry in anchors_data:
        paper_id = entry["id"]
        low, high = entry["band"]
        score = score_with_weights(entry["_counts"], weights)
        predicted[paper_id] = score
        midpoint = (low + high) / 2
        total += (score - midpoint) ** 2
        # Extra soft penalty once the score leaves the band.
        if score < low:
            total += (low - score) ** 2 * 0.5
        if score > high:
            total += (score - high) ** 2 * 0.5

    # 2. Pair ordering: first id should trail the second by >= margin.
    for lower_id, higher_id in pairs_data:
        if not (lower_id in predicted and higher_id in predicted):
            continue
        shortfall = margin - (predicted[higher_id] - predicted[lower_id])
        if shortfall > 0:
            total += pair_weight * shortfall ** 2

    # 3. L2 pull toward uniform weights (1.0 per category).
    for weight in weights:
        total += reg_strength * (weight - 1.0) ** 2

    return total
+
+
def main():
    """Fit category weights to the anchors, print a report, write weights.json.

    Steps: load anchors.yaml, attach per-category counts from each anchor's
    scan.json (anchors without one are skipped with a message), minimize
    ``loss`` with L-BFGS-B starting from uniform weights, print the learned
    weights plus band and pair checks, then write the result to OUT_PATH.

    Exits with status 1 when no anchor has a scan.json. A fit that did not
    converge is still written (with "converged": false) but triggers a
    warning on stderr.
    """
    data = load_anchors() or {}
    # A YAML key with nothing after it parses as None, so fall back with
    # ``or`` rather than relying on the .get() default.
    settings = data.get("settings") or {}
    labeled = data.get("anchors") or []
    pairs = data.get("pairs") or []
    pair_margin = settings.get("pair_margin", 20.0)

    # Attach scan data to each anchor; keep only anchors that have it.
    anchors = []
    for anchor in labeled:
        pid = anchor["id"]
        scan = load_scan(pid)
        if scan is None:
            sys.stderr.write(f"SKIP: no scan.json for {pid}\n")
            continue
        anchor["_counts"] = category_counts(scan.get("checklist", {}))
        anchors.append(anchor)

    if not anchors:
        sys.stderr.write("No usable anchors.\n")
        sys.exit(1)

    # Warn on the number of anchors that actually enter the fit, not the
    # number labeled: anchors without a scan.json contribute nothing.
    if len(anchors) < 5:
        sys.stderr.write(
            f"WARNING: only {len(anchors)} usable anchors "
            f"({len(labeled)} labeled). Add more to "
            f"anchors.yaml before trusting the fit (aim for 15+).\n"
        )

    # Seeds NumPy's global RNG. The fit below starts from a fixed x0 and
    # draws no random numbers itself.
    np.random.seed(settings.get("seed", 42))
    x0 = np.ones(len(CATEGORIES))  # Start at uniform weights

    weight_bounds = (
        settings.get("min_weight", 0.0),
        settings.get("max_weight", 5.0),
    )
    result = minimize(
        loss,
        x0,
        args=(anchors, pairs, settings),
        method="L-BFGS-B",
        bounds=[weight_bounds] * len(CATEGORIES),
        options={"maxiter": 500},
    )
    if not result.success:
        sys.stderr.write(
            f"WARNING: optimizer did not converge: {result.message}\n"
        )

    weights = result.x.tolist()
    weight_map = dict(zip(CATEGORIES, [round(w, 4) for w in weights]))

    # Predicted score per anchor, reused by both report sections below.
    scores_by_id = {
        a["id"]: score_with_weights(a["_counts"], weights) for a in anchors
    }

    # Report
    print("=" * 70)
    print("LEARNED WEIGHTS")
    print("=" * 70)
    for cat, w in weight_map.items():
        bar = "#" * int(w * 10)
        print(f" {cat:<28} {w:>6.3f} {bar}")
    print()

    print("=" * 70)
    print("ANCHOR SCORES (predicted after fit)")
    print("=" * 70)
    for anchor in anchors:
        pid = anchor["id"]
        band = anchor["band"]
        pred = score_with_weights(anchor["_counts"], weights)
        in_band = "OK" if band[0] <= pred <= band[1] else "**"
        print(f" {in_band} {pred:>6.1f} target {band[0]:>3}-{band[1]:<3} {pid}")
    print()

    print("=" * 70)
    print("PAIR ORDERING CHECK")
    print("=" * 70)
    for lo_id, hi_id in pairs:
        if lo_id not in scores_by_id or hi_id not in scores_by_id:
            print(f" SKIP {lo_id} < {hi_id} (missing scan)")
            continue
        gap = scores_by_id[hi_id] - scores_by_id[lo_id]
        ok = "OK" if gap >= pair_margin else "**"
        print(
            f" {ok} {scores_by_id[lo_id]:>5.1f} < {scores_by_id[hi_id]:<5.1f} "
            f"(gap {gap:+.1f}) {lo_id} < {hi_id}"
        )
    print()

    out = {
        "weights": weight_map,
        "n_anchors": len(anchors),
        "n_pairs": len(pairs),
        "loss": float(result.fun),
        "converged": bool(result.success),
        "settings": settings,
    }
    with open(OUT_PATH, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    print(f"Wrote {OUT_PATH}")
+
+
+if __name__ == "__main__":
+ main()