commit 47067ff2e58055add7db030be4ee682e0cd01f43
parent 4b8436506afa1c261f8cd6e046caa136ce386732
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 14 Apr 2026 22:24:55 +0200
partition calibration (reference-benchmark) specimens
Registry entries tagged "reference-benchmark" (currently Wakefield 1998
and Ioannidis 2005; only the former has been scanned so far) are now
excluded from the agentic-AI corpus aggregates entirely. They are still
scored per paper and still get an individual papers/{slug}.json written
(so detail pages keep working), but they no longer contribute to:
- total_papers / dash.n
- median / mean / full_reproducibility_pct
- histogram, category_rates, year_trends, tag_counts
- archetype_counts, game_counts / game_pcts
- venue_scores, citation_band_scores, funding_groups
- tensions (claim classification)
- papers-index.json (hidden from the papers explorer)
Effect: n = 1530 (was 1531), median unchanged at 49.1, and
full_reproducibility_pct moves 4.0 -> 4.1 (Wakefield is not fully
reproducible, so dropping it shrinks only the denominator).
New output: calibration.json, which lists the calibration specimens under
a "papers" key with their full detail plus a calibration_notes field
carrying the registry notes, so the consumer can explain each specimen's
purpose.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -232,6 +232,11 @@ def build():
papers_full = []
papers_index = []
paper_details = {}
+ # Calibration specimens (reference-benchmark tagged): scored the same way
+ # but partitioned out of all aggregate statistics so they don't skew the
+ # agentic-AI corpus numbers. Individual detail JSONs still written so
+ # /sigint/papers/{slug} works; listing lives in calibration.json.
+ calibration_papers = []
all_scores = []
cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
year_scores = defaultdict(list)
@@ -276,6 +281,7 @@ def build():
reg_entry = registry.get(paper_id, {})
metadata = load_metadata(paper_id)
hn_data = load_hn(paper_id)
+ is_calibration = "reference-benchmark" in (reg_entry.get("tags") or [])
overall = compute_overall_score(checklist)
if overall is None:
@@ -317,8 +323,8 @@ def build():
# Code URL extraction
code_url = extract_code_url(checklist)
- # Only empirical papers feed into findings aggregations
- if is_empirical:
+ # Only empirical, non-calibration papers feed into findings aggregations
+ if is_empirical and not is_calibration:
total_papers += 1
all_scores.append(score_pct)
year_scores[year].append(score_pct)
@@ -331,8 +337,8 @@ def build():
claims = scan.get("claims", [])
red_flags = scan.get("red_flags", [])
- # All remaining aggregations are empirical-only
- if not is_empirical:
+ # All remaining aggregations are empirical-only and skip calibration
+ if not is_empirical or is_calibration:
pass # skip to index/detail construction below
else:
# Category + question aggregations
@@ -387,8 +393,8 @@ def build():
else:
funding_groups["not_disclosed"].append(score_pct)
- # Tension classification (empirical papers only)
- if is_empirical:
+ # Tension classification (empirical papers only, calibration excluded)
+ if is_empirical and not is_calibration:
for claim in claims:
ct = claim.get("claim", "").lower()
entry = {"paper_id": paper_id, "claim": claim["claim"],
@@ -469,7 +475,8 @@ def build():
"drama_conflict", "demo_ability", "brand_recognition"]
] if scan.get("engagement_factors") else None,
}
- papers_index.append(index_entry)
+ if not is_calibration:
+ papers_index.append(index_entry)
# Full detail
detail = {
@@ -484,8 +491,17 @@ def build():
"hn_threads": hn_data.get("threads", []),
"engagement_factors": scan.get("engagement_factors"),
}
+ # Individual JSON gets written for every paper, so /sigint/papers/{slug}
+ # keeps working. Only non-calibration papers are appended to papers_full.
paper_details[paper_id] = detail
- papers_full.append(detail)
+ if is_calibration:
+ # Carry the registry notes through so the consumer can explain
+ # why each specimen is in the corpus without re-deriving it.
+ cal_detail = dict(detail)
+ cal_detail["calibration_notes"] = reg_entry.get("notes", "")
+ calibration_papers.append(cal_detail)
+ else:
+ papers_full.append(detail)
# --- Dashboard aggregations ---
all_scores.sort()
@@ -1264,6 +1280,7 @@ def build():
write_json(OUTPUT_DIR / "papers-index.json", papers_index)
write_json(OUTPUT_DIR / "network.json", network)
write_json(OUTPUT_DIR / "tensions.json", tensions)
+ write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers})
for slug, detail in paper_details.items():
write_json(papers_detail_dir / f"{slug}.json", detail)