partition calibration (reference-benchmark) specimens - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit 47067ff2e58055add7db030be4ee682e0cd01f43
parent 4b8436506afa1c261f8cd6e046caa136ce386732
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 14 Apr 2026 22:24:55 +0200

partition calibration (reference-benchmark) specimens

Registry entries tagged "reference-benchmark" (currently Wakefield 1998
and Ioannidis 2005, of which only Wakefield has been scanned so far) now
skip the agentic-AI corpus aggregates entirely. They are still scored
per paper and still get an individual papers/{slug}.json written (so
detail pages work), but they no longer contribute to:

- total_papers / dash.n
- median / mean / full_reproducibility_pct
- histogram, category_rates, year_trends, tag_counts
- archetype_counts, game_counts / game_pcts
- venue_scores, citation_band_scores, funding_groups
- tensions (claim classification)
- papers-index.json (hidden from the papers explorer)

Effect: n = 1530 (was 1531), median unchanged at 49.1,
full_reproducibility_pct 4.0 -> 4.1 (Wakefield's 0% full-reproducibility
contribution is no longer counted).

New output: calibration.json, which lists the calibration specimens with
their full detail plus a calibration_notes field carrying the registry
notes, so the consumer can explain each specimen's purpose.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M scripts/build-explorer-data.py  | 33 +++++++++++++++++++++++++--------

1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -232,6 +232,11 @@ def build():
     papers_full = []
     papers_index = []
     paper_details = {}
+    # Calibration specimens (reference-benchmark tagged): scored the same way
+    # but partitioned out of all aggregate statistics so they don't skew the
+    # agentic-AI corpus numbers. Individual detail JSONs still written so
+    # /sigint/papers/{slug} works; listing lives in calibration.json.
+    calibration_papers = []
     all_scores = []
     cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
     year_scores = defaultdict(list)
@@ -276,6 +281,7 @@ def build():
         reg_entry = registry.get(paper_id, {})
         metadata = load_metadata(paper_id)
         hn_data = load_hn(paper_id)
+        is_calibration = "reference-benchmark" in (reg_entry.get("tags") or [])
 
         overall = compute_overall_score(checklist)
         if overall is None:
@@ -317,8 +323,8 @@ def build():
         # Code URL extraction
         code_url = extract_code_url(checklist)
 
-        # Only empirical papers feed into findings aggregations
-        if is_empirical:
+        # Only empirical, non-calibration papers feed into findings aggregations
+        if is_empirical and not is_calibration:
             total_papers += 1
             all_scores.append(score_pct)
             year_scores[year].append(score_pct)
@@ -331,8 +337,8 @@ def build():
         claims = scan.get("claims", [])
         red_flags = scan.get("red_flags", [])
 
-        # All remaining aggregations are empirical-only
-        if not is_empirical:
+        # All remaining aggregations are empirical-only and skip calibration
+        if not is_empirical or is_calibration:
             pass  # skip to index/detail construction below
         else:
             # Category + question aggregations
@@ -387,8 +393,8 @@ def build():
                 else:
                     funding_groups["not_disclosed"].append(score_pct)
 
-        # Tension classification (empirical papers only)
-        if is_empirical:
+        # Tension classification (empirical papers only, calibration excluded)
+        if is_empirical and not is_calibration:
           for claim in claims:
             ct = claim.get("claim", "").lower()
             entry = {"paper_id": paper_id, "claim": claim["claim"],
@@ -469,7 +475,8 @@ def build():
                             "drama_conflict", "demo_ability", "brand_recognition"]
                            ] if scan.get("engagement_factors") else None,
         }
-        papers_index.append(index_entry)
+        if not is_calibration:
+            papers_index.append(index_entry)
 
         # Full detail
         detail = {
@@ -484,8 +491,17 @@ def build():
             "hn_threads": hn_data.get("threads", []),
             "engagement_factors": scan.get("engagement_factors"),
         }
+        # Individual JSON gets written for everyone, so /sigint/papers/{slug}
+        # continues to work. Only non-calibration contributes to the aggregates.
         paper_details[paper_id] = detail
-        papers_full.append(detail)
+        if is_calibration:
+            # Carry the registry notes through so the consumer can explain
+            # why each specimen is in the corpus without re-deriving it.
+            cal_detail = dict(detail)
+            cal_detail["calibration_notes"] = reg_entry.get("notes", "")
+            calibration_papers.append(cal_detail)
+        else:
+            papers_full.append(detail)
 
     # --- Dashboard aggregations ---
     all_scores.sort()
@@ -1264,6 +1280,7 @@ def build():
     write_json(OUTPUT_DIR / "papers-index.json", papers_index)
     write_json(OUTPUT_DIR / "network.json", network)
     write_json(OUTPUT_DIR / "tensions.json", tensions)
+    write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers})
 
     for slug, detail in paper_details.items():
         write_json(papers_detail_dir / f"{slug}.json", detail)

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs