ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit 47067ff2e58055add7db030be4ee682e0cd01f43
parent 4b8436506afa1c261f8cd6e046caa136ce386732
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 14 Apr 2026 22:24:55 +0200

partition calibration (reference-benchmark) specimens

Registry entries tagged "reference-benchmark" (currently Wakefield 1998
and Ioannidis 2005; only Wakefield has been scanned so far) are now
excluded from the agentic-AI corpus aggregates entirely. They are still
scored per paper and still get an individual papers/{slug}.json written
(so detail pages work), but they no longer contribute to:

- total_papers / dash.n
- median / mean / full_reproducibility_pct
- histogram, category_rates, year_trends, tag_counts
- archetype_counts, game_counts / game_pcts
- venue_scores, citation_band_scores, funding_groups
- tensions (claim classification)
- papers-index.json (hidden from the papers explorer)

Effect: n = 1530 (was 1531), median unchanged at 49.1, and
full_reproducibility_pct 4.0 -> 4.1 (Wakefield's 0% full-reproducibility
weight removed).

New output: calibration.json, which lists the calibration specimens
under a "papers" key. Each entry is the paper's full detail record plus
a calibration_notes field carrying the registry notes, so the consumer
can explain each specimen's purpose.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M scripts/build-explorer-data.py | 33 +++++++++++++++++++++++++--------
1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -232,6 +232,11 @@ def build(): papers_full = [] papers_index = [] paper_details = {} + # Calibration specimens (reference-benchmark tagged): scored the same way + # but partitioned out of all aggregate statistics so they don't skew the + # agentic-AI corpus numbers. Individual detail JSONs still written so + # /sigint/papers/{slug} works; listing lives in calibration.json. + calibration_papers = [] all_scores = [] cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) year_scores = defaultdict(list) @@ -276,6 +281,7 @@ def build(): reg_entry = registry.get(paper_id, {}) metadata = load_metadata(paper_id) hn_data = load_hn(paper_id) + is_calibration = "reference-benchmark" in (reg_entry.get("tags") or []) overall = compute_overall_score(checklist) if overall is None: @@ -317,8 +323,8 @@ def build(): # Code URL extraction code_url = extract_code_url(checklist) - # Only empirical papers feed into findings aggregations - if is_empirical: + # Only empirical, non-calibration papers feed into findings aggregations + if is_empirical and not is_calibration: total_papers += 1 all_scores.append(score_pct) year_scores[year].append(score_pct) @@ -331,8 +337,8 @@ def build(): claims = scan.get("claims", []) red_flags = scan.get("red_flags", []) - # All remaining aggregations are empirical-only - if not is_empirical: + # All remaining aggregations are empirical-only and skip calibration + if not is_empirical or is_calibration: pass # skip to index/detail construction below else: # Category + question aggregations @@ -387,8 +393,8 @@ def build(): else: funding_groups["not_disclosed"].append(score_pct) - # Tension classification (empirical papers only) - if is_empirical: + # Tension classification (empirical papers only, calibration excluded) + if is_empirical and not is_calibration: for claim in claims: ct = claim.get("claim", "").lower() entry = {"paper_id": paper_id, "claim": 
claim["claim"], @@ -469,7 +475,8 @@ def build(): "drama_conflict", "demo_ability", "brand_recognition"] ] if scan.get("engagement_factors") else None, } - papers_index.append(index_entry) + if not is_calibration: + papers_index.append(index_entry) # Full detail detail = { @@ -484,8 +491,17 @@ def build(): "hn_threads": hn_data.get("threads", []), "engagement_factors": scan.get("engagement_factors"), } + # Individual JSON gets written for everyone, so /sigint/papers/{slug} + # continues to work. Only non-calibration contributes to the aggregates. paper_details[paper_id] = detail - papers_full.append(detail) + if is_calibration: + # Carry the registry notes through so the consumer can explain + # why each specimen is in the corpus without re-deriving it. + cal_detail = dict(detail) + cal_detail["calibration_notes"] = reg_entry.get("notes", "") + calibration_papers.append(cal_detail) + else: + papers_full.append(detail) # --- Dashboard aggregations --- all_scores.sort() @@ -1264,6 +1280,7 @@ def build(): write_json(OUTPUT_DIR / "papers-index.json", papers_index) write_json(OUTPUT_DIR / "network.json", network) write_json(OUTPUT_DIR / "tensions.json", tensions) + write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers}) for slug, detail in paper_details.items(): write_json(papers_detail_dir / f"{slug}.json", detail)

Impressum · Datenschutz