commit 5ad6af87a22aa18f92dac25f1979ae94c66367bb
parent 47067ff2e58055add7db030be4ee682e0cd01f43
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Tue, 14 Apr 2026 22:34:32 +0200
partition benchmark-eval + tag Attention as reference-benchmark
Two things:
1. attention-is-all-you-need-2017 now tagged reference-benchmark
(keeps 'landmark' too). Foundational transformer paper used as a
rubric anchor like Wakefield and Ioannidis.
2. Papers tagged benchmark-eval now partitioned from aggregates too.
Rationale: they introduce benchmarks used BY the field, so they're
reference material rather than subjects of the same kind of rubric
evaluation. 5 papers affected.
Output: new benchmarks.json alongside calibration.json.
Effect on dashboard: n = 1530 -> 1524 (Attention + the 5 benchmark-eval papers), median unchanged at 49.1.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Diffstat:
2 files changed, 19 insertions(+), 7 deletions(-)
diff --git a/registry.jsonl b/registry.jsonl
@@ -2645,7 +2645,7 @@
{"id": "beyond-token-probes-2025", "title": "Beyond Token Probes: Hallucination Detection via Activation Tensors with ACT-ViT", "authors": ["Guy Bar-Shalom", "Fabrizio Frasca", "Yaniv Galron", "Yftah Ziser", "Haggai Maron"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2510.00296", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Detecting hallucinations in Large Language Model-generated text is crucial for their safe deployment. While probing classifiers show promise, they operate on isolated layer-token pairs and are LLM-spe", "arxiv_id": "2510.00296", "doi": "10.48550/arXiv.2510.00296", "directory": "papers/beyond-token-probes-2025"}
{"id": "sources-hallucination-by-2023", "title": "Sources of Hallucination by Large Language Models on Inference Tasks", "authors": ["Nick McKenna", "Tianyi Li", "Liang Cheng", "Mohammad Javad Hosseini", "Mark Johnson"], "year": 2023, "venue": "Conference on Empirical Methods in Natural Language Processing", "source_url": "https://arxiv.org/abs/2305.14552", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large Language Models (LLMs) are claimed to be capable of Natural Language Inference (NLI), necessary for applied tasks like question answering and summarization. We present a series of behavioral stu", "arxiv_id": "2305.14552", "doi": "10.48550/arXiv.2305.14552", "directory": "papers/sources-hallucination-by-2023"}
{"id": "llm-agentic-approach-2025", "title": "An LLM Agentic Approach for Legal-Critical Software: A Case Study for Tax Prep Software", "authors": ["Sina Gogani-Khiabani", "Ashutosh Trivedi", "Diptikalyan Saha", "Saeid Tizpaz-Niari"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2509.13471", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large language models (LLMs) show promise for translating natural-language statutes into executable logic, but reliability in legally critical settings remains challenging due to ambiguity and halluci", "arxiv_id": "2509.13471", "doi": "10.1145/3744916.3764575", "directory": "papers/llm-agentic-approach-2025"}
-{"id": "attention-is-all-you-need-2017", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"], "year": 2017, "venue": "NeurIPS 2017", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational transformer architecture paper. 100k+ citations. Basis for all LLMs.", "arxiv_id": "1706.03762", "doi": "10.48550/arXiv.1706.03762", "directory": "papers/attention-is-all-you-need-2017"}
+{"id": "attention-is-all-you-need-2017", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"], "year": 2017, "venue": "NeurIPS 2017", "source": "manual", "status": "scanned", "tags": ["landmark", "reference-benchmark"], "added": "2026-03-05", "notes": "Foundational transformer architecture paper. Pre-dates the agentic AI corpus; included as a reference-benchmark for the rubric. Not an example of bad methodology (unlike Wakefield) — more a landmark anchor.", "arxiv_id": "1706.03762", "doi": "10.48550/arXiv.1706.03762", "directory": "papers/attention-is-all-you-need-2017"}
{"id": "bert-pretraining-deep-2018", "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], "year": 2018, "venue": "NAACL 2019", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational pre-training paper. Basis for CodeBERT and downstream code models.", "arxiv_id": "1810.04805", "doi": "10.48550/arXiv.1810.04805", "directory": "papers/bert-pretraining-deep-2018"}
{"id": "sparks-agi-early-2023", "title": "Sparks of Artificial General Intelligence: Early Experiments with GPT-4", "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan", "Johannes Gehrke", "Eric Horvitz", "Ece Kamar", "Peter Lee", "Yin Tat Lee", "Yuanzhi Li", "Scott Lundberg"], "year": 2023, "venue": "arXiv preprint", "source": "manual", "status": "downloaded", "tags": ["landmark"], "added": "2026-03-05", "notes": "Most-discussed GPT-4 capabilities analysis. Massive tech media coverage.", "arxiv_id": "2303.12528", "doi": "10.48550/arXiv.2303.12528", "directory": "papers/sparks-agi-early-2023"}
{"id": "react-synergizing-reasoning-2022", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], "year": 2022, "venue": "ICLR 2023", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundation of agent reasoning+acting paradigm. Cited by virtually every agentic AI paper.", "arxiv_id": "2210.03629", "doi": "10.48550/arXiv.2210.03629", "directory": "papers/react-synergizing-reasoning-2022"}
diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py
@@ -232,11 +232,16 @@ def build():
papers_full = []
papers_index = []
paper_details = {}
- # Calibration specimens (reference-benchmark tagged): scored the same way
- # but partitioned out of all aggregate statistics so they don't skew the
- # agentic-AI corpus numbers. Individual detail JSONs still written so
- # /sigint/papers/{slug} works; listing lives in calibration.json.
+ # Partitioned papers: scored the same way but kept out of aggregates and
+ # papers-index so they don't skew the agentic-AI corpus numbers. Individual
+ # detail JSONs still written so /sigint/papers/{slug} works.
+ # reference-benchmark: rubric calibration specimens (Wakefield, Ioannidis,
+ # Attention). Listed in calibration.json.
+ # benchmark-eval: papers that introduce benchmarks used BY the field.
+ # They're reference material, not subjects being
+ # evaluated. Listed in benchmarks.json.
calibration_papers = []
+ benchmark_papers = []
all_scores = []
cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
year_scores = defaultdict(list)
@@ -281,7 +286,10 @@ def build():
reg_entry = registry.get(paper_id, {})
metadata = load_metadata(paper_id)
hn_data = load_hn(paper_id)
- is_calibration = "reference-benchmark" in (reg_entry.get("tags") or [])
+ reg_tags = reg_entry.get("tags") or []
+ is_reference = "reference-benchmark" in reg_tags
+ is_benchmark_paper = "benchmark-eval" in reg_tags
+ is_calibration = is_reference or is_benchmark_paper
overall = compute_overall_score(checklist)
if overall is None:
@@ -499,7 +507,10 @@ def build():
# why each specimen is in the corpus without re-deriving it.
cal_detail = dict(detail)
cal_detail["calibration_notes"] = reg_entry.get("notes", "")
- calibration_papers.append(cal_detail)
+ if is_reference:
+ calibration_papers.append(cal_detail)
+ else: # benchmark-eval only (not also reference-benchmark)
+ benchmark_papers.append(cal_detail)
else:
papers_full.append(detail)
@@ -1281,6 +1292,7 @@ def build():
write_json(OUTPUT_DIR / "network.json", network)
write_json(OUTPUT_DIR / "tensions.json", tensions)
write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers})
+ write_json(OUTPUT_DIR / "benchmarks.json", {"papers": benchmark_papers})
for slug, detail in paper_details.items():
write_json(papers_detail_dir / f"{slug}.json", detail)