ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit 5ad6af87a22aa18f92dac25f1979ae94c66367bb
parent 47067ff2e58055add7db030be4ee682e0cd01f43
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Tue, 14 Apr 2026 22:34:32 +0200

partition benchmark-eval + tag Attention as reference-benchmark

Two things:

1. attention-is-all-you-need-2017 now tagged reference-benchmark
   (keeps 'landmark' too). Foundational transformer paper used as a
   rubric anchor like Wakefield and Ioannidis.

2. Papers tagged benchmark-eval now partitioned from aggregates too.
   Rationale: they introduce benchmarks used BY the field, so they're
   reference material rather than subjects of the same kind of rubric
   evaluation. 5 papers affected.

Output: new benchmarks.json alongside calibration.json.

Effect on dashboard: n = 1530 -> 1524, median unchanged at 49.1.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M registry.jsonl | 2 +-
M scripts/build-explorer-data.py | 24 ++++++++++++++++++------
2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/registry.jsonl b/registry.jsonl @@ -2645,7 +2645,7 @@ {"id": "beyond-token-probes-2025", "title": "Beyond Token Probes: Hallucination Detection via Activation Tensors with ACT-ViT", "authors": ["Guy Bar-Shalom", "Fabrizio Frasca", "Yaniv Galron", "Yftah Ziser", "Haggai Maron"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2510.00296", "source": "semantic_scholar", "status": "scanned", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Detecting hallucinations in Large Language Model-generated text is crucial for their safe deployment. While probing classifiers show promise, they operate on isolated layer-token pairs and are LLM-spe", "arxiv_id": "2510.00296", "doi": "10.48550/arXiv.2510.00296", "directory": "papers/beyond-token-probes-2025"} {"id": "sources-hallucination-by-2023", "title": "Sources of Hallucination by Large Language Models on Inference Tasks", "authors": ["Nick McKenna", "Tianyi Li", "Liang Cheng", "Mohammad Javad Hosseini", "Mark Johnson"], "year": 2023, "venue": "Conference on Empirical Methods in Natural Language Processing", "source_url": "https://arxiv.org/abs/2305.14552", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large Language Models (LLMs) are claimed to be capable of Natural Language Inference (NLI), necessary for applied tasks like question answering and summarization. 
We present a series of behavioral stu", "arxiv_id": "2305.14552", "doi": "10.48550/arXiv.2305.14552", "directory": "papers/sources-hallucination-by-2023"} {"id": "llm-agentic-approach-2025", "title": "An LLM Agentic Approach for Legal-Critical Software: A Case Study for Tax Prep Software", "authors": ["Sina Gogani-Khiabani", "Ashutosh Trivedi", "Diptikalyan Saha", "Saeid Tizpaz-Niari"], "year": 2025, "venue": "arXiv.org", "source_url": "https://arxiv.org/abs/2509.13471", "source": "semantic_scholar", "status": "downloaded", "tags": [], "added": "2026-02-27", "notes": "Found via Semantic Scholar. Abstract: Large language models (LLMs) show promise for translating natural-language statutes into executable logic, but reliability in legally critical settings remains challenging due to ambiguity and halluci", "arxiv_id": "2509.13471", "doi": "10.1145/3744916.3764575", "directory": "papers/llm-agentic-approach-2025"} -{"id": "attention-is-all-you-need-2017", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"], "year": 2017, "venue": "NeurIPS 2017", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational transformer architecture paper. 100k+ citations. Basis for all LLMs.", "arxiv_id": "1706.03762", "doi": "10.48550/arXiv.1706.03762", "directory": "papers/attention-is-all-you-need-2017"} +{"id": "attention-is-all-you-need-2017", "title": "Attention Is All You Need", "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N. Gomez", "Lukasz Kaiser", "Illia Polosukhin"], "year": 2017, "venue": "NeurIPS 2017", "source": "manual", "status": "scanned", "tags": ["landmark", "reference-benchmark"], "added": "2026-03-05", "notes": "Foundational transformer architecture paper. 
Pre-dates the agentic AI corpus; included as a reference-benchmark for the rubric. Not an example of bad methodology (unlike Wakefield) — more a landmark anchor.", "arxiv_id": "1706.03762", "doi": "10.48550/arXiv.1706.03762", "directory": "papers/attention-is-all-you-need-2017"} {"id": "bert-pretraining-deep-2018", "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], "year": 2018, "venue": "NAACL 2019", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundational pre-training paper. Basis for CodeBERT and downstream code models.", "arxiv_id": "1810.04805", "doi": "10.48550/arXiv.1810.04805", "directory": "papers/bert-pretraining-deep-2018"} {"id": "sparks-agi-early-2023", "title": "Sparks of Artificial General Intelligence: Early Experiments with GPT-4", "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan", "Johannes Gehrke", "Eric Horvitz", "Ece Kamar", "Peter Lee", "Yin Tat Lee", "Yuanzhi Li", "Scott Lundberg"], "year": 2023, "venue": "arXiv preprint", "source": "manual", "status": "downloaded", "tags": ["landmark"], "added": "2026-03-05", "notes": "Most-discussed GPT-4 capabilities analysis. Massive tech media coverage.", "arxiv_id": "2303.12528", "doi": "10.48550/arXiv.2303.12528", "directory": "papers/sparks-agi-early-2023"} {"id": "react-synergizing-reasoning-2022", "title": "ReAct: Synergizing Reasoning and Acting in Language Models", "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], "year": 2022, "venue": "ICLR 2023", "source": "manual", "status": "scanned", "tags": ["landmark"], "added": "2026-03-05", "notes": "Foundation of agent reasoning+acting paradigm. 
Cited by virtually every agentic AI paper.", "arxiv_id": "2210.03629", "doi": "10.48550/arXiv.2210.03629", "directory": "papers/react-synergizing-reasoning-2022"} diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -232,11 +232,16 @@ def build(): papers_full = [] papers_index = [] paper_details = {} - # Calibration specimens (reference-benchmark tagged): scored the same way - # but partitioned out of all aggregate statistics so they don't skew the - # agentic-AI corpus numbers. Individual detail JSONs still written so - # /sigint/papers/{slug} works; listing lives in calibration.json. + # Partitioned papers: scored the same way but kept out of aggregates and + # papers-index so they don't skew the agentic-AI corpus numbers. Individual + # detail JSONs still written so /sigint/papers/{slug} works. + # reference-benchmark: rubric calibration specimens (Wakefield, Ioannidis, + # Attention). Listed in calibration.json. + # benchmark-eval: papers that introduce benchmarks used BY the field. + # They're reference material, not subjects being + # evaluated. Listed in benchmarks.json. calibration_papers = [] + benchmark_papers = [] all_scores = [] cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) year_scores = defaultdict(list) @@ -281,7 +286,10 @@ def build(): reg_entry = registry.get(paper_id, {}) metadata = load_metadata(paper_id) hn_data = load_hn(paper_id) - is_calibration = "reference-benchmark" in (reg_entry.get("tags") or []) + reg_tags = reg_entry.get("tags") or [] + is_reference = "reference-benchmark" in reg_tags + is_benchmark_paper = "benchmark-eval" in reg_tags + is_calibration = is_reference or is_benchmark_paper overall = compute_overall_score(checklist) if overall is None: @@ -499,7 +507,10 @@ def build(): # why each specimen is in the corpus without re-deriving it. 
cal_detail = dict(detail) cal_detail["calibration_notes"] = reg_entry.get("notes", "") - calibration_papers.append(cal_detail) + if is_reference: + calibration_papers.append(cal_detail) + else: # benchmark-eval only (not also reference-benchmark) + benchmark_papers.append(cal_detail) else: papers_full.append(detail) @@ -1281,6 +1292,7 @@ def build(): write_json(OUTPUT_DIR / "network.json", network) write_json(OUTPUT_DIR / "tensions.json", tensions) write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers}) + write_json(OUTPUT_DIR / "benchmarks.json", {"papers": benchmark_papers}) for slug, detail in paper_details.items(): write_json(papers_detail_dir / f"{slug}.json", detail)

Impressum · Datenschutz