ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit 95f484d01c4aded0fbdb7faed0aa7f17b69da21b
parent b4f6f0caa07a8a5d8d382792a236646c772d9b4b
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Mon, 30 Mar 2026 16:10:48 +0200

Filter non-empirical papers from findings, tag in UI

Papers that lack an applicable question in either statistical_methodology
or evaluation_design are classified as non-empirical (159 papers). These are
excluded from all findings aggregations (median, games, tensions,
correlations, year trends). Dashboard now reports empirical-only:
1,047 papers, median 47.2% (was 46.3% mixed).

Non-empirical papers still appear in the papers browser with:
- Score shown in gray with asterisk (e.g., "10.0%*")
- "Non-empirical" badge instead of archetype
- Tooltip explaining limited criteria

Progress bar shows empirical/non-empirical split.
Paper detail pages still show full checklist for all papers.

This is a stopgap until the v4 instrument redesign, which will add
paper-type-specific question panels.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Diffstat:
M explorer/src/data.ts | 3 +++
M explorer/src/views/dashboard.ts | 2 +-
M explorer/src/views/papers.ts | 11 +++++++++--
M explorer/tests/explorer.spec.ts | 12 ++++++------
A papers/mathematical-methods-human-2026/scan.json | 428 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M registry.jsonl | 1 +
M scripts/build-explorer-data.py | 156 ++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
A test-results/.last-run.json | 5 +++++
8 files changed, 543 insertions(+), 75 deletions(-)

diff --git a/explorer/src/data.ts b/explorer/src/data.ts @@ -29,6 +29,7 @@ export interface PaperIndex { doi: string; code_url: string | null; dna: (number | null)[] | null; + paper_type: string | null; engagement: (number | null)[] | null; } @@ -51,6 +52,8 @@ export interface HistBin { export interface Pipeline { registry_total: number; v2_scanned: number; + empirical: number; + non_empirical: number; v3_scanned: number; v2_only: number; v1_needs_rescan: number; diff --git a/explorer/src/views/dashboard.ts b/explorer/src/views/dashboard.ts @@ -27,7 +27,7 @@ function renderProgressBar(p: Pipeline): string { return `<div class="pipeline-bar"> <div class="pipeline-header"> <span class="pipeline-title">Survey Progress</span> - <span class="pipeline-stat">${p.v2_scanned} of ${total} papers scanned (${totalScannedPct}%)${p.v3_scanned ? `, ${p.v3_scanned} with engagement factors` : ''}</span> + <span class="pipeline-stat">${p.v2_scanned} of ${total} scanned (${totalScannedPct}%) — ${p.empirical} empirical, ${p.non_empirical} non-empirical</span> </div> <div class="pipeline-track"> <div class="pipeline-seg v3" style="width:${v3Pct}%" title="V3 scanned (with engagement factors): ${p.v3_scanned}"></div> diff --git a/explorer/src/views/papers.ts b/explorer/src/views/papers.ts @@ -81,9 +81,16 @@ export async function renderPapers(app: HTMLElement) { const columns: Column<PaperIndex>[] = [ { key: 'title', label: 'Title', render: p => p.title.length > 60 ? p.title.slice(0, 57) + '...' : p.title, sortValue: p => p.title }, { key: 'year', label: 'Year', render: p => String(p.year || ''), sortValue: p => p.year || 0 }, - { key: 'score', label: 'Score', render: p => p.score != null ? `<span style="color:${scoreColor(p.score)}">${p.score}%</span>` : '<span style="color:var(--gray)">--</span>', sortValue: p => p.score ?? 
-1 }, + { key: 'score', label: 'Score', render: p => { + if (p.score == null) return '<span style="color:var(--gray)">--</span>'; + if (p.paper_type === 'non-empirical') return `<span style="color:var(--gray)" title="Non-empirical paper — scored on limited criteria">${p.score}%*</span>`; + return `<span style="color:${scoreColor(p.score)}">${p.score}%</span>`; + }, sortValue: p => p.score ?? -1 }, { key: 'dna', label: 'Profile', render: p => renderDna(p.dna, p.engagement), sortValue: p => p.score ?? -1 }, - { key: 'archetype', label: 'Type', render: p => p.archetype ? `<span class="archetype ${p.archetype}">${p.archetype}</span>` : '<span style="color:var(--gray)">--</span>', sortValue: p => p.archetype || '' }, + { key: 'archetype', label: 'Type', render: p => { + if (p.paper_type === 'non-empirical') return '<span class="archetype" style="background:rgba(139,143,163,0.15);color:var(--text-dim)">Non-empirical</span>'; + return p.archetype ? `<span class="archetype ${p.archetype}">${p.archetype}</span>` : '<span style="color:var(--gray)">--</span>'; + }, sortValue: p => p.paper_type === 'non-empirical' ? 
'ZZ' : (p.archetype || '') }, ]; function renderTable() { diff --git a/explorer/tests/explorer.spec.ts b/explorer/tests/explorer.spec.ts @@ -6,8 +6,8 @@ test.describe('Dashboard', () => { await expect(page.locator('.card .value').first()).toBeVisible({ timeout: 10000 }); const cards = page.locator('.card'); await expect(cards).toHaveCount(4); - await expect(cards.nth(0).locator('.value')).toHaveText('1205'); - await expect(cards.nth(1).locator('.value')).toHaveText('46.3%'); + await expect(cards.nth(0).locator('.value')).toHaveText('1047'); + await expect(cards.nth(1).locator('.value')).toHaveText('47.2%'); }); test('shows spinner then content', async ({ page }) => { @@ -56,8 +56,8 @@ test.describe('Papers Browser', () => { test('shows paper table', async ({ page }) => { await page.goto('/#/papers'); await expect(page.locator('table tbody tr').first()).toBeVisible({ timeout: 10000 }); - expect(await page.locator('table tbody tr').count()).toBe(2687); - await expect(page.locator('#f-count')).toHaveText('2687 / 2687'); + expect(await page.locator('table tbody tr').count()).toBe(2688); + await expect(page.locator('#f-count')).toHaveText('2688 / 2688'); }); test('text search filters papers', async ({ page }) => { @@ -66,7 +66,7 @@ test.describe('Papers Browser', () => { await page.fill('#f-search', 'metr'); const count = await page.locator('table tbody tr').count(); expect(count).toBeGreaterThan(0); - expect(count).toBeLessThan(2687); + expect(count).toBeLessThan(2688); }); test('archetype filter works', async ({ page }) => { @@ -75,7 +75,7 @@ test.describe('Papers Browser', () => { await page.selectOption('#f-archetype', 'Complete'); const count = await page.locator('table tbody tr').count(); expect(count).toBeGreaterThan(0); - expect(count).toBeLessThan(2687); + expect(count).toBeLessThan(2688); }); test('clicking row navigates to paper detail', async ({ page }) => { diff --git a/papers/mathematical-methods-human-2026/scan.json 
b/papers/mathematical-methods-human-2026/scan.json @@ -0,0 +1,428 @@ +{ + "paper": { + "title": "Mathematical methods and human thought in the age of AI", + "authors": ["Tanya Klowden", "Terence Tao"], + "year": 2026, + "venue": "arXiv preprint", + "arxiv_id": "2603.26524", + "doi": "" + }, + "scan_version": 3, + "active_modules": [], + "methodology_tags": ["theoretical", "qualitative"], + "key_findings": "The paper argues that AI is a natural evolution of human cognitive tools and that its development must remain human-centered. Using mathematics as a 'sandbox,' the authors examine how formal verification is necessary but insufficient for mathematical knowledge, how AI decouples the outward form of intellectual products from the understanding behind them, and how a 'Copernican' reframing of human intelligence alongside artificial intelligence could guide responsible integration. The paper warns of digital divides, environmental costs, model collapse, and the risk of flooding fields with technically correct but insight-free AI-generated work.", + "checklist": { + "artifacts": { + "code_released": { + "applies": false, + "answer": false, + "justification": "This is a theoretical/philosophical position paper with no computational work. There is no code to release." + }, + "data_released": { + "applies": false, + "answer": false, + "justification": "No data was collected or analyzed. The paper is entirely argumentative and philosophical." + }, + "environment_specified": { + "applies": false, + "answer": false, + "justification": "No computational environment is relevant to a purely theoretical paper." + }, + "reproduction_instructions": { + "applies": false, + "answer": false, + "justification": "No experiments to reproduce. The paper consists of philosophical arguments and historical analysis." 
+ } + }, + "statistical_methodology": { + "confidence_intervals_or_error_bars": { + "applies": false, + "answer": false, + "justification": "No quantitative results are reported. The paper presents philosophical arguments without statistical analysis." + }, + "significance_tests": { + "applies": false, + "answer": false, + "justification": "No comparative empirical claims requiring statistical testing are made." + }, + "effect_sizes_reported": { + "applies": false, + "answer": false, + "justification": "No empirical effects are measured in this theoretical paper." + }, + "sample_size_justified": { + "applies": false, + "answer": false, + "justification": "No sample or data collection. Purely theoretical work." + }, + "variance_reported": { + "applies": false, + "answer": false, + "justification": "No experimental runs are conducted. No quantitative results to report variance for." + } + }, + "evaluation_design": { + "baselines_included": { + "applies": false, + "answer": false, + "justification": "No system or method is evaluated. This is a philosophical essay, not an empirical evaluation." + }, + "baselines_contemporary": { + "applies": false, + "answer": false, + "justification": "No baselines are relevant for a theoretical position paper." + }, + "ablation_study": { + "applies": false, + "answer": false, + "justification": "No system with components to ablate. The paper is a philosophical argument." + }, + "multiple_metrics": { + "applies": false, + "answer": false, + "justification": "No metrics are used. The paper makes no quantitative evaluations." + }, + "human_evaluation": { + "applies": false, + "answer": false, + "justification": "No system outputs to evaluate. The paper is a theoretical discussion." + }, + "held_out_test_set": { + "applies": false, + "answer": false, + "justification": "No test sets or data splits. Purely theoretical work." 
+ }, + "per_category_breakdown": { + "applies": false, + "answer": false, + "justification": "No categories of results to break down. The paper presents arguments, not measurements." + }, + "failure_cases_discussed": { + "applies": false, + "answer": false, + "justification": "No system is proposed whose failures could be analyzed. The paper discusses AI weaknesses in general terms but has no evaluation." + }, + "negative_results_reported": { + "applies": false, + "answer": false, + "justification": "No experiments to yield positive or negative results." + } + }, + "claims_and_evidence": { + "abstract_claims_supported": { + "applies": true, + "answer": true, + "justification": "The abstract claims that AI is a natural evolution of human tools, that development should be human-centered, and proposes a pathway for integration. Sections 2 (historical parallels), 5 (costs and benefits), and 6 (human/AI interface) develop each of these arguments at length. The abstract's claims are philosophical positions that are elaborated and argued throughout the paper." + }, + "causal_claims_justified": { + "applies": true, + "answer": false, + "justification": "The paper makes numerous causal assertions: 'strict regulation imposed at this point would disproportionately shut down the more positive use cases of AI' (Section 1.3), 'AI technologies... have dramatically shifted social, intellectual, and economic spheres' (Section 7), and AI could 'crowd out the more traditional paradigms' (Section 4.6). These are stated as arguments from analogy and assertion, without empirical evidence or causal identification strategies." + }, + "generalization_bounded": { + "applies": true, + "answer": false, + "justification": "While the paper acknowledges mathematics as a 'sandbox' (Section 3), it regularly generalizes to 'all humankind' and 'society as a whole' (abstract, Section 5.1, conclusion). 
The title itself — 'human thought in the age of AI' — claims scope far beyond the mathematical case study. The paper does not bound its philosophical conclusions to the mathematical domain from which most of its arguments are drawn." + }, + "alternative_explanations_discussed": { + "applies": false, + "answer": false, + "justification": "The paper presents no empirical results. It is a philosophical position paper, so alternative explanations for observed data are not applicable." + }, + "proxy_outcome_distinction": { + "applies": false, + "answer": false, + "justification": "No measurements or proxies are used. This is a theoretical paper." + } + }, + "setup_transparency": { + "model_versions_specified": { + "applies": false, + "answer": false, + "justification": "No AI models are used or evaluated in this paper. The paper discusses AI in general philosophical terms." + }, + "prompts_provided": { + "applies": false, + "answer": false, + "justification": "No prompting is done in this paper." + }, + "hyperparameters_reported": { + "applies": false, + "answer": false, + "justification": "No experiments with hyperparameters. Purely theoretical work." + }, + "scaffolding_described": { + "applies": false, + "answer": false, + "justification": "No agentic scaffolding is used in this paper." + }, + "data_preprocessing_documented": { + "applies": false, + "answer": false, + "justification": "No data is collected or preprocessed. The paper is a philosophical essay." + } + }, + "limitations_and_scope": { + "limitations_section_present": { + "applies": true, + "answer": false, + "justification": "There is no dedicated limitations section. 
The paper contains scattered hedges such as 'we of course do not pretend to have definitive resolutions to any of them; and the speed of change in this space is such that any proclamations we make are at risk of being overtaken by striking new technological advances' (Section 3), but these are not collected in a substantive limitations discussion." + }, + "threats_to_validity_specific": { + "applies": true, + "answer": false, + "justification": "No specific threats to the validity of the paper's own arguments are discussed. The paper acknowledges that AI is changing rapidly but does not identify specific ways in which its philosophical positions or analogies might be wrong or misleading." + }, + "scope_boundaries_stated": { + "applies": true, + "answer": false, + "justification": "The paper does not explicitly state what it does NOT claim. While it notes that mathematics is used as a 'sandbox' (Section 3) and hedges with 'we of course do not pretend to have definitive resolutions,' it does not list specific exclusions, untested scenarios, or things the reader should not conclude from the paper." + } + }, + "data_integrity": { + "raw_data_available": { + "applies": false, + "answer": false, + "justification": "No data is collected or analyzed. The paper is a purely argumentative work." + }, + "data_collection_described": { + "applies": false, + "answer": false, + "justification": "No data collection occurs in this theoretical paper." + }, + "recruitment_methods_described": { + "applies": false, + "answer": false, + "justification": "No participants or data sources are recruited. This is a philosophical essay." + }, + "data_pipeline_documented": { + "applies": false, + "answer": false, + "justification": "No data pipeline exists for this theoretical paper." 
+ } + }, + "conflicts_of_interest": { + "funding_disclosed": { + "applies": true, + "answer": false, + "justification": "The Acknowledgments section (Section 7.1) thanks Silvia de Toffoli for comments but does not mention any funding sources. No funding disclosure is present." + }, + "affiliations_disclosed": { + "applies": true, + "answer": false, + "justification": "The paper lists author names but no institutional affiliations are visible in the text. The authors' academic positions and departments are not stated." + }, + "funder_independent_of_outcome": { + "applies": true, + "answer": false, + "justification": "No funding is disclosed, so independence cannot be assessed. The paper discusses AI companies and their practices but does not clarify whether the authors have any financial relationships with such entities." + }, + "financial_interests_declared": { + "applies": true, + "answer": false, + "justification": "No competing interests statement or financial disclosure is present in the paper." + } + }, + "contamination": { + "training_cutoff_stated": { + "applies": false, + "answer": false, + "justification": "The paper does not evaluate any pre-trained model on any benchmark. It is a philosophical discussion of AI." + }, + "train_test_overlap_discussed": { + "applies": false, + "answer": false, + "justification": "No model evaluation occurs in this paper." + }, + "benchmark_contamination_addressed": { + "applies": false, + "answer": false, + "justification": "No benchmark evaluation is performed." + } + }, + "human_studies": { + "pre_registered": { + "applies": false, + "answer": false, + "justification": "No human participants. This is a theoretical paper." + }, + "irb_or_ethics_approval": { + "applies": false, + "answer": false, + "justification": "No human participants. Purely philosophical work." + }, + "demographics_reported": { + "applies": false, + "answer": false, + "justification": "No human participants in this theoretical paper." 
+ }, + "inclusion_exclusion_criteria": { + "applies": false, + "answer": false, + "justification": "No human participants." + }, + "randomization_described": { + "applies": false, + "answer": false, + "justification": "No experimental study with participants." + }, + "blinding_described": { + "applies": false, + "answer": false, + "justification": "No experimental study with participants." + }, + "attrition_reported": { + "applies": false, + "answer": false, + "justification": "No human participants." + } + }, + "cost_and_practicality": { + "inference_cost_reported": { + "applies": false, + "answer": false, + "justification": "Purely theoretical paper with no computational method to cost." + }, + "compute_budget_stated": { + "applies": false, + "answer": false, + "justification": "No computation performed. The paper is a philosophical essay." + } + } + }, + "claims": [ + { + "claim": "AI is a natural evolution of human tools developed throughout history to facilitate the creation, organization, and dissemination of ideas.", + "evidence": "Section 2 draws historical parallels with the printing press, the Industrial Revolution, and prior automation technologies, arguing that AI extends a continuous tradition of tool-building (Section 2.1).", + "supported": "moderate" + }, + { + "claim": "Mathematics serves as a suitable 'sandbox' for exploring the broader impact of AI on sciences and society.", + "evidence": "Section 3 argues mathematics has 'an older and more advanced foundation, and is by its nature well suited to explore a variety of hypothetical abstract scenarios.' 
The paper uses mathematics' objective standards of proof as a testbed for AI-related philosophical questions.", + "supported": "moderate" + }, + { + "claim": "Formal verification is necessary but insufficient for evaluating AI-generated mathematical knowledge; proofs can be 'odorless' — technically correct but devoid of insight.", + "evidence": "Section 4.4 argues that formalization 'only certifies that a formalized argument establishes a formal mathematical statement' and that AI can produce proofs that 'may even pass formal verification tests, but yet remain strangely unsatisfying.' Examples include AlphaProof's IMO solutions with 'numerous redundant or inexplicable steps' (footnote 16).", + "supported": "moderate" + }, + { + "claim": "Strict regulation at this point would disproportionately shut down positive AI use cases without eliminating wasteful or malicious ones.", + "evidence": "Asserted in Section 1.3 without empirical evidence. Stated as: 'strict regulation imposed at this point would disproportionately shut down the more positive use cases of AI, such as in the acceleration of scientific research, without eliminating the more wasteful or malicious uses of the technology.'", + "supported": "weak" + }, + { + "claim": "AI technologies exacerbate existing inequalities by creating a 'digital divide' between those with and without access to frontier models.", + "evidence": "Section 5.3 discusses how 'large scale AI tools may only be available to well-financed or well-connected research groups' and identifies a second divide where different models develop 'spiky capabilities,' creating uneven advantages. 
Draws on analogy to historical inequities but provides no data on current AI access disparities.", + "supported": "weak" + }, + { + "claim": "An uncritical embrace of AI in mathematics could produce a flood of technically correct but insight-free papers that contribute nothing to broader mathematical narratives.", + "evidence": "Section 4.6 argues this via reasoning about incentives and reduced cost of proof generation, noting the 'negative impressions produced by such low-quality work may lead to a stigma against even the most careful and responsible application of AI.' No empirical evidence for this scenario.", + "supported": "weak" + }, + { + "claim": "A 'Copernican' reframing — placing human and artificial intelligence in the same ontological category — offers a productive philosophical middle ground.", + "evidence": "Section 6.4 develops the analogy between the geocentric-to-heliocentric shift in astronomy and the current need to rethink human cognitive uniqueness. Draws parallel to chess, where human players thrive alongside engines. The argument is by analogy rather than evidence.", + "supported": "weak" + } + ], + "red_flags": [ + { + "flag": "Claims significantly outrun the evidence", + "detail": "The paper makes sweeping prescriptive claims about global AI policy, economic impacts, and the future of all intellectual disciplines ('benefit of all humankind') based primarily on philosophical argument from analogy and the authors' personal experiences. No empirical evidence, case studies, or systematic analysis supports the broad normative conclusions." + }, + { + "flag": "No structured methodology", + "detail": "For a paper that aims to provide 'a pathway to integrating AI into our most challenging and intellectually rigorous fields,' there is no structured framework for evaluation. The arguments draw on historical analogies (Luddites, printing press, Wikipedia) without systematic analysis of whether these analogies hold. 
The paper acknowledges no method for how its claims could be tested or falsified." + } + ], + "cited_papers": [ + { + "title": "Highly accurate protein structure prediction with AlphaFold", + "authors": ["J. Jumper", "R. Evans", "A. Pritzel"], + "year": 2021, + "relevance": "Landmark AI application in science — demonstrates AI capabilities leading to Nobel Prize-winning research." + }, + { + "title": "Autoformalization with Large Language Models", + "authors": ["Y. Wu", "A. Q. Jiang", "W. Li", "M. Rabe", "C. Staats", "M. Jamnik", "C. Szegedy"], + "year": 2022, + "relevance": "Directly relevant to AI capability in mathematics — LLMs translating informal math to formal proofs." + }, + { + "title": "AI achieves silver-medal standard solving International Mathematical Olympiad problems", + "authors": ["DeepMind"], + "year": 2024, + "relevance": "AlphaProof benchmark evaluation — AI system solving competitive mathematics problems with formal verification." + }, + { + "title": "Early science acceleration experiments with GPT-5", + "authors": ["S. Bubeck", "C. Coester", "R. Eldan", "T. Gowers", "Y. T. Lee"], + "year": 2025, + "relevance": "Evaluates frontier LLM capability on scientific research tasks including solving open mathematical problems." + }, + { + "title": "AI models collapse when trained on recursively generated data", + "authors": ["I. Shumailov", "Z. Shumaylov", "Y. Zhao", "N. Papernot", "R. Anderson", "Y. Gal"], + "year": 2024, + "relevance": "Directly relevant to AI evaluation methodology — demonstrates model collapse from training on AI-generated data." + }, + { + "title": "A Turing test of whether AI chatbots are behaviorally similar to humans", + "authors": ["Q. Mei", "Y. Xie", "W. Yuan", "M. O. Jackson"], + "year": 2024, + "relevance": "Empirical evaluation of LLM capabilities — behavioral comparison between AI chatbots and humans." + }, + { + "title": "Algorithm and abstraction in formal mathematics", + "authors": ["H. 
Macbeth"], + "year": 2024, + "relevance": "Discusses restructuring mathematical proofs for AI-assisted verification, directly relevant to AI in mathematics." + }, + { + "title": "Some thoughts on automation and mathematical research", + "authors": ["A. Venkatesh"], + "year": 2024, + "relevance": "Perspective on AI's impact on mathematical research from a Fields Medalist, relevant to AI capability assessment." + }, + { + "title": "Hard Proofs and Good Reasons", + "authors": ["S. DeDeo"], + "year": 2024, + "relevance": "Discusses quality and meaning of AI-generated mathematical proofs, relevant to evaluation of AI in intellectual work." + }, + { + "title": "The AI gambit: Leveraging artificial intelligence to combat climate change", + "authors": ["J. Cowls", "A. Tsamados", "M. Taddeo", "L. Floridi"], + "year": 2023, + "relevance": "Discusses AI applications for climate change, relevant to cost-benefit analysis of AI deployment." + } + ], + "engagement_factors": { + "practical_relevance": { + "score": 0, + "justification": "Pure philosophical reflection with no tools, techniques, or methods that a practitioner could apply." + }, + "surprise_contrarian": { + "score": 1, + "justification": "The Copernican framing of AI intelligence is a novel metaphor, but the overall positions (AI should be human-centered, has costs and benefits) are mainstream." + }, + "fear_safety": { + "score": 1, + "justification": "Mentions existential risks, model collapse, digital divide, and environmental costs, but does not present novel threats or demonstrate specific attacks." + }, + "drama_conflict": { + "score": 1, + "justification": "References the Faustian bargain and Luddite parallels, and critiques unchecked AI development, but avoids direct confrontation with specific companies or claims." + }, + "demo_ability": { + "score": 0, + "justification": "No code, tool, or demo. The paper is entirely discursive." 
+ }, + "brand_recognition": { + "score": 3, + "justification": "Terence Tao is a Fields Medalist and one of the most recognized mathematicians alive; his name alone drives significant attention to any paper he co-authors." + } + } +} diff --git a/registry.jsonl b/registry.jsonl @@ -2686,3 +2686,4 @@ {"id": "reproducibility-ml-overview-2025", "title": "Reproducibility in Machine-Learning-based Research: Overview, Barriers and Drivers", "authors": ["Semmelrock et al."], "year": 2025, "venue": "AI Magazine", "arxiv_id": "2406.14325", "source": "manual", "status": "scanned", "tags": ["meta", "reproducibility"], "notes": "Recent overview synthesizing reproducibility barriers and drivers in ML.", "directory": "papers/reproducibility-ml-overview-2025"} {"id": "trust-ai-benchmarks-2025", "title": "Can We Trust AI Benchmarks? An Interdisciplinary Review of Current Issues in AI Evaluation", "authors": ["European Commission"], "year": 2025, "venue": "Preprint", "arxiv_id": "2502.06559", "source": "manual", "status": "scanned", "tags": ["meta", "benchmarks", "evaluation"], "notes": "Interdisciplinary review of construct validity, contamination, and metric gaming in AI eval.", "directory": "papers/trust-ai-benchmarks-2025"} {"id": "ioannidis-why-most-2005", "title": "Why Most Published Research Findings Are False", "authors": ["John P.A. Ioannidis"], "year": 2005, "venue": "PLOS Medicine", "doi": "10.1371/journal.pmed.0020124", "source": "manual", "status": "queued", "tags": ["meta", "reference-benchmark", "foundational"], "notes": "Foundational meta-science paper. 
Not ML-specific but intellectual ancestor of all ML reproducibility work."} +{"id": "mathematical-methods-human-2026", "title": "Mathematical methods and human thought in the age of AI", "arxiv_id": "2603.26524", "year": 2026, "source": "manual", "status": "scanned", "doi": "", "source_url": "https://arxiv.org/abs/2603.26524", "tags": []} diff --git a/scripts/build-explorer-data.py b/scripts/build-explorer-data.py @@ -276,6 +276,17 @@ def build(): if overall is None: continue + # Classify paper type: empirical if both stats and eval have applicable questions + def _has_applicable(cat_name): + cd = checklist.get(cat_name, {}) + if not isinstance(cd, dict): + return False + return any(isinstance(qd, dict) and qd.get("applies", False) + for qd in cd.values()) + + is_empirical = _has_applicable("statistical_methodology") and _has_applicable("evaluation_design") + paper_type = "empirical" if is_empirical else "non-empirical" + cat_scores = {} for cat in ALL_CATEGORIES: cat_data = checklist.get(cat, {}) @@ -284,16 +295,14 @@ def build(): if cs is not None: cat_scores[cat] = cs - total_papers += 1 score_pct = round(overall * 100, 1) - all_scores.append(score_pct) score_map[paper_id] = score_pct year = paper_meta.get("year") or reg_entry.get("year") venue = metadata.get("venue") or paper_meta.get("venue") or reg_entry.get("venue", "") tags = scan.get("methodology_tags", []) or reg_entry.get("tags", []) - archetype = classify_archetype(cat_scores) - games = detect_games(checklist, overall, cat_scores) + archetype = classify_archetype(cat_scores) if is_empirical else None + games = detect_games(checklist, overall, cat_scores) if is_empirical else [] # External links arxiv_id = paper_meta.get("arxiv_id") or reg_entry.get("arxiv_id", "") @@ -303,70 +312,79 @@ def build(): # Code URL extraction code_url = extract_code_url(checklist) - year_scores[year].append(score_pct) - for t in tags: - tag_counts[t] += 1 - archetype_counts[archetype] += 1 - for g in games: - game_counts[g] 
+= 1 - - # Category + question aggregations - for cat in ALL_CATEGORIES: - cat_data = checklist.get(cat, {}) - if not isinstance(cat_data, dict): - continue - for q_name, q_data in cat_data.items(): - if not isinstance(q_data, dict) or "applies" not in q_data: - continue - if q_data["applies"]: - cat_pass_counts[cat]["applicable"] += 1 - question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1 - if q_data.get("answer", False): - cat_pass_counts[cat]["passed"] += 1 - question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1 - # Year × category - if year: - year_cat_scores[year][cat]["applicable"] += 1 - if q_data.get("answer", False): - year_cat_scores[year][cat]["passed"] += 1 - - # Venue scoring (skip arXiv — it's a preprint server, not a venue) - venue_clean = venue.strip() - if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""): - venue_scores[venue_clean].append(score_pct) - - # Citation band scoring - cit = metadata.get("citation_count") - if cit is not None: - if cit == 0: - band = "0" - elif cit <= 50: - band = "1-50" - elif cit <= 500: - band = "51-500" - else: - band = "500+" - citation_band_scores[band].append(score_pct) - - # Benchmark monoculture - if year: - benchmark_only_by_year[year]["total"] += 1 - if tags == ["benchmark-eval"]: - benchmark_only_by_year[year]["benchmark_only"] += 1 - - # Funding gap - fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {}) - if fd.get("applies"): - if fd.get("answer"): - funding_groups["disclosed"].append(score_pct) - else: - funding_groups["not_disclosed"].append(score_pct) + # Only empirical papers feed into findings aggregations + if is_empirical: + total_papers += 1 + all_scores.append(score_pct) + year_scores[year].append(score_pct) + for t in tags: + tag_counts[t] += 1 + archetype_counts[archetype] += 1 + for g in games: + game_counts[g] += 1 claims = scan.get("claims", []) red_flags = scan.get("red_flags", []) - # Tension classification - for claim in claims: + # All 
remaining aggregations are empirical-only + if not is_empirical: + pass # skip to index/detail construction below + else: + # Category + question aggregations + for cat in ALL_CATEGORIES: + cat_data = checklist.get(cat, {}) + if not isinstance(cat_data, dict): + continue + for q_name, q_data in cat_data.items(): + if not isinstance(q_data, dict) or "applies" not in q_data: + continue + if q_data["applies"]: + cat_pass_counts[cat]["applicable"] += 1 + question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1 + if q_data.get("answer", False): + cat_pass_counts[cat]["passed"] += 1 + question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1 + # Year × category + if year: + year_cat_scores[year][cat]["applicable"] += 1 + if q_data.get("answer", False): + year_cat_scores[year][cat]["passed"] += 1 + + # Venue scoring + venue_clean = venue.strip() + if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""): + venue_scores[venue_clean].append(score_pct) + + # Citation band scoring + cit = metadata.get("citation_count") + if cit is not None: + if cit == 0: + band = "0" + elif cit <= 50: + band = "1-50" + elif cit <= 500: + band = "51-500" + else: + band = "500+" + citation_band_scores[band].append(score_pct) + + # Benchmark monoculture + if year: + benchmark_only_by_year[year]["total"] += 1 + if tags == ["benchmark-eval"]: + benchmark_only_by_year[year]["benchmark_only"] += 1 + + # Funding gap + fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {}) + if fd.get("applies"): + if fd.get("answer"): + funding_groups["disclosed"].append(score_pct) + else: + funding_groups["not_disclosed"].append(score_pct) + + # Tension classification (empirical papers only) + if is_empirical: + for claim in claims: ct = claim.get("claim", "").lower() entry = {"paper_id": paper_id, "claim": claim["claim"], "supported": claim.get("supported", ""), "score": score_pct, "year": year} @@ -439,6 +457,7 @@ def build(): "doi": doi, "code_url": code_url, "dna": dna, + 
"paper_type": paper_type, "hn_points": hn_data.get("top_points", 0), "engagement": [scan.get("engagement_factors", {}).get(d, {}).get("score") for d in ["practical_relevance", "surprise_contrarian", "fear_safety", @@ -507,16 +526,20 @@ def build(): if s.get("scan_version", 1) < 2: v1_count += 1 + total_scanned = len(papers_full) + non_empirical_count = total_scanned - total_papers v3_count = sum(1 for p in papers_full if p.get("engagement_factors") is not None) v2_only = total_papers - v3_count pipeline = { "registry_total": reg_total, - "v2_scanned": total_papers, + "v2_scanned": total_scanned, + "empirical": total_papers, + "non_empirical": non_empirical_count, "v3_scanned": v3_count, "v2_only": v2_only, "v1_needs_rescan": v1_count, - "has_text_no_scan": has_text - total_papers - v1_count, + "has_text_no_scan": has_text - total_scanned - v1_count, "no_text": reg_total - has_text, "excluded": reg_status.get("excluded", 0), } @@ -1218,6 +1241,7 @@ def build(): "doi": entry.get("doi", ""), "code_url": None, "dna": None, + "paper_type": None, "hn_points": 0, "engagement": None, }) diff --git a/test-results/.last-run.json b/test-results/.last-run.json @@ -0,0 +1,4 @@ +{ + "status": "failed", + "failedTests": [] +} +\ No newline at end of file

Impressum · Datenschutz