ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (16897B)


      1 {
      2   "paper": {
      3     "title": "Realist and Pluralist Conceptions of Intelligence and Their Implications on AI Research",
      4     "authors": ["Ninell Oldenburg", "Ruchira Dhar", "Anders Søgaard"],
      5     "year": 2025,
      6     "venue": "AAAI 2026",
      7     "arxiv_id": "2511.15282",
      8     "doi": "10.48550/arXiv.2511.15282"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical"],
     13   "key_findings": "The paper identifies two implicit conceptions of intelligence underlying AI research—Intelligence Realism (single universal capacity) and Intelligence Pluralism (diverse, context-dependent capacities)—and shows how they generate different methodological approaches, contradictory interpretations of the same empirical phenomena (e.g., scaling laws, emergent capabilities), and categorically different AI risk assessments. The authors argue that making these assumptions explicit can clarify when disagreements in AI research are empirical vs. philosophical.",
     14   "claims": [
     15     {
     16       "claim": "Current AI research operates on a spectrum between Intelligence Realism and Intelligence Pluralism, and these positions remain largely implicit.",
     17       "evidence": "The paper provides examples from the literature (AIXI, BIG-Bench, ARC-AGI, Sparks of AGI, Mitchell 2021) showing how methodological choices and interpretive conclusions map onto realist vs. pluralist assumptions without researchers explicitly acknowledging them.",
     18       "supported": "moderate"
     19     },
     20     {
     21       "claim": "Realist and pluralist assumptions generate fundamentally different research approaches across methodology, interpretation, and AI risk assessment.",
     22       "evidence": "Sections on Methodology, Interpretation, and Alignment provide parallel analyses showing how each position leads to different benchmark designs, model selection strategies, failure attributions, scaling law interpretations, and governance approaches. Supported by extensive citation of contrasting positions (e.g., Bubeck et al. 2023 vs. Mitchell 2021).",
     23       "supported": "moderate"
     24     },
     25     {
     26       "claim": "Making these philosophical commitments explicit can contribute to clearer scientific discourse and more productive policy debates.",
     27       "evidence": "Stated in the abstract and conclusion, but the paper offers no empirical demonstration that explicit framing actually improves discourse. It provides a diagnostic rubric (Table 1) as a tool but does not test its effectiveness.",
     28       "supported": "weak"
     29     }
     30   ],
     31   "red_flags": [
     32     {
     33       "flag": "No empirical validation of the framework",
     34       "detail": "The paper proposes a realism-pluralism spectrum but does not empirically test whether researchers actually hold these positions, whether the spectrum captures meaningful variation, or whether making assumptions explicit improves discourse. The framework is illustrated with examples but not validated."
     35     },
     36     {
     37       "flag": "Selective citation of exemplars",
     38       "detail": "The paper selects prominent examples to illustrate each position (AIXI for realism, octopus cognition for pluralism) but does not systematically survey the field to determine how prevalent each position actually is. The examples may not be representative."
     39     }
     40   ],
     41   "checklist": {
     42     "artifacts": {
     43       "code_released": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Purely theoretical/philosophical paper with no code or computational artifacts to release."
     47       },
     48       "data_released": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No dataset is collected or analyzed. The paper is a conceptual analysis of existing literature."
     52       },
     53       "environment_specified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No computational experiments are run; no environment is needed."
     57       },
     58       "reproduction_instructions": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experiments to reproduce. The paper is a philosophical argument."
     62       }
     63     },
     64     "statistical_methodology": {
     65       "confidence_intervals_or_error_bars": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No quantitative results are reported. This is a theoretical analysis."
     69       },
     70       "significance_tests": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No statistical comparisons are made."
     74       },
     75       "effect_sizes_reported": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No quantitative effects to report."
     79       },
     80       "sample_size_justified": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No sample is collected; purely theoretical paper."
     84       },
     85       "variance_reported": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experimental runs to report variance across."
     89       }
     90     },
     91     "evaluation_design": {
     92       "baselines_included": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No system or method is evaluated; the paper proposes a conceptual framework."
     96       },
     97       "baselines_contemporary": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "No baselines are applicable to a theoretical paper."
    101       },
    102       "ablation_study": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "No system with components to ablate."
    106       },
    107       "multiple_metrics": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "No metrics are used; no evaluation is conducted."
    111       },
    112       "human_evaluation": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "No system outputs to evaluate."
    116       },
    117       "held_out_test_set": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "No data splits; purely theoretical."
    121       },
    122       "per_category_breakdown": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "No quantitative results to break down."
    126       },
    127       "failure_cases_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses limitations of both the realist and pluralist positions, including the pluralist 'analytical vacuity' problem (if everything is intelligent, the term loses discriminatory power) and realist difficulties with distributed cognition (octopus arms)."
    131       },
    132       "negative_results_reported": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No experiments that could yield negative results."
    136       }
    137     },
    138     "claims_and_evidence": {
    139       "abstract_claims_supported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The abstract claims the paper demonstrates how realist/pluralist conceptions shape methodology, interpretation, and risk assessment. The paper body provides extensive analysis across all three areas with literature examples supporting each point."
    143       },
    144       "causal_claims_justified": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not make causal claims. It argues that positions 'shape' and 'generate' different approaches, but these are analytical/philosophical claims about conceptual entailment, not empirical causal claims."
    148       },
    149       "generalization_bounded": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper explicitly states that the realism-pluralism distinction 'should be seen as a continuum rather than a hard classification,' acknowledges intermediate positions, and states 'We do not claim to resolve the realism-pluralism debate' in the conclusion."
    153       },
    154       "alternative_explanations_discussed": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The paper systematically presents counterarguments from each position against the other. For each realist claim, pluralist rebuttals are given, and vice versa. The intermediate positions section explicitly discusses how researchers may hold mixed views."
    158       },
    159       "proxy_outcome_distinction": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No measurements or proxies are used; the paper is purely theoretical."
    163       }
    164     },
    165     "setup_transparency": {
    166       "model_versions_specified": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No models are used in experiments."
    170       },
    171       "prompts_provided": {
    172         "applies": false,
    173         "answer": false,
    174         "justification": "No prompting is used."
    175       },
    176       "hyperparameters_reported": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "No computational experiments."
    180       },
    181       "scaffolding_described": {
    182         "applies": false,
    183         "answer": false,
    184         "justification": "No agentic scaffolding is used."
    185       },
    186       "data_preprocessing_documented": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No data is collected or preprocessed."
    190       }
    191     },
    192     "limitations_and_scope": {
    193       "limitations_section_present": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No dedicated limitations section. The conclusion acknowledges the debate is unresolved but does not substantively discuss limitations of the framework itself."
    197       },
    198       "threats_to_validity_specific": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No specific threats to validity are discussed. The paper does not address whether its categorization might misrepresent researchers' actual views or whether the realism-pluralism axis is the most useful way to carve the disagreements."
    202       },
    203       "scope_boundaries_stated": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The conclusion explicitly states: 'We do not claim to resolve the realism-pluralism debate — that would require extensive empirical and philosophical work beyond this paper's scope.' The paper also clarifies it is a 'targeted synthesis' rather than a new distinction."
    207       }
    208     },
    209     "data_integrity": {
    210       "raw_data_available": {
    211         "applies": false,
    212         "answer": false,
    213         "justification": "No data is collected; the paper analyzes published literature through philosophical argument."
    214       },
    215       "data_collection_described": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No systematic data collection; the paper cites literature to illustrate its argument."
    219       },
    220       "recruitment_methods_described": {
    221         "applies": false,
    222         "answer": false,
    223         "justification": "No participants or systematic sample to recruit."
    224       },
    225       "data_pipeline_documented": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "No data pipeline; purely argumentative paper."
    229       }
    230     },
    231     "conflicts_of_interest": {
    232       "funding_disclosed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No funding information is provided anywhere in the paper."
    236       },
    237       "affiliations_disclosed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Author affiliations are clearly listed: Department of Philosophy and Department of Computer Science, University of Copenhagen."
    241       },
    242       "funder_independent_of_outcome": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No funding disclosed; cannot assess independence. Likely unfunded academic work."
    246       },
    247       "financial_interests_declared": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No competing interests or financial interests statement is provided."
    251       }
    252     },
    253     "contamination": {
    254       "training_cutoff_stated": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No pre-trained model is evaluated on any benchmark."
    258       },
    259       "train_test_overlap_discussed": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No model evaluation on benchmarks."
    263       },
    264       "benchmark_contamination_addressed": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No benchmark evaluation."
    268       }
    269     },
    270     "human_studies": {
    271       "pre_registered": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "irb_or_ethics_approval": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "demographics_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "inclusion_exclusion_criteria": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "randomization_described": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "blinding_described": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       },
    301       "attrition_reported": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants."
    305       }
    306     },
    307     "cost_and_practicality": {
    308       "inference_cost_reported": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "Theoretical paper; no computational method with costs to report."
    312       },
    313       "compute_budget_stated": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No computation performed."
    317       }
    318     }
    319   },
    320   "cited_papers": [
    321     {
    322       "title": "On the measure of intelligence",
    323       "authors": ["François Chollet"],
    324       "year": 2019,
    325       "arxiv_id": "1911.01547",
    326       "relevance": "Proposes ARC benchmark as a measure of general fluid intelligence; directly relevant to AI capability evaluation methodology."
    327     },
    328     {
    329       "title": "Sparks of artificial general intelligence: Early experiments with gpt-4",
    330       "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan"],
    331       "year": 2023,
    332       "relevance": "High-profile claims about GPT-4 as early AGI; exemplifies realist interpretation of LLM capabilities."
    333     },
    334     {
    335       "title": "Are emergent abilities of large language models a mirage?",
    336       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    337       "year": 2023,
    338       "relevance": "Challenges the concept of emergent LLM abilities, arguing that they are measurement artifacts; directly relevant to capability evaluation methodology."
    339     },
    340     {
    341       "title": "Constitutional AI: Harmlessness from AI feedback",
    342       "authors": ["Yuntao Bai"],
    343       "year": 2022,
    344       "arxiv_id": "2212.08073",
    345       "relevance": "Foundational AI alignment approach discussed in the paper as reflecting realist assumptions about trainable value systems."
    346     },
    347     {
    348       "title": "Risks from learned optimization in advanced machine learning systems",
    349       "authors": ["Evan Hubinger"],
    350       "year": 2019,
    351       "arxiv_id": "1906.01820",
    352       "relevance": "Mesa-optimization and deceptive alignment concerns relevant to AI safety research methodology."
    353     },
    354     {
    355       "title": "AI and the everything in the whole wide world benchmark",
    356       "authors": ["Inioluwa Deborah Raji", "Emily M. Bender", "Amandalynne Paullada"],
    357       "year": 2021,
    358       "arxiv_id": "2111.15366",
    359       "relevance": "Critiques universal AI benchmarking practices from a pluralist perspective; relevant to evaluation methodology."
    360     },
    361     {
    362       "title": "Big-bench extra hard",
    363       "authors": ["Mehran Kazemi"],
    364       "year": 2025,
    365       "arxiv_id": "2502.19187",
    366       "relevance": "Major multi-task LLM benchmark exemplifying aggregated evaluation approaches."
    367     },
    368     {
    369       "title": "Concrete problems in AI safety",
    370       "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt"],
    371       "year": 2016,
    372       "arxiv_id": "1606.06565",
    373       "relevance": "Foundational AI safety paper discussed as reflecting realist alignment assumptions."
    374     },
    375     {
    376       "title": "Emergent abilities of large language models",
    377       "authors": ["Jason Wei"],
    378       "year": 2022,
    379       "arxiv_id": "2206.07682",
    380       "relevance": "Influential paper on capability emergence in LLMs; central to the realist-pluralist interpretive debate."
    381     },
    382     {
    383       "title": "Stop Anthropomorphizing Intermediate Tokens as Reasoning/Thinking Traces!",
    384       "authors": ["Subbarao Kambhampati"],
    385       "year": 2025,
    386       "arxiv_id": "2504.09762",
    387       "relevance": "Challenges realist interpretation of LLM reasoning capabilities; relevant to AI capability claims methodology."
    388     }
    389   ]
    390 }

Impressum · Datenschutz