scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18449B)
      1 {
      2   "paper": {
      3     "title": "On the Measure of Intelligence",
      4     "authors": ["François Chollet"],
      5     "year": 2019,
      6     "venue": "arXiv",
      7     "arxiv_id": "1911.01547"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The ARC dataset is released at github.com/fchollet/ARC, as stated in Section III.1.1."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The ARC dataset (400 training tasks, 400 public evaluation tasks, 200 private evaluation tasks) is publicly available at github.com/fchollet/ARC."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a theoretical paper with a dataset release. There is no computational environment to specify — the dataset consists of JSON task files."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No computational experiments are run that would require reproduction instructions. The paper is theoretical with a benchmark proposal."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "The paper does not report quantitative experimental results. It is a theoretical paper proposing definitions and a benchmark."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No comparative empirical claims are made that would require significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No empirical experiments are conducted."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical experiments with samples are conducted."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical experiments are conducted."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "The paper does not evaluate any system's performance. It proposes a benchmark and theoretical framework."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No system evaluation is performed."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system is proposed that could be ablated."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No empirical evaluation is conducted."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper claims ARC is 'fully solvable by humans' and that 'each task included in ARC has been successfully solved by at least one member of a group of three high-IQ humans,' but no formal human evaluation with reported metrics or systematic study is presented. The paper acknowledges this as a weakness: 'we hope to be able to further investigate human performance on ARC by gathering a statistically significant amount of human testing data.'"
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No model evaluation is conducted. The benchmark itself has a private evaluation set, but no system is tested on it."
     89       },
     90       "per_category_breakdown": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No empirical results are reported."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No system is evaluated, so no failure cases are discussed. The paper does discuss weaknesses of the ARC benchmark itself (Section III.2) but not failure cases of any evaluated system."
     99       },
    100       "negative_results_reported": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "No experiments are conducted to produce positive or negative results."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's claims are primarily theoretical (proposing a definition, proposing a benchmark) and argumentative (skill measurement is insufficient). The paper delivers on these: it provides the formal definition in Section II.2, critiques skill-based evaluation in II.1, and presents ARC in Section III."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes theoretical arguments and proposals, not causal empirical claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is explicit about the scope of its claims. Section II.1.2 argues intelligence is tied to a specific scope, and Section III.2 lists known weaknesses. The paper states 'Our claims are highly speculative and may well prove fully incorrect' (III.1.4) and 'ARC is a work in progress, not a definitive solution' (III.2)."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper extensively discusses alternative views of intelligence (task-specific skills vs. general learning ability in I.2), critiques existing approaches (I.3, II.1), and acknowledges competing formalisms (AIXI, Universal Intelligence, C-Test) in II.2.1. Section III.3 discusses alternative benchmark approaches."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No models are used or evaluated in this paper."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are trained or evaluated."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No data preprocessing pipeline is involved. ARC tasks are manually created."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section III.2 'Weaknesses and future refinements' provides a detailed discussion of five specific weaknesses of the ARC benchmark."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section III.2 lists specific threats: generalization is not quantified, test validity is not established, dataset size and diversity may be limited, evaluation format is overly close-ended and binary, and Core Knowledge priors may not be well captured. These are specific to this work, not generic."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries throughout. Section II.1.2 argues intelligence is tied to a scope of application. The paper states 'Our claims are highly speculative and may well prove fully incorrect' (III.1.4). Section II.2.1 notes the definition is not 'the one true definition' and aims to be 'actionable' rather than exhaustive."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The ARC dataset is publicly available at github.com/fchollet/ARC."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section III.1.1-III.1.2 describes how ARC tasks are designed: manually created based on Core Knowledge priors, with specific constraints on grid sizes (1x1 to 30x30), 10 symbols, and the four prior categories described in III.1.2."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are formally studied. The mention of 'three high-IQ humans' checking task feasibility is informal and does not constitute a human subjects study."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper describes the design principles behind ARC tasks but does not document a systematic pipeline for task creation, selection criteria, or quality control beyond the Core Knowledge framework."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed. The author is affiliated with Google, but no funding acknowledgment section is present."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation with Google, Inc. is clearly stated on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed. The author works at Google, which has a vested interest in AI benchmarks and intelligence measurement, but no explicit funding statement is made and no conflict analysis is possible."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model is evaluated on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model is evaluated on any benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No formal human subjects study is conducted."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No formal human subjects study is conducted."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No formal human subjects study is conducted."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No formal human subjects study is conducted."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No formal human subjects study is conducted."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No formal human subjects study is conducted."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No formal human subjects study is conducted."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a theoretical paper with a benchmark proposal. No computational method is run."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No computation is performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Measuring skill at specific tasks is insufficient for measuring intelligence, because skill is heavily modulated by prior knowledge and experience that can mask generalization power.",
    286       "evidence": "Extensive theoretical argument in Sections II.1.1 and II.2, with examples including DeepBlue, OpenAI Five (beaten by non-champion humans after public release despite 45,000 years of training), and a hashtable thought experiment showing local-generalization systems can solve any task with unlimited data.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Intelligence should be formally defined as skill-acquisition efficiency over a scope of tasks, with respect to priors, experience, and generalization difficulty.",
    291       "evidence": "Formal definition provided in Section II.2.1 using Algorithmic Information Theory, with definitions of generalization difficulty, priors, experience, and a mathematical formula for intelligence.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "ARC can be used to measure a human-like form of general fluid intelligence and enables fair general intelligence comparisons between AI systems and humans.",
    296       "evidence": "Section III describes the benchmark design. However, the paper acknowledges this claim is 'highly speculative' (III.1.4), test validity is not established (III.2), and only informal human testing with three individuals was conducted.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "ARC is fully solvable by humans but cannot be meaningfully approached by any existing machine learning technique.",
    301       "evidence": "Section III.1.4 states each task was solved by at least one of three high-IQ humans. The claim about ML is stated without systematic evidence — no ML baselines are reported.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Human general intelligence is not universal but is highly specialized, tied to the scope of human-relevant tasks.",
    306       "evidence": "Section II.1.2 provides theoretical arguments with examples: humans fail at 4D+ navigation, perform poorly on non-Euclidean TSP variants, and cannot handle tasks that require long-term planning or a large working memory. The section cites cognitive psychology literature on dimensional bias and Core Knowledge.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["theoretical"],
    311   "key_findings": "The paper proposes a formal definition of intelligence as skill-acquisition efficiency, grounded in Algorithmic Information Theory, arguing that measuring task-specific skill alone is insufficient because unlimited priors or training data can artificially inflate performance without generalization. It introduces the Abstraction and Reasoning Corpus (ARC), a benchmark of 1,000 visual reasoning tasks built on Core Knowledge priors, designed to measure developer-aware broad generalization rather than narrow skill. The paper explicitly acknowledges ARC's limitations including unestablished test validity and unquantified generalization difficulty.",
    312   "red_flags": [
    313     {
    314       "flag": "No empirical validation",
    315       "detail": "The core claims about ARC measuring general intelligence and being unsolvable by ML are not empirically tested. No ML baselines are reported. Human solvability is tested only informally with three 'high-IQ' individuals with no systematic data collection."
    316     },
    317     {
    318       "flag": "Author conflict not analyzed",
    319       "detail": "The author works at Google, a major AI company with interests in AI benchmarking and intelligence measurement. No conflict of interest statement is provided."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "Building machines that learn and think like people",
    325       "authors": ["Brenden M. Lake", "Tomer D. Ullman", "Joshua B. Tenenbaum", "Samuel J. Gershman"],
    326       "year": 2016,
    327       "relevance": "Foundational work on cognitive priors for AI that directly informs the ARC benchmark design and the paper's argument about Core Knowledge."
    328     },
    329     {
    330       "title": "Universal intelligence: A definition of machine intelligence",
    331       "authors": ["Shane Legg", "Marcus Hutter"],
    332       "year": 2007,
    333       "relevance": "Alternative AIT-based formal definition of intelligence that the paper critiques and contrasts with its own definition."
    334     },
    335     {
    336       "title": "Evaluation in artificial intelligence: from task-oriented to ability-oriented measurement",
    337       "authors": ["José Hernández-Orallo"],
    338       "year": 2017,
    339       "relevance": "Comprehensive survey of AI evaluation methods that motivates the paper's argument for ability-oriented rather than skill-oriented evaluation."
    340     },
    341     {
    342       "title": "Program Synthesis",
    343       "authors": ["Sumit Gulwani", "Alex Polozov", "Rishabh Singh"],
    344       "year": 2017,
    345       "relevance": "The paper frames ARC as a program synthesis benchmark, connecting intelligence measurement to automated program generation."
    346     },
    347     {
    348       "title": "Quantifying generalization in reinforcement learning",
    349       "authors": ["Karl Cobbe", "Oleg Klimov", "Christopher Hesse", "Taehoon Kim", "John Schulman"],
    350       "year": 2018,
    351       "relevance": "Discusses generalization measurement in RL, which the paper critiques as measuring only local generalization rather than broad abilities."
    352     },
    353     {
    354       "title": "Deep learning: A critical appraisal",
    355       "authors": ["Gary Marcus"],
    356       "year": 2018,
    357       "arxiv_id": "1801.00631",
    358       "relevance": "Critique of deep learning limitations that supports the paper's argument about local generalization being insufficient."
    359     },
    360     {
    361       "title": "A collection of definitions of intelligence",
    362       "authors": ["Shane Legg", "Marcus Hutter"],
    363       "year": 2007,
    364       "relevance": "Survey of 70+ intelligence definitions that provides context for the paper's own formal definition."
    365     },
    366     {
    367       "title": "The Measure of All Minds: Evaluating Natural and Artificial Intelligence",
    368       "authors": ["José Hernández-Orallo"],
    369       "year": 2017,
    370       "relevance": "Comprehensive book on AI evaluation that the paper builds upon for its intelligence measurement framework."
    371     }
    372   ]
    373 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs