ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21441B)


      1 {
      2   "paper": {
      3     "title": "Artificial or Human Intelligence?",
      4     "authors": ["Eric Gao"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.02879"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": false,
     13         "answer": false,
     14         "justification": "This is a purely theoretical economics paper with mathematical proofs and no computational experiments. There is no code to release."
     15       },
     16       "data_released": {
     17         "applies": false,
     18         "answer": false,
     19         "justification": "No empirical data is collected or used. The paper is purely theoretical with mathematical models."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "No computational experiments are performed. The paper contains only mathematical derivations and proofs."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "As a purely theoretical paper, reproduction consists of verifying mathematical proofs, which are provided in the text itself. No experimental reproduction instructions are needed."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "No empirical results are reported. The paper contains only theoretical derivations."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No empirical comparisons are made. All results are mathematical propositions."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No empirical effect sizes. Results are theoretical comparative statics expressed as inequalities on partial derivatives."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Theoretical paper with no samples."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No empirical runs or experiments. All results are analytical."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a theoretical model, not an evaluation of a system. The paper compares its model's predictions to related theoretical and empirical work in the related literature section, but does not perform a baseline comparison in the experimental sense."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No experimental baselines. The paper is purely theoretical."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate. The paper derives comparative statics for different model parameters, which serves a similar conceptual role but is not an ablation study."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No evaluation metrics are used. Results are mathematical propositions about equilibrium behavior."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate. The paper is purely theoretical."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No data or test sets used."
     89       },
     90       "per_category_breakdown": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No empirical results to break down by category."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 7 (Discussion) extensively discusses the model's limitations and the cases where its predictions may not hold, including multi-dimensional skills, heterogeneous student circumstances, and the simplistic payoff function. The paper notes 'the model considered in this paper is extremely simplistic' and lists several ways the model could fail to capture real-world dynamics."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that advances in AI technology do not always lead to increases in human learning (Section 5, 7), and that AI can increase inequality between students. These are negative findings about AI in education. Section 5.1 explicitly shows conditions under which AI advances make things worse for students who use AI as a solver."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims are supported by the theoretical results. Key claims: (1) AI alters student incentives to learn — supported by Proposition 1 and comparative statics in Section 5; (2) discontinuous gap between students above/below AI frontier — supported by Proposition 1's proof of discontinuity at threshold T; (3) increasing AI-free assignments can counteract misspecification — supported by Section 6's analysis showing λ = p/p' restores efficiency."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims within a formal model (e.g., 'increasing the accuracy of AI decreases the marginal benefit to learning for students who use AI as a solver'). These are derived as mathematical consequences of the model assumptions through comparative statics (Section 5), which is the appropriate methodology for theoretical economics. The causal claims are valid within the model's assumptions."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 7 (Discussion) extensively bounds the model's generalizability: 'the model considered in this paper is extremely simplistic. There is only one dimension of problems but students often require multiple complementary skills.' The paper lists specific dimensions not covered: multi-dimensional skills, heterogeneous time constraints, different degrees of misspecification across students, and differences between classroom and real-world problem distributions."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 discusses multiple alternative mechanisms that could explain observed patterns differently: complementarity vs substitutability of skills, nuanced AI usage patterns (helpers sometimes using AI as solver for routine tasks), heterogeneous student time constraints, and self-selection into courses as a confound (footnote 3). Section 3.2 also discusses the alternative view that emergence is an artifact of discontinuous benchmarks rather than true model behavior (Schaeffer et al., 2023)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No LLM or AI model is used, and the paper reports no experiments. It models AI capabilities theoretically through the abstract parameters d and p."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used. This is a theoretical paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No computational experiments requiring hyperparameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding. This is a theoretical paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No data preprocessing. The paper is purely theoretical with no data."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 (Discussion) functions as a substantial limitations section, discussing at length the simplifications of the model and open questions. It spans roughly two pages and identifies multiple specific limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 identifies specific threats: the one-dimensional problem space ('students often require multiple complementary skills'), the simplistic payoff function ('students may make more nuanced decisions about when and how to use AI tools'), the single misspecification parameter ('different students may be misspecified to different degrees'), and the gap between classroom and real-world problem distributions."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 explicitly states what the model does NOT address: multi-dimensional skills, heterogeneous time availability, varying degrees of misspecification, the difference between classroom and real-world problem distributions, and the feedback loop where AI learns from humans. The paper says 'Most interesting would be to take the model to data' — explicitly acknowledging the paper provides no empirical validation."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "No data is collected. The paper is purely theoretical."
    177       },
    178       "data_collection_described": {
    179         "applies": false,
    180         "answer": false,
    181         "justification": "No data collection. Purely theoretical paper."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No participants or data collection. Purely theoretical."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No data pipeline. Purely theoretical paper."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure is present. The paper acknowledges individuals for 'helpful comments and discussions' but does not mention any funding source or grant support."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliation is disclosed: 'Department of Economics, MIT' with email ericgao@mit.edu."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed, and this appears to be unfunded academic work by a single MIT researcher."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model is evaluated on any benchmark. The paper is a theoretical model."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation on benchmarks. Purely theoretical."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmarks used. Purely theoretical paper."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. Purely theoretical paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Purely theoretical paper. No method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Purely theoretical paper. No computation performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Access to AI creates a discontinuous gap in student ability at a threshold type, where students below the threshold use AI as a solver (lower ability than AI) and those above use it as a helper (higher ability than AI).",
    286       "evidence": "Proposition 1 (Section 4) proves the existence of a unique threshold type T with a discontinuity in the ability mapping A(t), showing lim(t→T-) A(t) = as(T) < d < ah(T) = lim(t→T+) A(t).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Increasing AI accuracy (reducing hallucinations) decreases the marginal benefit of learning for students who use AI as a solver but does not impact those who use AI as a helper.",
    291       "evidence": "Section 5.1 shows that the marginal benefit for solvers is (1-p) which decreases as p increases, while the marginal benefit for helpers is 1 regardless of p.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The more advanced AI becomes (higher d and p), the more difficult it is for further AI advances to increase human learning.",
    296       "evidence": "Section 5.2 derives conditions for when the solver-helper threshold T increases with d and p, showing the required complementarities between ability and AI parameters must be larger when d and p are already high.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Instructors can counteract student misspecification about AI accuracy by placing weight on assignments that do not permit AI usage, with optimal weight λ = p/p'.",
    301       "evidence": "Section 6 derives that when students overestimate AI accuracy (p' > p), setting λ = p/p' in the assignment weight makes the misspecified student's problem coincide with the true problem, restoring efficiency.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "AI advances can increase inequality between students of different types, because it is more difficult for advances to incentivize humans who use AI as a solver to invest in higher ability than to incentivize those who use it as a helper.",
    306       "evidence": "Section 5.1 shows the condition for increasing differences in ability with p is harder to satisfy for solvers (requiring ∂²c/∂a∂p < -1) than for helpers (requiring ∂²c/∂a∂p < 0).",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["theoretical"],
    311   "key_findings": "This theoretical economics paper models how AI tools with hallucinations and emergent capabilities affect student incentives to invest in learning. The key finding is that AI access creates a discontinuous gap in student ability: students below a threshold type use AI as a solver (with lower ability than AI) while those above the threshold use it as a helper (with higher ability). Advances in AI accuracy can increase this inequality by reducing learning incentives for solver-students more than helper-students. The paper shows that educators can counteract student overestimation of AI accuracy by optimally weighting assignments that prohibit AI usage.",
    312   "red_flags": [
    313     {
    314       "flag": "No empirical validation",
    315       "detail": "The paper is entirely theoretical with no empirical tests of its predictions. The author acknowledges this ('Most interesting would be to take the model to data') but the model's predictions remain untested. The stylized assumptions (uniform problem distribution, sharp AI cutoff, one-dimensional ability) may not hold in practice."
    316     },
    317     {
    318       "flag": "Strong simplifying assumptions",
    319       "detail": "The model assumes one-dimensional problems, one-dimensional ability, uniform problem distribution, perfect human accuracy within their ability range, and a sharp AI cutoff. These assumptions drive the key results (especially the discontinuity at threshold T) but are acknowledged as unrealistic by the author in Section 7."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "Generative AI Can Harm Learning",
    325       "authors": ["Hamsa Bastani", "Osbert Bastani", "Alp Sungu", "Haosen Ge", "Özge Kabakcı", "Rei Mariman"],
    326       "year": 2024,
    327       "doi": "10.2139/ssrn.4895486",
    328       "relevance": "Field experiment on GPT-4 in high-school math classes showing a 17% decrease in exam performance when AI is later removed, directly relevant to AI's impact on learning."
    329     },
    330     {
    331       "title": "Generative AI at Work",
    332       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    333       "year": 2025,
    334       "doi": "10.1093/qje/qjae044",
    335       "relevance": "Empirical study of AI adoption among customer-support agents showing AI benefits lower-ability workers more, foundational evidence for AI productivity effects."
    336     },
    337     {
    338       "title": "Canaries in the Coal Mine? Six Facts about the Recent Employment Effects of Artificial Intelligence",
    339       "authors": ["Erik Brynjolfsson", "Bharat Chandar", "Ruyu Chen"],
    340       "year": 2025,
    341       "relevance": "Uses payroll data to study AI impact on labor markets, finding young workers in AI-exposed fields harmed most, relevant to AI workforce effects."
    342     },
    343     {
    344       "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
    345       "authors": ["Shakked Noy", "Whitney Zhang"],
    346       "year": 2023,
    347       "doi": "10.1126/science.adh2586",
    348       "relevance": "RCT studying how LLM assistance impacts writing-based tasks, key evidence on AI productivity effects and inequality."
    349     },
    350     {
    351       "title": "Generative AI enhances individual creativity but reduces the collective diversity of novel content",
    352       "authors": ["Anil R Doshi", "Oliver P Hauser"],
    353       "year": 2024,
    354       "doi": "10.1126/sciadv.adn5290",
    355       "relevance": "Experimental study showing AI increases individual creativity but reduces population-level diversity, relevant to collective effects of AI assistance."
    356     },
    357     {
    358       "title": "AI Meets the Classroom: When Do Large Language Models Harm Learning?",
    359       "authors": ["Matthias Lehmann", "Philipp B. Cornelius", "Fabian J. Sting"],
    360       "year": 2024,
    361       "arxiv_id": "2409.09047",
    362       "relevance": "Lab study on GPT access in programming courses finding students asked for solutions more than explanations, directly relevant to AI in education."
    363     },
    364     {
    365       "title": "Artificial Intelligence in the Knowledge Economy",
    366       "authors": ["Enrique Ide", "Eduard Talamas"],
    367       "year": 2025,
    368       "doi": "10.1086/737233",
    369       "relevance": "Closest theoretical prior work modeling how AI impacts labor markets with exogenous worker ability, which this paper extends to endogenous investment."
    370     },
    371     {
    372       "title": "Emergent Abilities of Large Language Models",
    373       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    374       "year": 2022,
    375       "doi": "10.48550/arXiv.2206.07682",
    376       "relevance": "Foundational paper documenting emergent abilities in LLMs, one of the two stylized facts motivating this paper's model."
    377     },
    378     {
    379       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    380       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    381       "year": 2023,
    382       "doi": "10.48550/arXiv.2304.15004",
    383       "relevance": "Challenges the emergence hypothesis by arguing discontinuities may come from benchmarks rather than models, directly discussed in Section 3.2."
    384     },
    385     {
    386       "title": "Effects of AI Feedback on Learning, the Skill Gap, and Intellectual Diversity",
    387       "authors": ["Christoph Riedl", "Eric Bogert"],
    388       "year": 2024,
    389       "arxiv_id": "2409.18660",
    390       "relevance": "Studies how AI chess engines impact player learning, finding increased inequality — relevant to AI in education and skill development."
    391     },
    392     {
    393       "title": "The ABC's of Who Benefits from Working with AI: Ability, Beliefs, and Calibration",
    394       "authors": ["Andrew Caplin", "David Deming", "Shangwen Li", "Daniel Martin", "Philip Marx", "Ben Weidmann", "Kadachi Jiada Ye"],
    395       "year": 2024,
    396       "doi": "10.3386/w33021",
    397       "relevance": "Experimental study on how AI aids human decision-makers, finding that miscalibrated humans accrue less benefit — relevant to the misspecification model in this paper."
    398     }
    399   ]
    400 }

Impressum · Datenschutz