scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23643B)
      1 {
      2   "paper": {
      3     "title": "How AI Impacts Skill Formation",
      4     "authors": ["Judy Hanwen Shen", "Alex Tamkin"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.20245",
      8     "doi": "10.48550/arXiv.2601.20245"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["rct", "qualitative"],
     13   "key_findings": "In a randomized experiment with 52 professional developers learning a new Python library (Trio), AI assistance reduced quiz scores by 17% (Cohen's d=0.738, p=0.010) without significantly improving task completion time. Qualitative analysis of screen recordings identified six AI interaction patterns: three low-scoring patterns involving cognitive offloading (delegation, progressive reliance, iterative debugging) and three high-scoring patterns involving cognitive engagement (conceptual inquiry, hybrid code-explanation, generation-then-comprehension). The control group encountered more errors, particularly Trio-related errors, which likely contributed to deeper skill formation.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Annotated transcripts released at https://github.com/safety-research/how-ai-impacts-skill-formation (Section 6, Appendix B.2)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Annotated transcripts of all 51 participants made publicly available at the GitHub repository (Appendix B.2)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The coding platform is described but not the computational environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The study design is described in detail but there are no scripts or README for replicating the experiment."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% CI error bars are shown in Figures 5, 6, and 7. Figure captions state 'Error bars represent 95% CI.'"
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "P-values reported for main effects: quiz score p=0.010, task time p=0.391 (Section 5.2.2). Cohen's d is reported as the effect size measure."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Cohen's d reported for main study (d=0.738 for quiz score, d=0.725 controlling for warm-up) and pilot D (d=1.11 task time, d=1.7 quiz score). Percentage differences also provided (17% score difference)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Power analysis conducted based on pilot study D. 'We assumed a conservative effect size of d=0.85 (half of the observed learning effect) to account for the potential effect size inflation typical in pilot studies' (Section 5.1)."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Main outcome measures (quiz score, task time) report 95% CIs but not standard deviations. Table 4 reports median and IQR for error counts, but primary outcomes lack explicit variance measures."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Control group (No AI, n=26) serves as the baseline, completing the same tasks without AI assistance."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The no-AI control condition is the natural and appropriate baseline for measuring the effect of AI assistance on learning."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is an RCT comparing two conditions (AI vs no AI), not a system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two primary metrics: task completion time and quiz score. Quiz score further decomposed into conceptual, debugging, and code reading subareas (Figure 8)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a human subjects study measuring human learning outcomes, not a system whose outputs require human evaluation."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not a machine learning evaluation; no train/test split concept applies."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 8 provides per-task and per-skill-type breakdowns (Task 1, Task 2, Conceptual, Debugging, Code Reading). Figure 7 breaks results by years of coding experience."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 provides detailed qualitative analysis of low-scoring AI interaction patterns (AI Delegation, Progressive AI Reliance, Iterative AI Debugging) and discusses why they lead to poor outcomes."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The null result on task completion time (p=0.391, no significant speedup from AI) is a negative result contrary to prior work and is reported prominently."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims that AI 'impairs conceptual understanding, code reading, and debugging abilities, without delivering significant efficiency gains' are supported by Figure 6 (quiz score p=0.010, task time p=0.391) and Figure 8 (per-skill breakdown)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are justified by randomized controlled trial design with balanced groups (Table 1), pre-registration, and multiple pilot studies addressing compliance threats."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title 'How AI Impacts Skill Formation' is very broad. Results are from one specific library (Trio), one AI model (GPT-4o), one task type (learning a new library), and crowd-sourced developers. While Section 7.1 lists limitations, the title and abstract frame findings more broadly than the evidence supports."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 discusses heterogeneity in AI usage patterns as explanation for the null productivity result. Section 7 discusses cognitive engagement as the mechanism. Section 7.1 discusses alternative evaluation designs, task selection, and participant realism."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly frames quiz score as measuring 'skill formation' and discusses what the quiz does and does not capture (Section 4.2). Section 7.1 acknowledges evaluation design as a limitation and suggests alternative measurement strategies."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'The base model used for this assistant is GPT-4o' (Section 4.1) without specifying a version date or API snapshot. Per schema guidance, marketing names without snapshot dates do not count."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper states 'the model is prompted to be an intelligent coding assistant' and 'The AI assistant has access to participants' current version of the code' (Section 4.1) but does not provide the actual system prompt text."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max tokens, or other API parameters are reported for the GPT-4o assistant."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The AI assistant is a simple chat interface on a coding platform."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data collection pipeline well documented: recruitment survey → warm-up task → coding task → quiz → post-survey. Disqualification criteria pre-registered (1 participant excluded for leaving 4 blank questions, Section 5.2.1). Pilot study iterations documented in Table 2."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.1 'Future Work' contains substantive discussion of limitations including task selection, task length, participant realism, prompting skills, and evaluation design."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.1 lists specific threats: single task with chat-based interface (not agentic), one-hour timeframe vs months/years of real skill formation, crowd workers vs actual employees with job incentives, and unmeasured prompting skill differences."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.1 explicitly states what was NOT tested: agentic coding tools, longitudinal skill development, real company settings, differences in prompting fluency, and alternative evaluation strategies."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Annotated transcripts of all participants made publicly available at https://github.com/safety-research/how-ai-impacts-skill-formation (Appendix B.2)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection thoroughly described: coding platform keystrokes, AI chat transcripts, screen recordings, Google Forms surveys, recruitment surveys (Section 4.3, Appendix B.1). Instruments and timing documented."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Participants recruited through 'a third party crowd-worker platform' with specific criteria: 1+ years Python experience, weekly Python use, prior AI tool experience, no Trio experience (Section 4.3). Balanced on experience and asyncio familiarity."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Pipeline documented: 58 recruited → 53 completed all parts → 1 disqualified per pre-registered criteria → 52 analyzed (Section 5.2.1). One screen recording unavailable (footnote 3)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Anthropic Fellows Program and the authors' Anthropic affiliations are disclosed in the author footnotes: 'Work done as a part of the Anthropic Fellows Program'."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Both authors' Anthropic affiliations clearly stated: Shen via the Anthropic Fellows Program, Tamkin directly at Anthropic."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Anthropic is a major AI company with strategic interest in how AI assistance is perceived. The study's findings (AI impairs learning if used without cognitive engagement) align with Anthropic's safety-oriented positioning. The funder has a stake in the narrative around responsible AI use."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is an RCT studying human learning outcomes, not evaluating a pre-trained model's capability on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not evaluating a pre-trained model on a benchmark. The AI assistant is a tool used by participants, not the subject of capability evaluation."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not evaluating a pre-trained model on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Study pre-registered at https://osf.io/w49e7 (Section 5.1). 'We submitted the grading rubric for the quiz in our study pre-registration before running the experiment' (Section 4.2)."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "'The protocol was reviewed and approved by internal reviewers at Anthropic' (Appendix A.1). While this is internal review rather than an independent IRB, ethics review and approval is explicitly mentioned."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Figure 17 reports age, education, coding context, and student status. Table 1 reports coding experience, Python frequency, asyncio experience, and async quiz scores by condition."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 4.3: 'We only recruited participants who self-reported having more than one year of Python experience, code in Python at least once a week, have tried AI coding assistance at least a few times, and have never used the Trio library before.'"
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Between-subjects randomized experiment with balanced assignment across coding experience, Python frequency, asyncio familiarity, and pre-task quiz score (Table 1, Section 4.3)."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No discussion of blinding. Participants clearly knew their condition: treatment group was prompted to use AI (Figure 24), control group pledged not to use AI (Figure 21). No mention of evaluator blinding for quiz grading or qualitative analysis."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "58 recruited, 53 completed all parts, 1 disqualified for leaving blank questions due to not realizing quiz had multiple parts, 52 analyzed (Section 5.2.1). 4 control participants did not complete task 2 within time limit."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a human subjects RCT studying learning outcomes, not proposing an AI method whose inference cost matters."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a human subjects RCT, not a compute-intensive AI method."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "AI assistance reduces skill formation (quiz scores) by 17% or 2 grade points when learning a new programming library.",
    296       "evidence": "Main study (n=52): Cohen's d=0.738, p=0.010, 4.15 point difference on 27-point quiz. Effect persists when controlling for warm-up task time (d=0.725, p=0.016). Figures 6, 7.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "AI assistance does not significantly improve task completion time when new skills are required.",
    301       "evidence": "Main study: p=0.391 for task time difference between AI and no-AI groups (Figure 6). Qualitative analysis shows time spent interacting with AI offsets potential speedup.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Three AI interaction patterns (Conceptual Inquiry, Hybrid Code-Explanation, Generation-Then-Comprehension) preserve learning outcomes by maintaining cognitive engagement.",
    306       "evidence": "Qualitative analysis of screen recordings (Section 6). High-scoring patterns achieved 65-86% quiz scores vs 24-39% for low-scoring patterns (Figure 11). Small cluster sizes (n=2-7 per pattern).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Debugging skills show the largest gap between AI and non-AI groups.",
    311       "evidence": "Figure 8 shows debugging questions have the largest score difference between conditions, while code reading has the smallest. Attributed to control group encountering more errors (Table 4).",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Participants who fully delegated coding to AI finished fastest but learned the least.",
    316       "evidence": "AI Delegation cluster (n=4): 19.5 min completion, 39% quiz score vs control group 23 min and higher scores (Figure 11, Section 6).",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Company conflict of interest",
    323       "detail": "Both authors are affiliated with Anthropic (a major AI company). The finding that AI impairs learning without cognitive engagement aligns with Anthropic's safety-oriented messaging. No competing interests or financial interests statement is provided."
    324     },
    325     {
    326       "flag": "Small subgroup sizes in qualitative analysis",
    327       "detail": "The six AI interaction patterns have cluster sizes of n=2 to n=7. Quantitative conclusions about these patterns (quiz scores, completion times) are drawn from very small groups."
    328     },
    329     {
    330       "flag": "Internal ethics review only",
    331       "detail": "Ethics review was conducted by 'internal reviewers at Anthropic' rather than an independent IRB, which is unusual for human subjects research and provides less oversight."
    332     },
    333     {
    334       "flag": "AI model setup underspecified",
    335       "detail": "GPT-4o version, system prompt, and hyperparameters are not reported. The AI assistant's behavior could significantly affect results, and this setup is not reproducible."
    336     },
    337     {
    338       "flag": "Short task duration may not reflect real skill formation",
    339       "detail": "The entire coding task is 35 minutes. Real skill formation occurs over months to years. The authors acknowledge this in Section 7.1 but the framing ('How AI Impacts Skill Formation') suggests broader applicability."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    345       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    346       "year": 2023,
    347       "arxiv_id": "2302.06590",
    348       "relevance": "Key prior work on AI coding productivity gains (55.5% speedup claim), directly contrasted with this paper's null productivity result."
    349     },
    350     {
    351       "title": "The effects of generative ai on high skilled work: Evidence from three field experiments with software developers",
    352       "authors": ["Zheyuan Kevin Cui", "Mert Demirer", "Sonia Jaffe", "Leon Musolff", "Sida Peng", "Tobias Salz"],
    353       "year": 2024,
    354       "relevance": "Field experiments showing 26.8% productivity boost from AI code completions, with less experienced coders benefiting more."
    355     },
    356     {
    357       "title": "Navigating the jagged technological frontier: Field experimental evidence of the effects of ai on knowledge worker productivity and quality",
    358       "authors": ["Fabrizio Dell'Acqua", "Edward McFowland III", "Ethan R Mollick"],
    359       "year": 2023,
    360       "relevance": "Influential field experiment on AI-enhanced consulting productivity showing 12.2% more task completions."
    361     },
    362     {
    363       "title": "Generative ai at work",
    364       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    365       "year": 2025,
    366       "relevance": "AI call center study showing 15% productivity improvement, with less experienced workers benefiting most."
    367     },
    368     {
    369       "title": "Measuring the impact of early-2025 ai on experienced open-source developer productivity",
    370       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    371       "year": 2025,
    372       "arxiv_id": "2507.09089",
    373       "relevance": "RCT finding slowdown effects for expert coders, supporting this paper's finding that AI does not always improve productivity."
    374     },
    375     {
    376       "title": "Which economic tasks are performed with ai? evidence from millions of claude conversations",
    377       "authors": ["Kunal Handa", "Alex Tamkin", "Miles McCain"],
    378       "year": 2025,
    379       "arxiv_id": "2503.04761",
    380       "relevance": "Large-scale analysis of real-world AI usage patterns across professional domains including software engineering."
    381     },
    382     {
    383       "title": "Measuring progress on scalable oversight for large language models",
    384       "authors": ["Samuel R Bowman", "Jeeyoon Hyun", "Ethan Perez"],
    385       "year": 2022,
    386       "arxiv_id": "2211.03540",
    387       "relevance": "Foundational work on the scalable oversight problem — supervising increasingly capable AI systems requires maintaining human competence."
    388     },
    389     {
    390       "title": "GenAI as an exoskeleton: Experimental evidence on knowledge workers using genai on new skills",
    391       "authors": ["Emma Wiles", "Lisa Krayer", "Mohamed Abbadi"],
    392       "year": 2024,
    393       "relevance": "Directly related finding that AI-enhanced technical abilities did not persist when workers lost AI access."
    394     },
    395     {
    396       "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
    397       "authors": ["Shakked Noy", "Whitney Zhang"],
    398       "year": 2023,
    399       "relevance": "Early experimental evidence on AI writing productivity effects, finding lower-skilled workers benefit most."
    400     },
    401     {
    402       "title": "Rocks coding, not development: A human-centric, experimental evaluation of llm-supported se tasks",
    403       "authors": ["Wei Wang", "Huilong Ning", "Gaowei Zhang"],
    404       "year": 2024,
    405       "relevance": "Human-centric evaluation of LLM-supported software engineering tasks with diverse interaction pattern analysis."
    406     },
    407     {
    408       "title": "The impact of generative ai on critical thinking: Self-reported reductions in cognitive effort and confidence effects from a survey of knowledge workers",
    409       "authors": ["Hao-Ping Lee", "Advait Sarkar", "Lev Tankelevitch"],
    410       "year": 2025,
    411       "relevance": "Survey finding that generative AI reduces cognitive effort and confidence in knowledge workers, supporting cognitive offloading concerns."
    412     }
    413   ]
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs