scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31005B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How AI Impacts Skill Formation",
      6     "authors": [
      7       "Judy Hanwen Shen",
      8       "Alex Tamkin"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2601.20245",
     13     "doi": "10.48550/arXiv.2601.20245"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All claims in the abstract are substantiated by results: quiz scores were significantly lower in the AI condition (Cohen's d=0.738, p=0.010), task time was not significantly improved (p=0.391), and six AI interaction patterns are identified and categorized in the qualitative analysis.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper uses a between-subjects RCT with random assignment to AI and no-AI conditions, which is an appropriate design for causal inference about the effect of AI assistance on skill formation.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The abstract and discussion recommend that 'AI assistance should be carefully adopted into workflows to preserve skill formation — particularly in safety-critical domains,' but the evidence comes from 52 crowdworkers completing a 35-minute task with a single Python library; the extension to safety-critical professional domains is unsupported.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The authors discuss why AI failed to speed up task completion (time spent composing queries) and why the control group learned more (independently encountering and resolving Trio-specific errors forced deeper engagement with library concepts).",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly distinguishes between the 14-question immediate post-task quiz and the broader claim of 'skill formation,' acknowledging in Section 7.1 that real skill formation takes months to years rather than one hour.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 7.1 'Future Work' explicitly enumerates six specific limitations: task selection (single library, chat interface only), task length (one hour vs. months), participant realism (crowdworkers vs. employees), prompting skills measurement, evaluation design alternatives, and the human-assistance counterfactual.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are addressed: non-compliance rates in early pilots documented across four pilot studies (25–35%), ecological validity concerns (crowdworkers may not behave like employees with genuine learning stakes), and the short task duration limiting skill formation claims.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The authors explicitly state 'This study focuses on a single task using a chat-based interface' and 'We measured skill formation for a specific Python library over a one-hour period,' providing clear scope boundaries in Section 7.1.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The acknowledgments state the work was conducted as part of the Anthropic Safety Fellows Program, disclosing Anthropic as the funding body.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors disclose Anthropic affiliations via their email domains (judy@anthropic.com, atamkin@anthropic.com) and author footnotes identifying them as Anthropic Fellows Program participant and Anthropic employee.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Anthropic funded the study through its Fellows Program and employs both authors; Anthropic is an AI company with direct interest in how AI tools are perceived and adopted, even if its safety mission nominally aligns with reporting harms.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "There is no competing interests or financial interests declaration in the paper; Anthropic affiliation is disclosed through acknowledgments but not framed as a potential conflict of interest.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are operationalized: 'skill formation' is measured via a 14-question quiz covering debugging, code reading, and conceptual understanding; 'productivity' is task completion time; the AI assistant is described as GPT-4o prompted as a coding assistant with access to participant code.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The contribution is clearly stated: a randomized experiment measuring the causal effect of AI coding assistance on both task productivity and skill acquisition when learning a new library, adding causal evidence that prior observational work lacked.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper engages substantively with prior work (Peng et al. 2023, Dell'Acqua et al. 2023, Brynjolfsson et al. 2025, Wu et al. 2025, Macnamara et al. 2024), situating its contribution as adding causal evidence to observational studies and extending analysis beyond end-product productivity metrics.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "The GitHub repository (github.com/safety-research/how-ai-impacts-skill-formation) releases annotated transcripts but not the statistical analysis code used to compute test statistics, effect sizes, or generate figures.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Primary outcome data (quiz scores, task completion times) are stored in Google Drive and the coding platform but are not publicly released; only qualitative annotated screen-recording transcripts are available on GitHub.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No environment specifications (requirements.txt, Dockerfile, or dependency list) are provided; the coding platform and AI assistant infrastructure are mentioned only by type, not reproducible configuration.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction instructions are provided for replicating the experiment setup, data collection, or statistical analysis; the appendix describes the study protocol but not how to computationally reproduce results.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Figures 5 and 6 explicitly show error bars representing 95% confidence intervals for both task time and quiz score across conditions.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Statistical significance tests are used throughout with p-values reported for treatment effects (e.g., p=0.010 for quiz score, p=0.391 for task time in the main study; p=0.016 after covariate adjustment).",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Cohen's d is reported for all main treatment effects: d=0.738 for quiz score in the main study, d=0.725 controlling for warm-up time, and d=1.11 and d=1.7 for pilot study outcomes.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "A power analysis is described: the authors assumed a conservative effect size of d=0.85 (half the observed pilot effect of d=1.7) to account for typical pilot inflation, justifying the target of 50 participants.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "95% confidence intervals are shown in all main results figures; Table 4 reports median and interquartile range (Q1–Q3) for error counts per condition.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The no-AI control condition serves as a direct baseline for both productivity and skill formation outcomes in a between-subjects design.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The comparison is between a GPT-4o-based AI assistant (contemporary as of 2025–2026) and no AI assistance, which is a relevant and current baseline.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": false,
    190           "answer": false,
    191           "justification": "N/A — the study is a two-condition experiment (AI vs. no AI) testing a behavioral intervention, not evaluating a multi-component system that requires ablation.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple metrics are used: task completion time, overall quiz score, quiz scores by question type (debugging, code reading, conceptual) and by task, error counts by type, self-reported learning/enjoyment/difficulty, and active coding time.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "N/A — the study measures human performance outcomes (quiz scores, task time) rather than using human raters to evaluate AI system outputs.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "N/A — this is a behavioral experiment, not a prediction task with a train/test split.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Figure 8 breaks down quiz scores by question type (debugging, code reading, conceptual) and by task (Task 1 vs. Task 2); Figure 7 shows results stratified by years of coding experience.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "The qualitative analysis explicitly characterizes failure cases: three low-scoring AI interaction patterns (AI Delegation, Progressive AI Reliance, Iterative AI Debugging) with quiz scores of 24–39% are analyzed in depth, including sample participant behaviors.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The null result on productivity (p=0.391 for task time in the main study) is explicitly reported and discussed in contrast to prior work (Peng et al. 2023, Cui et al. 2024) that found significant gains.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "The AI assistant is described as 'GPT-4o' without a snapshot date or API version; GPT-4o receives ongoing updates, making exact reproduction impossible.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "The system prompt is described only as configuring GPT-4o to be 'an intelligent coding assistant' with access to participant code; the actual prompt text is not provided.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "No hyperparameters for the GPT-4o assistant (temperature, top-p, max tokens, etc.) are reported.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "N/A — the AI assistant is a simple chat interface with code context access, not an agentic system with multi-step scaffolding.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Pre-registration documents disqualification criteria applied prospectively; one participant was excluded for leaving blank quiz questions due to time, consistent with pre-registered criteria. Quiz item selection via item response theory across five versions is also documented.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "Quiz responses are stored in Google Drive and keystroke data in the coding platform, neither of which is publicly accessible; only annotated qualitative transcripts are released on GitHub.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Data collection is thoroughly described: keystroke logging through the coding platform, AI chat transcripts, screen recordings (51/52 participants), and Google Forms surveys administered before and after the task.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Participant recruitment is described: third-party crowdworker platform, $150 flat fee, pre-screening requiring 1+ year Python experience, weekly Python use, prior AI tool experience, and no prior Trio library use.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The full pipeline is documented: pre-screening survey, warm-up task calibration, main Trio coding task (35 min, AI vs. no-AI), post-task quiz, post-task survey, and manual screen recording annotation with event taxonomy (Table 5).",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "N/A — the study uses GPT-4o as an intervention tool for human participants, not to evaluate model capabilities on a benchmark; training cutoff is not relevant to the study's claims.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "N/A — the evaluation is of human skill formation, not model benchmark performance.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "N/A — the quiz questions were researcher-designed for this study and Trio is not a standard benchmark; contamination is not applicable.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": true,
    312           "answer": true,
    313           "justification": "The study is pre-registered on OSF (https://osf.io/w49e7); the pre-registration included disqualification criteria and the grading rubric submitted before running the main experiment.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": true,
    318           "answer": true,
    319           "justification": "Appendix A.1 states: 'The protocol was reviewed and approved by internal reviewers at Anthropic,' explicitly covering informed consent, risk assessment, right to withdraw, and data anonymization procedures.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": true,
    324           "answer": true,
    325           "justification": "Figure 17 reports participant demographics collected post-task to avoid stereotype threat: age distribution, education level, student status, and coding context (professional developer, freelance, academic, etc.).",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": true,
    330           "answer": true,
    331           "justification": "Inclusion criteria are stated: 1+ year Python experience, weekly Python use, prior AI coding tool experience, no prior Trio library experience. One participant was excluded per pre-registered disqualification criteria.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": true,
    336           "answer": true,
    337           "justification": "Between-subjects random assignment is described with balance verification; Table 1 shows near-equal distribution of coding experience, Python frequency, asyncio familiarity, and warm-up task time across treatment and control groups.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": true,
    342           "answer": false,
    343           "justification": "Participants in the treatment group were explicitly prompted to use the AI assistant; blinding of participants was impossible and not described. Whether quiz graders were blinded to condition assignment is not mentioned.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": true,
    348           "answer": true,
    349           "justification": "53 of 58 recruited participants completed all three study parts; 1 was disqualified per pre-registered criteria (blank quiz questions due to time), yielding n=52. Pilot non-compliance and dropout rates are also systematically reported (Table 2).",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "N/A — this is a human subjects study using AI as an experimental stimulus; API inference cost is not relevant to the paper's claims.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "N/A — no significant compute budget beyond API usage for the chat assistant; not relevant to the study's contribution.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "AI assistance significantly impairs skill formation: participants using AI scored 17% lower on the post-task knowledge quiz (Cohen's d=0.738, p=0.010)",
    372       "evidence": "Main study n=52, pre-registered between-subjects RCT; 4.15-point gap on 27-point quiz; effect persists after controlling for warm-up time (d=0.725, p=0.016)",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "AI assistance did not significantly improve task completion time on average (p=0.391 in main study)",
    377       "evidence": "AI mean ~22 min vs. No AI ~23 min, not significant; qualitative analysis attributes this to query-composition time (some participants spent up to 11 minutes interacting with AI during the 35-minute task)",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Three AI interaction patterns involving cognitive engagement (Conceptual Inquiry, Hybrid Code-Explanation, Generation-Then-Comprehension) preserve skill formation with quiz scores of 65–86%",
    382       "evidence": "Qualitative clustering of 25 AI-condition participants; high-scoring clusters contain only n=2–7 participants each with no statistical tests applied at the pattern level, making point estimates unreliable",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Participants who fully delegated coding to AI (AI Delegation pattern) completed tasks fastest (19.5 min) but scored worst on the quiz (39%)",
    387       "evidence": "n=4 participants in this cluster; very small subgroup makes the specific figures illustrative rather than reliable estimates",
    388       "supported": "moderate"
    389     },
    390     {
     391       "claim": "Independently encountering and resolving errors is a key mechanism for skill acquisition, as the control group encountered 3× as many errors (median 3 vs. 1) and scored higher",
    392       "evidence": "Table 4 and Figure 15 show the error distribution difference; the causal link from error resolution to learning is an interpretive inference from observational qualitative data, not experimentally isolated",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Debugging questions showed the largest score gap between AI and no-AI conditions among question types",
    397       "evidence": "Figure 8 shows this pattern but it is explicitly described as exploratory (not pre-registered); consistent with the error-resolution learning hypothesis",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "rct",
    403     "qualitative",
    404     "observational"
    405   ],
    406   "key_findings": "A pre-registered between-subjects RCT with 52 software developers found that AI coding assistance significantly reduced skill acquisition for a new Python library (17% lower quiz scores, Cohen's d=0.738, p=0.010) without delivering significant productivity gains on average (p=0.391). The lack of speedup is explained by heterogeneous AI usage: ~20% of AI users delegated entirely and finished faster, but most spent substantial time composing queries, raising average completion time. Six qualitatively identified interaction patterns show a stark divide: patterns involving cognitive engagement (asking conceptual questions, requesting explanations) preserved learning (65–86% quiz scores), while passive delegation patterns produced weak learning outcomes (24–39%). The results suggest that independently encountering and resolving Trio-specific errors drives skill formation, and that debugging is the skill most damaged by AI reliance.",
    407   "red_flags": [
    408     {
    409       "flag": "Underpowered subgroup analysis",
    410       "detail": "The six AI interaction pattern clusters contain n=2–7 participants each; no statistical tests are applied to pattern-level differences, yet specific quiz scores and completion times are reported as if reliable."
    411     },
    412     {
    413       "flag": "Ecological validity gap",
     414       "detail": "Crowdworkers paid $150 for a one-hour task have different learning incentives than employees acquiring skills for career advancement; the paper acknowledges this in limitations but still generalizes to 'professional domains' and 'safety-critical domains.'"
    415     },
    416     {
    417       "flag": "Immediate post-task quiz ≠ skill formation",
    418       "detail": "The quiz is administered immediately after the 35-minute task with no follow-up; this measures immediate recall/comprehension, not the durable skill formation the paper claims to study."
    419     },
    420     {
    421       "flag": "GPT-4o model version unspecified",
    422       "detail": "No snapshot date or API version is given for the GPT-4o assistant; exact replication is impossible given ongoing model updates."
    423     },
    424     {
    425       "flag": "Primary quantitative data not released",
    426       "detail": "Quiz scores and task completion times (the primary outcomes) are stored in Google Drive and the coding platform and are not publicly available; only qualitative annotated transcripts are released on GitHub."
    427     },
    428     {
    429       "flag": "Overbroad generalization",
    430       "detail": "Recommendations extend to 'safety-critical domains' and the abstract frames findings against 'professional domains' from a narrow study of crowdworkers learning one Python library in one hour."
    431     },
    432     {
    433       "flag": "Exploratory analyses not distinguished",
    434       "detail": "The concept group question-type breakdown (Figure 8) and qualitative interaction pattern taxonomy are explicitly not pre-registered but are presented with similar emphasis to the pre-registered primary outcomes."
    435     },
    436     {
    437       "flag": "Internal ethics review only",
    438       "detail": "Ethics review was conducted by internal Anthropic reviewers, not an external IRB, which is a non-standard process for human subjects research published in academic venues."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    444       "relevance": "Peng et al. 2023 — key prior work showing 55.5% task speed-up with Copilot; this paper directly contrasts by studying new-skill contexts where productivity gains do not replicate"
    445     },
    446     {
    447       "title": "Navigating the Jagged Technological Frontier: Field Experimental Evidence of the Effects of AI on Knowledge Worker Productivity and Quality",
    448       "relevance": "Dell'Acqua et al. 2023 — BCG consultant RCT showing AI improves productivity for novices but harms performance on tasks outside AI's frontier; a central reference point throughout"
    449     },
    450     {
    451       "title": "Generative AI at Work",
    452       "relevance": "Brynjolfsson et al. 2025 — call center RCT showing AI assistance raises issue resolution by 15%, especially for novices; contrasts with this paper's null productivity finding"
    453     },
    454     {
    455       "title": "The Effects of Generative AI on High Skilled Work: Evidence from Three Field Experiments with Software Developers",
    456       "relevance": "Cui et al. 2024 — field experiment showing 26.8% productivity boost from AI code completions; this paper's null productivity result differs specifically in the new-skill context"
    457     },
    458     {
    459       "title": "Does Using Artificial Intelligence Assistance Accelerate Skill Decay and Hinder Skill Development Without Performers' Awareness?",
    460       "relevance": "Macnamara et al. 2024 — shows medical AI assistance may impair radiologists' visual skill development; directly motivates this paper's research question"
    461     },
    462     {
    463       "title": "The Impact of Generative AI on Critical Thinking: Self-Reported Reductions in Cognitive Effort and Confidence Effects from a Survey of Knowledge Workers",
    464       "relevance": "Lee et al. 2025 — survey finding AI reduces cognitive effort in knowledge workers; provides the cognitive offloading mechanism this paper operationalizes experimentally"
    465     },
    466     {
    467       "title": "GenAI as an Exoskeleton: Experimental Evidence on Knowledge Workers Using GenAI on New Skills",
    468       "relevance": "Wiles et al. 2024 — shows AI-enabled skills don't persist when AI is removed; adjacent finding motivating this paper's skill formation research question"
    469     },
    470     {
    471       "title": "Human-Generative AI Collaboration Enhances Task Performance but Undermines Human's Intrinsic Motivation",
    472       "relevance": "Wu et al. 2025 — finds AI collaboration improves immediate performance but hurts long-term motivation; cited as closely related work on skill retention"
    473     },
    474     {
    475       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    476       "relevance": "Becker et al. 2025 — finds slowdown effects for expert coders on longer tasks; cited to contextualize the null productivity finding and AI interaction time costs"
    477     },
    478     {
    479       "title": "Which Economic Tasks Are Performed with AI? Evidence from Millions of Claude Conversations",
    480       "relevance": "Handa et al. 2025 — documents real-world AI usage patterns across professional domains; provides empirical grounding for the paper's claims about AI prevalence in software engineering"
    481     }
    482   ],
    483   "engagement_factors": {
    484     "practical_relevance": {
    485       "score": 3,
    486       "justification": "Directly actionable for developers, managers, and educators deciding how to integrate AI coding tools, with specific taxonomy of which interaction patterns preserve vs. harm learning."
    487     },
    488     "surprise_contrarian": {
    489       "score": 3,
     490       "justification": "Challenges the dominant 'AI boosts productivity' narrative with two negative findings: AI neither speeds up task completion nor preserves learning, and the result comes from an Anthropic-affiliated study."
    491     },
    492     "fear_safety": {
    493       "score": 2,
    494       "justification": "Explicitly connects skill erosion to the inability to debug and oversee AI-generated code in safety-critical domains, linking to scalable oversight concerns."
    495     },
    496     "drama_conflict": {
    497       "score": 2,
    498       "justification": "An Anthropic-funded paper finding that AI assistance harms skill formation creates an inherent tension between the company's commercial interests and its safety research output."
    499     },
    500     "demo_ability": {
    501       "score": 1,
     502       "justification": "The experimental setup could theoretically be replicated, but readers cannot easily 'try' the study themselves; the interaction pattern taxonomy is accessible but requires no demonstration."
    503     },
    504     "brand_recognition": {
    505       "score": 2,
    506       "justification": "Anthropic affiliation and Alex Tamkin (known for Clio and AI usage research) provide moderate recognition in the AI research community."
    507     }
    508   },
    509   "hn_data": {
    510     "threads": [
    511       {
    512         "hn_id": "46821360",
    513         "title": "How AI impacts skill formation",
    514         "points": 236,
    515         "comments": 5,
    516         "url": "https://news.ycombinator.com/item?id=46821360",
    517         "created_at": "2026-01-30T07:06:47Z"
    518       }
    519     ],
    520     "top_points": 236,
    521     "total_points": 236,
    522     "total_comments": 5
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs