scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23410B)
      1 {
      2   "paper": {
      3     "title": "Not Everyone Wins with LLMs: Behavioral Patterns and Pedagogical Implications for AI Literacy in Programmatic Data Science",
      4     "authors": ["Qianou Ma", "Kenneth Koedinger", "Tongshuang Wu"],
      5     "year": 2025,
      6     "venue": "CHI 2026",
      7     "arxiv_id": "2509.21890",
      8     "doi": "10.1145/3772318.3791283"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["observational", "qualitative"],
     13   "key_findings": "Technical experience (programming/data science) remains a significant predictor of success even with LLM access (β=6.09, p=.041), while self-reported LLM experience and communication skills do not predict performance. Under time pressure, LLMs may temporarily close the experience gap, but given ample time, technically experienced students outperform novices. Experienced students use AI more strategically (proactive planning, clearer prompts) while novices use it reactively for debugging. Lightweight demonstrations improve prompting quality but evaluation skills remain a persistent bottleneck.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'we open-source our code: https://github.com/mqo00/dspm' in Section 1 contributions."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset release mentioned. The homework logs, surveys, and screen recordings are not made available, likely due to student privacy."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification or dependency requirements mentioned. The LLM annotation uses claude-sonnet-4-20250514 with temperature=0 (Appendix E) but no broader environment setup is provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions provided. The open-sourced code repository is mentioned but no README or reproduction guide is described in the paper."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The regression results (Table 3) report estimates and standard errors but no confidence intervals. Figures show distributions without error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Mann-Whitney U test (p=0.027) for HW0 experience gap, and linear mixed-effects regression with p-values reported (Table 3, p=.041 for technical experience)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Regression coefficients (β=6.09) with standard errors are reported in Table 3, and Cohen's kappa κ=0.71 for inter-rater reliability. These provide magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "N=36 students with no power analysis or justification. The paper acknowledges in Section 8.2 that 'it is a small sample that constrained our ability to derive conclusive subgroup analysis' but does not justify why 36 was sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Standard errors for regression coefficients are reported, but no variance across experimental conditions or runs is given. The LLM annotation was run with temperature=0, so only a single run is available."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
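     68         "justification": "HW0 includes a within-subjects baseline: half the class completed the first two tasks without AI and the other half completed the last two tasks without AI (counterbalanced), so performance with and without AI can be compared. Post-demo behaviors are compared to pre-demo as a baseline."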
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The comparisons are within the study design (with/without AI, pre/post instruction, experienced vs novice). These are appropriate contemporaneous comparisons for this study type."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is an observational classroom study, not a system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: homework grades (rubric-based), AI use frequency, step success rates, appropriate AI use ratios, and qualitative behavioral codes."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Homework was graded by two TAs with inter-rater reliability measured (κ=0.71). Screen recordings and logs were manually analyzed by the authors for codebook development."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not a prediction/classification study. No train/test split is applicable."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by experience level (experienced vs novice), by AI code type, by episode step, and by pre/post instruction (Figures 4-8)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 5 provides detailed failure case comparisons between experienced and novice behaviors. Multiple failure patterns are discussed including 'prompting rabbit holes' and repeated KeyError loops."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key negative result: LLMs do NOT close the experience gap when given ample time. Also: evaluation behaviors remained a bottleneck even after instruction, and LLM/communication experience did not predict grades."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about technical experience predicting success (supported by Table 3), behavioral differences (supported by Figures 4-6), and instruction limitations (supported by Figures 7-8) are all backed by results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper uses causal language like 'time constraints may remove the performance gap brought by prior technical experience' and 'LLMs do not fully close the experience gap.' However, the observational study design (no randomization of experience levels, confounded pre/post comparison with time-on-task) does not support causal inference. The paper partially acknowledges this in Section 8.2."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 8.2 explicitly bounds generalization: small sample (N=36), single course at one R1 university, specific tool (Colab Gemini), tasks that were 'not LLM-hard.' The paper calls for extension to 'more diverse task contexts.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 8.2 discusses alternative explanations: correlated experience dimensions (Python and data science ρ=0.52), time-on-task confound in pre/post comparison, self-report bias (Dunning-Kruger), and undisclosed external AI use."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses rubric-graded homework scores as the outcome measure but frames findings broadly in terms of 'success' and 'performance' without discussing what these grades capture vs. real-world AI-assisted data analysis competence. The gap between classroom grades and authentic AI use effectiveness is not explicitly acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The Gemini model used in Colab is explicitly stated as unknown: 'Specific model used by Google Colab in Spring 2025 is unknown, but it could be gemini-2.0-flash or gemini-1.5-flash.' The annotation model claude-sonnet-4-20250514 is specified with version."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompts for the LLM-based log annotation are provided in Appendix E, including segmentation and step-wise annotation prompts with complete instructions."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "For the LLM annotation: 'temperature of 0 is used with the model claude-sonnet-4-20250514 and max_tokens of 20000' (Appendix E)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The study observes students using Colab Gemini (a third-party tool evaluated as a black box)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.3 describes data filtering criteria: excluding students who used other AI tools, those who disengaged, log length filtering, and the resulting sample (7,315 events from 28 students). Appendix B provides detailed data source information per analysis method."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8.2 'Limitations and Future Work' provides substantive discussion of limitations spanning multiple paragraphs."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8.2 discusses specific threats: small sample (N=36), correlated experience measures, self-report bias citing Kruger & Dunning, undisclosed external AI tool use, Colab's changing interface, and the confound between instruction and time-on-task."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8.2 explicitly states boundaries: tasks were 'not LLM-hard,' single graduate course, specific tool (Colab Gemini), logging limitations compared to established tools like JELAI. The paper calls for extending to 'more complex tasks' and 'more diverse task contexts.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw log data, survey responses, and screen recordings are not made available, likely due to student privacy. Only code is open-sourced."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.2 describes data collection in detail: custom browser script intercepting Colab API requests, localStorage capture, JSON export, survey instruments at multiple time points, and screen recordings. Appendix C provides full survey questions."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.1 describes the context: graduate-level DSPM course at an R1 university with 41 students (36 completing). Section 4.1 characterizes the diverse backgrounds (finance, journalism, physics, music, CS). Recruitment is via course enrollment."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The data pipeline is documented: raw logs → episode segmentation → step-wise annotation (Section 4.3), with filtering criteria and resulting counts (16,315 total events → 7,315 annotated events from 28 students, Appendix B)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists NSF awards CNS-2213791 and 2414915, and Google Academic Research Award."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as Carnegie Mellon University. The study uses Google Colab/Gemini but the authors are not Google employees."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Google Academic Research Award is listed as a funder, and the study evaluates Google's Colab Gemini product. Google has a financial interest in Gemini being perceived positively, creating a potential conflict."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This study does not evaluate a pre-trained model's capability on a benchmark. It observes how students use an LLM tool in coursework."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not a benchmark evaluation study. The study observes human behavior with LLM tools."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not a benchmark evaluation study."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No mention of pre-registration (OSF, AsPredicted, or similar) anywhere in the paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Section 3.1: 'Our IRB-approved study was conducted in Spring 2025.'"
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Section 4.1 describes students from 'finance, telecommunications, journalism, physics, music, and computer science.' Figure 2 shows distribution of experience levels. Appendix C includes demographic questions (age, gender). Pre-survey collected backgrounds."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 4.3 describes filtering criteria for log analysis: students who used external AI tools and those who disengaged (log length below median) were excluded. Section 4.1 notes 41 started, 36 completed HW4. Appendix B details per-analysis sample sizes."
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "For HW0: 'half the class completed the first two or last two tasks without using GenAI' (Section 3.2), providing counterbalanced conditions. The paper states this was 'counterbalanced across two tasks.'"
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "No mention of blinding. Students knew whether they had AI access. Graders (TAs) are not described as blinded to student experience level or AI condition."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Section 4.1: '41 students completed HW0, with a slight attrition to 36 students by HW4.' Section 4.3 further notes filtering from 36 to 28 students for log analysis with reasons."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a classroom observational study, not proposing a computational method. Cost is irrelevant."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a classroom observational study, not a compute-intensive method."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Technical experience significantly predicts homework performance even with LLM access, while LLM experience and communication skills do not.",
    296       "evidence": "Linear mixed-effects regression (Table 3): technical β=6.09, p=.041; LLM β=-3.05, p=.266; communication β=-2.54, p=.381. N=36 students, HW1-4.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Under time pressure (HW0), LLMs close the technical experience gap, but the gap persists with ample time (HW1-4).",
    301       "evidence": "Mann-Whitney U test: significant gap without LLM (p=0.027), not significant with LLM (p>0.05) in timed HW0. Regression on untimed HW1-4 shows persistent gap (Section 5).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Experienced students use AI more strategically while novices use it reactively.",
    306       "evidence": "Figure 5: experienced students self-resolve challenges more often; when using AI, experienced students ask for explanations more (75% vs 57% for challenges), while novices' use of AI for planning is more often tied to challenges (64% vs 43%). Figure 6: experienced students have a higher AI coding success rate (90% vs 79%) and more prompt quality attributes.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Lightweight demonstrations improve prompting quality but evaluation skills remain a bottleneck.",
    311       "evidence": "Figure 8: post-demo appropriate AI use ratios for prompt quality improved ~30%. However, evaluation behaviors (ai_critique_output, explain_code, explain_error) remained below 50% post-demo. Figure 7 shows more students reach later workflow steps post-demo.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "LLM-based log annotation achieves 75% accuracy for behavior coding.",
    316       "evidence": "Section 4.3: 'An author verified 10% of the log annotations, and LLM achieved an accuracy of 75% (specifically, 81.3% accuracy for step success, 76.7% for encountered challenge, and 70.9% for AI use behaviors annotations).'",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Small sample size for quantitative claims",
    323       "detail": "N=36 students (further filtered to 28 for log analysis) is small for regression analysis with three predictors. The paper acknowledges this but still draws strong conclusions. The Mann-Whitney U test finding (p=0.027 vs p>0.05) could easily be a power issue rather than a real interaction."
    324     },
    325     {
    326       "flag": "Confounded pre/post comparison",
    327       "detail": "The pre-post instruction comparison (RQ3) is confounded with time-on-task (15-minute timed HW0 vs bi-weekly assignments), different datasets, and practice effects. The paper acknowledges this but still attributes improvements to instruction."
    328     },
    329     {
    330       "flag": "LLM-based annotation at 75% accuracy used for quantitative analysis",
    331       "detail": "The automated log annotation (claude-sonnet-4-20250514) achieved only 75% overall accuracy (70.9% for AI use behaviors), and only 10% was verified. This introduces substantial noise into the quantitative behavioral analyses in Sections 6 and 7."
    332     },
    333     {
    334       "flag": "Unknown LLM model version for the intervention tool",
    335       "detail": "The Gemini model students used is unknown ('could be gemini-2.0-flash or gemini-1.5-flash'). Different model capabilities would affect all findings about AI effectiveness."
    336     },
    337     {
    338       "flag": "Google funding + Google product evaluation",
    339       "detail": "The study is partially funded by a Google Academic Research Award and evaluates Google's Colab Gemini product. This potential conflict is not acknowledged."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    345       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    346       "year": 2025,
    347       "arxiv_id": "2507.09089",
    348       "relevance": "RCT measuring AI impact on developer productivity, finding expert performance may be hindered by AI."
    349     },
    350     {
    351       "title": "Learning agent-based modeling with LLM companions: Experiences of novices and experts using ChatGPT & NetLogo chat",
    352       "authors": ["John Chen", "Xi Lu", "Yuzhou Du"],
    353       "year": 2024,
    354       "doi": "10.1145/3613904.3642377",
    355       "relevance": "Studies novice vs expert differences in AI-assisted programming, finding experts benefit more from LLM companions."
    356     },
    357     {
    358       "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    359       "authors": ["Majeed Kazemitabaar"],
    360       "year": 2023,
    361       "doi": "10.1145/3544548.3580919",
    362       "relevance": "Studies how AI code generators affect novice programming learning outcomes."
    363     },
    364     {
    365       "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial",
    366       "authors": ["Elise Paradis"],
    367       "year": 2025,
    368       "relevance": "Enterprise RCT of AI impact on development speed with mixed evidence on productivity gains."
    369     },
    370     {
    371       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    372       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    373       "year": 2023,
    374       "relevance": "Foundational study on GitHub Copilot's productivity impact on developers."
    375     },
    376     {
    377       "title": "How Beginning Programmers and Code LLMs (Mis)read Each Other",
    378       "authors": ["Sydney Nguyen"],
    379       "year": 2024,
    380       "doi": "10.1145/3613904.3642706",
    381       "relevance": "Studies how novice programmers interact with and misunderstand LLM-generated code."
    382     },
    383     {
    384       "title": "What should we engineer in prompts? Training humans in requirement-driven LLM use",
    385       "authors": ["Qianou Ma"],
    386       "year": 2025,
    387       "doi": "10.1145/3731756",
    388       "relevance": "Studies prompt engineering education and requirement specification for LLM interactions."
    389     },
    390     {
    391       "title": "How humans communicate programming tasks in natural language and implications for end-user programming with LLMs",
    392       "authors": ["Madison Pickering"],
    393       "year": 2025,
    394       "doi": "10.1145/3706598.3713271",
    395       "relevance": "Studies how humans express programming tasks to LLMs and whether prior experience matters."
    396     },
    397     {
    398       "title": "LLMs are imperfect, then what? An empirical study on LLM failures in software engineering",
    399       "authors": ["Jiessie Tie"],
    400       "year": 2024,
    401       "arxiv_id": "2411.09916",
    402       "relevance": "Documents LLM failure patterns in software engineering including 'prompting rabbit holes.'"
    403     },
    404     {
    405       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    406       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    407       "year": 2024,
    408       "relevance": "Studies Copilot's impact on collaborative open-source development with mixed findings."
    409     }
    410   ]
    411 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs