scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24672B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "Not Everyone Wins with LLMs: Behavioral Patterns and Pedagogical Implications for AI Literacy in Programmatic Data Science",
      6     "authors": ["Qianou Ma", "Kenneth Koedinger", "Tongshuang Wu"],
      7     "year": 2026,
      8     "venue": "CHI 2026",
      9     "doi": "10.1145/3772318.3791283",
     10     "arxiv_id": "2509.21890"
     11   },
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper states 'we open-source our code: https://github.com/mqo00/dspm' in Section 1."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper does not mention releasing the student log data, survey responses, or homework grades. The GitHub link is for code only. The datasets used in homework (car and game datasets) are not explicitly released."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned. The paper mentions Google Colab and Python but provides no dependency details."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. The study involves a specific classroom setting that cannot be trivially reproduced, and no instructions for replicating the analysis pipeline are given."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The regression results in Table 3 report estimates and standard errors but no confidence intervals. No error bars are mentioned on figures. The Mann-Whitney U test reports only p-values."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper uses Mann-Whitney U tests (Section 5, p=0.027) and linear mixed-effects regression with p-values (Table 3, p=.041 for technical experience). Cohen's kappa is used for inter-rater reliability (κ=0.71)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The regression coefficient β=6.09 is reported in Table 3 with standard errors, providing effect size context. Cohen's kappa (κ=0.71) is also reported, but it measures inter-rater agreement rather than the size of a study effect."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The sample size of 36 students is not justified with a power analysis. The paper acknowledges in Section 8.2 that 'it is a small sample that constrained our ability to derive conclusive subgroup analysis' but does not justify why 36 was sufficient."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Standard errors are reported for regression coefficients in Table 3, but no variance or standard deviation is reported across experimental conditions or student performance distributions."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "HW0 includes a within-subjects baseline: half the class completed tasks without AI and half with AI (counterbalanced), enabling comparison of AI vs. no-AI performance (Section 3.2)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The baseline is the same students working without AI, which is a direct and appropriate comparison for the research question. The paper also references contemporary prior work."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "This is an observational study of student behaviors, not a system with components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper uses multiple metrics: homework grades (rubric-based), AI usage frequency, step success rate, appropriate AI use ratio, and qualitative behavioral codes across multiple dimensions."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Homework submissions were graded by two TAs with inter-rater reliability assessment (κ=0.71). An author manually verified 10% of LLM annotations achieving 75% accuracy (Section 4.3)."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "This is not a benchmark evaluation; it's a classroom study. There is no train/test split applicable."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by experience level (experienced vs. novice), by AI usage code type (improve, code, explain, evaluate, prompt), by episode step (intent, input, understand, assess), and pre/post instruction (Figures 4-8)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Table 5 provides detailed comparison of failure behaviors between novices and experienced students. Section 7 discusses specific failure patterns like repeated KeyError loops and early stops."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that LLM and communication experience did NOT predict grades (Table 3, p=.266 and p=.381). It also reports that evaluation behaviors remained a bottleneck even after instruction (Section 7)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims are supported: technical experience predicts success (Table 3), students vary in AI leverage across stages (Figures 4-6), and success/failure behaviors are identified (Table 2, Table 5). The claim about lightweight demonstrations being insufficient is supported by Section 7 analysis."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper makes causal-adjacent claims such as 'LLMs may offset technical experience's advantages' and 'technical experience significantly predicted higher grades.' Experience is a self-reported, observed attribute rather than a randomized condition (only AI access in HW0 was counterbalanced), so the design cannot support causal claims about experience. Confounds between experience types are acknowledged but not resolved."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 8.2 explicitly bounds generalization: 'small sample,' 'authentic classroom contexts,' and notes the tasks 'were not LLM-hard.' The paper calls for extending to 'more diverse task contexts' and notes the specific tool (Colab Gemini) limits generalizability."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 8.2 discusses multiple alternative explanations: time-on-task vs. instruction effects, correlated experience dimensions (Python and data science, ρ=0.52), potential external AI use, and Dunning-Kruger effects in self-reported experience."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper uses rubric-based homework grades as a measure of performance and discusses limitations of self-reported experience measures. It acknowledges that self-reported LLM experience 'may not straightforwardly transfer to complex programming data science tasks' (Section 5)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper states 'Specific model used by Google Colab in Spring 2025 is unknown, but it could be gemini-2.0-flash or gemini-1.5-flash' (footnote 1). For log annotation, it specifies 'claude-sonnet-4-20250514' which is a versioned model. However, the primary experimental model (Gemini) is unspecified."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Appendix E provides the full prompts used for LLM-assisted log annotation, including segmentation and step-wise annotation prompts with complete text."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4.3 and Appendix E report 'temperature=0 for deterministic generations' and 'max_tokens of 20000' for the log annotation model."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The study examines students using Colab's built-in Gemini assistant, which is a third-party tool evaluated as a black box."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.3 documents data filtering: 'filtered out students who self-reported other tool use like ChatGPT,' filtered by log length relative to median, and describes the annotation pipeline with segmentation and step-wise coding. The filtering from 41 to 36 students is explained (Section 4.1), and further filtering to 28 students for log analysis is documented."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8.2 'Limitations and Future Work' provides substantive discussion across two subsections: 'Going deeper' and 'Going broader.'"
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 8.2 discusses study-specific threats: small sample (36 students), correlated experience dimensions preventing independent analysis, self-report bias (citing Dunning-Kruger), undisclosed external AI use, and reliance on a custom logging script that 'can be less robust than existing Jupyter notebook logging toolkits.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 8.2 explicitly states the tasks 'were not LLM-hard,' the study is limited to Google Colab with Gemini, and calls for extension to 'more diverse task contexts.' It notes 'we did not experimentally isolate the effect for time-on-task.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Raw log data, survey responses, grades, and screen recordings are not made available. Only analysis code is open-sourced."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4.2 provides detailed description of all data sources: notebooks/grades, behavioral logs (custom browser script intercepting Colab endpoints), surveys (pre/post/final with questions in Appendix C), and screen recordings. Total: 16,315 logged events and 44 hours of recordings."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 3.1 describes participants as students in a specific graduate-level course (DSPM) at an R1 university (CMU). The course's diverse cohort is described, including product managers, engineers, and non-technical graduates. This is a convenience sample from an existing class."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: raw logs → segmentation into episodes → step-wise annotation → validation (10% manual check, 75% accuracy). Filtering steps are described with counts: 41 students → 36 after attrition, further to 28 after quality filtering. 7,315 events annotated from the filtered set."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The acknowledgments section discloses funding: 'National Science Foundation (award CNS-2213791, 2414915) and Google Academic Research Award.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors are disclosed as Carnegie Mellon University affiliates. The study uses Google Colab with Gemini, and funding includes a Google Academic Research Award, but the paper does not evaluate Google's product favorably — it actually documents many limitations of the Gemini assistant."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Google Academic Research Award is a funder, and the study uses Google's Colab with Gemini assistant. Google has a financial interest in positive perceptions of its AI tools, though the paper's findings are not particularly favorable to the tool."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It studies human behavior when using an LLM tool."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmark evaluation of model capability is performed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation of model capability is performed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No mention of pre-registration (OSF, AsPredicted, or other registry) is found in the paper."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section 3.1 states 'Our IRB-approved study was conducted in Spring 2025.'"
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Section 4.1 reports students' fields (finance, telecommunications, journalism, physics, music, CS), experience distributions (Figure 2), and the pre-survey collected demographics (age, gender per Appendix C). Self-reported experience levels across technical, LLM, and communication dimensions are detailed."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Section 4.3 describes filtering criteria for log analysis: excluded students who self-reported external AI tool use, those showing external tool use in recordings, and those with below-median log length. The broader study included all 36 course completers from the initial 41."
    261       },
    262       "randomization_described": {
    263         "applies": true,
    264         "answer": true,
    265         "justification": "Section 3.2 describes HW0 randomization: 'half the class completed the first two or last two tasks without using GenAI,' providing a counterbalanced within-subjects design."
    266       },
    267       "blinding_described": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No blinding is described. Students knew whether they were in the AI or no-AI condition for HW0. TAs grading homework are not described as blinded to student experience levels or AI usage."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Section 4.1 reports attrition: '41 students completed HW0, with a slight attrition to 36 students by HW4.' For log analysis, further filtering to 28 students is documented with reasons (Section 4.3)."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a classroom study, not a system or method paper. Cost reporting is not applicable."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "This is a classroom study, not a system or method paper. Compute budget is not applicable."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Technical experience significantly predicts homework performance with LLM access, while LLM experience and communication experience do not.",
    294       "evidence": "Linear mixed-effects regression (Table 3): technical experience β=6.09, p=.041; LLM experience β=-3.05, p=.266; communication experience β=-2.54, p=.381. N=36 students across HW1-4.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Under time pressure (HW0), LLMs close the performance gap between technically experienced and novice students.",
    299       "evidence": "Mann-Whitney U test: significant gap without LLMs (p=0.027), gap disappears with LLMs (p>0.05). Section 5.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Experienced students use AI more strategically, asking for explanations when facing challenges (75% vs 57%) and proactively requesting plans, while novices use AI more reactively.",
    304       "evidence": "Figure 5B shows distribution of AI usage by experience level and challenge encounter status, based on annotated logs from 28 students (Section 6).",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "Experienced students generate code with AI more successfully (90% vs 79% step success rate) and write prompts with more quality attributes.",
    309       "evidence": "Figure 6A shows step success rates by experience level; Figure 6B shows count of prompt quality attributes by experience and grade. Based on annotated log analysis.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Lightweight demonstration improves surface AI fluency (prompt quality +30% appropriate use ratio) but evaluation behaviors remain a bottleneck (<50% post-demo).",
    314       "evidence": "Figure 8 shows pre-to-post appropriate AI use ratios. Figure 7 shows shift in last successful step distribution. Section 7. Note: confounded with extended time-on-task.",
    315       "supported": "weak"
    316     }
    317   ],
    318   "methodology_tags": ["observational", "qualitative"],
    319   "key_findings": "In a graduate course with 36 students using Google Colab's Gemini assistant for data analysis, technical experience (not LLM familiarity or communication skills) remained the strongest predictor of homework performance. Under time pressure, LLMs appeared to close the experience gap, but with ample time the gap persisted. Experienced students used AI more strategically (clearer prompts, proactive planning), while novices relied on AI reactively. Lightweight demonstration improved prompting behaviors but evaluative skills (critiquing outputs, explaining errors) remained consistently underutilized.",
    320   "red_flags": [
    321     {
    322       "flag": "Small sample size",
    323       "detail": "N=36 students (further filtered to 28 for log analysis) is small for the quantitative claims being made. Subgroup comparisons (experienced vs. novice) have very limited power, and the regression with 3 predictors on 36 participants is borderline."
    324     },
    325     {
    326       "flag": "Self-reported experience measures",
    327       "detail": "Experience levels are operationalized via self-report survey items, which the paper itself acknowledges are subject to miscalibration (citing Dunning-Kruger, ref 31). The median split into 'experienced' vs 'novice' is a crude categorization."
    328     },
    329     {
    330       "flag": "Time-on-task confound",
    331       "detail": "The pre-post comparison (HW0 vs HW1-2) confounds instruction effects with extended time-on-task. The paper acknowledges this ('any improvement pre- to post-demo is a result of both extended time-on-task and the instruction') but still draws conclusions about teachability."
    332     },
    333     {
    334       "flag": "LLM annotation accuracy",
    335       "detail": "The automated log annotation achieved only 75% accuracy overall (70.9% for AI use behaviors), meaning about 1 in 4 behavioral annotations may be incorrect. Quantitative analyses built on these annotations inherit this error."
    336     },
    337     {
    338       "flag": "Google funding with Google tool evaluation",
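    339       "detail": "The study received a Google Academic Research Award while using Google Colab's Gemini assistant as the primary AI tool. The funding is disclosed in the acknowledgments and the findings are not particularly favorable to Google, but the paper includes no competing-interests statement addressing this potential conflict."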
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    345       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    346       "year": 2025,
    347       "arxiv_id": "2507.09089",
    348       "relevance": "RCT measuring AI impact on developer productivity, finding experts' performance may be hindered by AI."
    349     },
    350     {
    351       "title": "Learning agent-based modeling with LLM companions: Experiences of novices and experts using ChatGPT & NetLogo chat",
    352       "authors": ["John Chen", "Xi Lu", "Yuzhou Du"],
    353       "year": 2024,
    354       "doi": "10.1145/3613904.3642377",
    355       "relevance": "Studies novice vs expert differences when collaborating with AI for programming, finding experts benefit more."
    356     },
    357     {
    358       "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    359       "authors": ["Majeed Kazemitabaar"],
    360       "year": 2023,
    361       "doi": "10.1145/3544548.3580919",
    362       "relevance": "Studies effect of AI code generators on novice programmers, finding prior experience significantly affects benefit."
    363     },
    364     {
    365       "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial",
    366       "authors": ["Elise Paradis", "Kate Grey"],
    367       "year": 2025,
    368       "relevance": "Enterprise RCT measuring AI impact on development speed with mixed results on productivity."
    369     },
    370     {
    371       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    372       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    373       "year": 2023,
    374       "relevance": "GitHub Copilot productivity study finding less experienced developers benefit more from AI assistants."
    375     },
    376     {
    377       "title": "How Beginning Programmers and Code LLMs (Mis)read Each Other",
    378       "authors": ["Sydney Nguyen", "Hannah Mclean Babe"],
    379       "year": 2024,
    380       "doi": "10.1145/3613904.3642706",
    381       "relevance": "Studies how beginners interact with code LLMs, finding they do not learn from code feedback and self-experimentation."
    382     },
    383     {
    384       "title": "What should we engineer in prompts? Training humans in requirement-driven LLM use",
    385       "authors": ["Qianou Ma", "Weirui Peng"],
    386       "year": 2025,
    387       "doi": "10.1145/3731756",
    388       "relevance": "Studies teaching requirement specification for LLM use, relevant to AI literacy and prompt engineering education."
    389     },
    390     {
    391       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    392       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    393       "year": 2024,
    394       "relevance": "Studies AI impact on collaborative open-source development, reporting conflicting effects on efficiency."
    395     },
    396     {
    397       "title": "How humans communicate programming tasks in natural language and implications for end-user programming with LLMs",
    398       "authors": ["Madison Pickering", "Helena Williams"],
    399       "year": 2025,
    400       "doi": "10.1145/3706598.3713271",
    401       "relevance": "Studies how humans express programming tasks to LLMs, finding no significance of prior programming experience in some settings."
    402     },
    403     {
    404       "title": "LLMs are imperfect, then what? An empirical study on LLM failures in software engineering",
    405       "authors": ["Jiessie Tie", "Bingsheng Yao"],
    406       "year": 2024,
    407       "arxiv_id": "2411.09916",
    408       "relevance": "Empirical study of LLM failure patterns in software engineering including 'prompting rabbit holes.'"
    409     }
    410   ]
    411 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs