scan.json (29369B)
1 { 2 "paper": { 3 "title": "How Beginning Programmers and Code LLMs (Mis)read Each Other", 4 "authors": [ 5 "Sydney Nguyen", 6 "Hannah McLean Babe", 7 "Yangtian Zi", 8 "Arjun Guha", 9 "Carolyn Jane Anderson", 10 "Molly Q Feldman" 11 ], 12 "year": 2024, 13 "venue": "CHI '24 (Conference on Human Factors in Computing Systems)", 14 "arxiv_id": "2401.15232", 15 "doi": "10.1145/3613904.3642706" 16 }, 17 "scan_version": 2, 18 "active_modules": [], 19 "methodology_tags": ["observational", "qualitative"], 20 "key_findings": "In a controlled study of 120 CS1-complete students across 3 institutions attempting 48 CS1-level problems, participants achieved only a 57% eventual success rate and 24% per-attempt success rate when prompting Codex. Students' most common mental model of Code LLMs was keyword-based lookup, which led to unproductive strategies. Prior programming experience (p=0.02) and non-first-generation status (p=0.04) correlated with higher pass@1 rates. Students did not observably improve at prompting during the 75-minute study, and their most common editing strategy — adding detail — was not always effective.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "The paper releases experimental data at https://doi.org/10.17605/OSF.IO/V2C4T but does not mention releasing the source code for the Charlie web application or the analysis scripts." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "Data is publicly available at https://doi.org/10.17605/OSF.IO/V2C4T, including codebooks, supplemental materials, and collected experimental data. Stated in §1 footnote and Appendix A.2." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No environment specification (requirements.txt, library versions, etc.) is provided. 
The paper mentions using R's lme4 package and Python but does not give version details or reproducible environment specs." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "While the study methodology is described in detail (§3-4), there are no step-by-step reproduction instructions (e.g., scripts to replicate analyses, README with commands). The OSF repository contains data but the paper does not reference reproduction scripts." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Main results (Tables 1, 4) report point estimates of pass@1, success rate, and eventual success rate without confidence intervals or error bars. The mixed-effects model (Table 12) reports standard errors for coefficients, but the primary outcome measures lack uncertainty quantification." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "Extensive use of statistical tests: Welch t-tests for demographic comparisons (Table 11), binomial mixed-effects model for category effects (Table 12), Kendall's tau and Pearson's r for correlations (§6.2, Tables 7-8). Significance level α=0.05 stated in §5.1." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Effect sizes are provided with context: pass@1 0.17 vs 0.24 for no-experience vs experienced students (§6.3), Kendall's tau values reported for NASA-TLX correlations (e.g., τ=-0.4 for self-rated success, §6.2), and percentage differences for success rates across categories (Table 4)." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper argues its 120-participant sample is larger than prior work and enables statistical analyses (§3.1), but no formal power analysis is reported. 
The sample size appears driven by practical considerations (40 per institution) rather than statistical requirements." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "SDs are reported for some measures (time: SD=10.6, words added: SD=11.34), but the primary outcome measures (pass@1, success rates) in Tables 1 and 4 are reported as means without standard deviations or any spread measure." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper compares against multiple prior studies: Prather et al. (19 students, Minesweeper), Kazemitabaar et al. (33 students, CodingSteps), Denny et al. (54 students, Promptly). Also includes a pilot with 19 more experienced students (68.8% eventual success rate) as an internal comparison (§4.4)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Comparison papers are all from 2023, which are contemporary to the study period (data collected March-July 2023). The model used (code-davinci-002) is compared to contemporary alternatives like gpt-3.5-turbo and CodeLlama on HumanEval (§4.2)." 81 }, 82 "ablation_study": { 83 "applies": false, 84 "answer": false, 85 "justification": "This is an observational user study, not a system with components to ablate. All participants use the same interface and model." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple metrics used: success rate, eventual success rate, pass@1 (§5.2), NASA-TLX workload ratings (Table 2), perception scales (Tables 7-8), plus qualitative analysis of interviews and strategies." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Students evaluated generated code via forced-choice questions ('Did Charlie generate correct code?', 'Would you have written this code yourself?' — §4.1). 
Additionally, two researchers performed thematic coding of interview responses and open-ended survey answers (§5.1, Appendix A.2)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Problems have both provided test cases (shown to students as feedback) and expert test suites validated with code coverage and mutation testing (§3.2). Success is calculated 'using only the provided test cases' to align feedback with evaluation, but expert tests provide additional validation." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 4 provides per-category breakdowns of pass@1 and eventual success rate across all 8 problem categories (Sorting, Dictionaries, Nested, Math, Loops, Lists, Conditionals, Strings). Institution-level breakdowns are also provided (Table 1, Figure 5)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Extensive failure analysis: §7.3 covers syntax errors (196 cases), model producing different programs from same prompt (107 cases), model producing same program despite prompt edits (104 cases). §7.1-7.2 analyze student-reported difficulties. Appendix B.2 provides detailed case studies of the hardest problems (laugh, total_bill) with all 20 student descriptions." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper is primarily about negative results: students struggle with prompting (24% success rate), don't improve over time (§8.3), develop incorrect mental models (§8.1), and the most common editing strategy (adding detail) is not reliably effective. Code LLMs are found not to be a 'panacea for non-expert programming' (§9.2)." 
116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims that beginners 'struggle with writing and editing prompts, even for problems at their skill level and when correctness is automatically determined.' This is supported by the 57% eventual success rate and 24% per-attempt success rate (§6.1), and qualitative evidence throughout §7-8." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper is carefully worded with correlational language: 'correlated with' (§6.3), 'may also be linked to' (§9.2). Stronger claims like 'Code LLMs remain inaccessible to non-experts' (§11) are supported by the evidence from the controlled study. The study design adequately supports its descriptive and correlational claims." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "§5.3 explicitly states: 'This research studies students at three selective higher education institutions in the United States... our findings may not generalize to other settings (e.g., community colleges, K-12 education) or cultural contexts.' The specific model and its limitations are also discussed (§9.5)." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "§10 (Threats to Validity) discusses specific alternatives: ChatGPT awareness during data collection, non-homogeneous programming backgrounds, potential bias from anthropomorphic system design, novelty bias, and self-selection bias. §5.3 (Positionality) discusses researcher incentives." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper carefully distinguishes what it measures (success rate, eventual success rate, pass@1) from broader claims. 
§5.2 explains why pass@1 is preferable to raw success rate for measuring prompt quality (accounts for LLM stochasticity). Claims are framed at the level of the measurements rather than extrapolated to broader constructs." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "The specific Codex model is named: 'code-davinci-002' (§4.2). StarCoder is named for pass@1 resampling (§5.2). The paper also contextualizes these models with HumanEval scores (46% and comparable to gpt-3.5-turbo at 48%)." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Figure 4 shows the exact prompt format sent to Codex (function signature + student description as docstring). Table 13 provides all 20 initial student descriptions for one problem. All prompts are available in the released dataset at OSF." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper states 'We generated output using best practices for hyperparameter and sampler settings [13]' referencing Chen et al. (2021), but does not state the actual temperature, top-p, or sampling parameters used. A 256 token limit is mentioned (§7.3.1) but other settings are not." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The system is a simple prompt-to-completion pipeline: student description → formatted as docstring → Codex API call → code returned (Figure 4)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Problem adaptation from CS1 materials is described (Appendix A.1), problem validation procedure documented (A.1.3), test case validation with mutation testing described (A.1.4), qualitative coding methodology detailed (A.2), and the pass@1 computation procedure explained (§5.2)." 
170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 10 'Threats to Validity' is a dedicated section spanning approximately one full page, discussing multiple specific threats to the study's validity." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "§10 discusses specific threats: ChatGPT release between pilot and main study affecting student awareness, heterogeneous programming backgrounds despite eligibility criteria, power dynamics between students and professors, anthropomorphic system design potentially biasing positive perceptions, and model deprecation risk." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "§5.3 explicitly states findings may not generalize to community colleges, K-12, or other cultural contexts. §9.5 discusses model-specific limitations. §9.6 notes the study captures a specific temporal moment before widespread LLM familiarity. The paper distinguishes its simplified task (automated testing) from the full real-world prompting task." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "Raw data is publicly available at https://doi.org/10.17605/OSF.IO/V2C4T, referenced in §1 footnote and Appendix A.2. Supplemental materials including codebooks are also provided." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Detailed description of the Charlie web application (§4.1, Figures 3-4), Zoom-recorded sessions with audio/video (§4.4), post-task survey and semi-structured interviews with specific questions described (§4.4), and data collection timeline (March-July 2023)." 
199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "§4.3: 'We recruited 40 participants from each institution (n=120). Eligible participants were at least 18 years old, had taken CS1 at their institution between Fall 2021 and Spring 2023, and had not completed any subsequent CS courses.' Recruitment via interest form distributed by other faculty/staff, scheduling by a researcher at another institution." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Full pipeline documented: recruitment → consent/assent → 3 tutorial problems → 8 main problems (4 untimed, 4 timed) → post-survey part 1 → semi-structured interview → post-survey part 2 → debrief (Figure 2). Qualitative coding pipeline detailed in Appendix A.2 with multiple rounds of coding towards consensus." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Acknowledgments section: 'This work is partially supported by the National Science Foundation (SES-2326173, SES-2326174, and SES-2326175).' Computing resources from Northeastern Research Computing and New England Research Cloud also acknowledged." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All author affiliations are listed: Wellesley College, Oberlin College, Northeastern University, and Roblox (for Arjun Guha). §5.3 (Positionality) notes 'Some authors also contribute to the development and evaluation of open-source Code LLMs.'" 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Funding is from NSF, which has no financial stake in whether Code LLMs are found effective or not. The study does not evaluate any product made by the funders or authors' employers." 
226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No formal competing interests statement is included. §5.3 notes some authors contribute to open-source Code LLMs and the research team has 'complex incentives,' but there is no explicit declaration of patents, equity, or other financial interests." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "This is a human interaction study examining how students prompt Code LLMs, not a benchmark evaluation of model capability. The model (Codex) is a tool in the study, not the subject of evaluation." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not a benchmark evaluation study. The research questions are about human prompting behavior, not model performance on benchmarks. However, the paper does validate that problems cannot be solved from function names alone (§3.3, A.1.3)." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Not a benchmark evaluation study. The study examines human-AI interaction, not model capability on a benchmark." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": true, 253 "answer": false, 254 "justification": "No mention of pre-registration (OSF, AsPredicted, or similar). The OSF link (https://doi.org/10.17605/OSF.IO/V2C4T) is for data sharing, not pre-registration." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": true, 258 "answer": true, 259 "justification": "§4.4 states: 'The pilot and main study received IRB approval.' Informed consent procedures are also described." 
260 }, 261 "demographics_reported": { 262 "applies": true, 263 "answer": true, 264 "justification": "Extensive demographics: gender (Table 9), race (Table 10), first-generation status, high school type, household language, international status (Table 1), programming experience, math courses, and major (§6.3). Open-ended demographic questions following best practices (§4.4, reference to Spiel et al.)." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": true, 268 "answer": true, 269 "justification": "§4.3: 'Eligible participants were at least 18 years old, had taken CS1 at their institution between Fall 2021 and Spring 2023, and had not completed any subsequent CS courses.' Recruitment continued until sample size of 120 reached." 270 }, 271 "randomization_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "§4.4: 'Participants were randomly assigned experimental lists, balanced by difficulty, using a modified Latin Square design. Four authors independently assessed the difficulty of writing prompts for each problem; we averaged these scores and developed six roughly equal lists.' Problem assignment ensures each problem gets 20 students." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "This is a single-condition within-subjects design — all participants use the same system (Charlie/Codex). There are no between-subjects treatment conditions requiring blinding." 280 }, 281 "attrition_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "The paper reports n=120 throughout, implying no participant dropout. Problem-level attrition is documented: Figure 5d shows attempts ending in success vs give-up, 340 problems where students gave up are reported (§7.3.3). Five missing interview responses out of 960 possible are noted (§4.4)." 
285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "This is a human subjects user study, not a system paper proposing a deployable method. Cost of the Codex API calls and StarCoder computation is not the focus." 292 }, 293 "compute_budget_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "This is a human subjects study. The computational cost of running the study (API calls, StarCoder resampling) is not the focus; the study evaluates human interaction, not computational efficiency." 297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "Beginning programmers who have completed CS1 struggle to prompt Code LLMs effectively, achieving only 57% eventual success rate and 24% per-attempt success rate on CS1-level problems.", 303 "evidence": "§6.1, Figure 5: 120 students across 3 institutions solved 4.7/8 problems on average, with mean eventual success rate of 57% and mean success rate of 24%. Mean pass@1 of 0.22.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Prior programming experience outside CS1 is positively correlated with prompting success (pass@1 of 0.24 vs 0.17).", 308 "evidence": "§6.3, Table 11: Welch t-test p=0.02 for students with additional coding experience vs CS1-only students.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "First-generation college students struggle more with Code LLM prompting than non-first-generation students.", 313 "evidence": "§6.3, Table 11: pass@1 of 0.17 vs 0.23 with p=0.04 (Welch t-test). 
N=23 first-generation vs 96 non-first-generation.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "Students do not observably improve at prompting Code LLMs during the 75-minute study.", 318 "evidence": "§8.3: Comparison of success rates for students attempting each problem first vs last (5 students each) shows no significant difference.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "Students' most common mental model of Code LLMs is keyword-based lookup, which is incorrect and may inhibit effective strategy development.", 323 "evidence": "§8.1, Table 5: 46 of ~96 responding students described a keyword-based model. Students with correct ChatGPT/LLM mental models had slightly higher success rates (0.27 vs 0.22, p=0.03).", 324 "supported": "moderate" 325 }, 326 { 327 "claim": "The most common prompting strategy is adding detail, with students adding an average of 9.44 words between first and last prompts.", 328 "evidence": "§8.2.1, Table 6, Figure 7: 48 students mentioned adding detail. Quantitative analysis of 282 successful multi-attempt prompts confirms mean addition of 9.44 words (SD=11.34).", 329 "supported": "strong" 330 }, 331 { 332 "claim": "Model stochasticity significantly impacts student experience: same prompt produces different code (107 cases, 4.2%), and different prompts produce same code (104 cases, 11% of edits).", 333 "evidence": "§7.3.2-7.3.3: 107 cases of identical prompt resubmission with different outputs; 104 submissions where edited prompts produced identical code, occurring in 36/48 problems and affecting 72/120 students.", 334 "supported": "strong" 335 } 336 ], 337 "red_flags": [ 338 { 339 "flag": "Multiple comparisons without correction", 340 "detail": "Table 11 reports 6 independent t-tests on demographic subgroups without Bonferroni or other multiple comparison corrections. The first-generation finding (p=0.04) would not survive correction for 6 comparisons (adjusted threshold ~0.008). This weakens the equity claim." 
341 }, 342 { 343 "flag": "Selective institution sample", 344 "detail": "All three institutions are selective US colleges/universities (an R1 university, a liberal arts college, a women's college). The authors acknowledge this in §5.3 but the findings may not generalize to the broader population of beginning programmers (community colleges, bootcamps, non-US contexts)." 345 }, 346 { 347 "flag": "No formal power analysis", 348 "detail": "The 120-participant sample size appears driven by practical constraints (40 per institution) rather than statistical power requirements. Some subgroup analyses (e.g., 23 first-generation students, 6 Black participants) may be underpowered." 349 } 350 ], 351 "cited_papers": [ 352 { 353 "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models", 354 "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"], 355 "year": 2023, 356 "doi": "10.1145/3586030", 357 "relevance": "Studies how experienced programmers interact with Copilot, identifying modes of interaction — key comparison study for expert vs novice LLM use." 358 }, 359 { 360 "title": "Evaluating Large Language Models Trained on Code", 361 "authors": ["Mark Chen", "Jerry Tworek"], 362 "year": 2021, 363 "arxiv_id": "2107.03374", 364 "relevance": "The Codex paper that introduced HumanEval and pass@1 metrics used in this study; foundational for Code LLM evaluation methodology." 365 }, 366 { 367 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 368 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 369 "year": 2023, 370 "arxiv_id": "2302.06590", 371 "relevance": "RCT studying Copilot's impact on developer productivity; relevant to the productivity paradox in AI-assisted programming." 372 }, 373 { 374 "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models", 375 "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. 
Glassman"], 376 "year": 2022, 377 "doi": "10.1145/3491101.3519665", 378 "relevance": "Earliest academic usability study of Copilot; found users enjoyed it but it didn't help them code faster or more correctly." 379 }, 380 { 381 "title": "\"It's Weird That it Knows What I Want\": Usability and Interactions with Copilot for Novice Programmers", 382 "authors": ["James Prather", "Brent N. Reeves", "Paul Denny"], 383 "year": 2023, 384 "doi": "10.1145/3617367", 385 "relevance": "Studies 19 CS1 students using Copilot for a final project; found students struggled and often mistakenly accepted incorrect code." 386 }, 387 { 388 "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming", 389 "authors": ["Majeed Kazemitabaar", "Justin Chow", "Carl Ka To Ma"], 390 "year": 2023, 391 "doi": "10.1145/3544548.3580919", 392 "relevance": "Studies 33 K-12 students with Codex access vs 36 without; found Code LLMs may benefit learning but students heavily reused expert descriptions." 393 }, 394 { 395 "title": "Promptly: Using Prompt Problems to Teach Learners How to Effectively Utilize AI Code Generators", 396 "authors": ["Paul Denny", "Juho Leinonen", "James Prather"], 397 "year": 2023, 398 "arxiv_id": "2307.16364", 399 "relevance": "Studies 54 students writing prompts for 3 CS1 problems; found similar challenges to this paper at smaller scale." 400 }, 401 { 402 "title": "StudentEval: A Benchmark of Student-Written Prompts for Large Language Models of Code", 403 "authors": ["Hannah McLean Babe", "Sydney Nguyen", "Yangtian Zi"], 404 "year": 2023, 405 "arxiv_id": "2306.04556", 406 "relevance": "Companion benchmark paper using student-written prompts from this study to evaluate Code LLMs." 
407 }, 408 { 409 "title": "StarCoder: may the source be with you!", 410 "authors": ["Raymond Li", "Loubna Ben Allal"], 411 "year": 2023, 412 "arxiv_id": "2305.06161", 413 "relevance": "Open Code LLM used in this study for pass@1 resampling; relevant to open-source model alternatives for reproducibility." 414 }, 415 { 416 "title": "GPT-4 Technical Report", 417 "authors": ["OpenAI"], 418 "year": 2023, 419 "arxiv_id": "2303.08774", 420 "relevance": "GPT-4's HumanEval performance (67%) provides context for the capability level of models used in this study." 421 }, 422 { 423 "title": "Code Llama: Open Foundation Models for Code", 424 "authors": ["Baptiste Rozière", "Jonas Gehring"], 425 "year": 2024, 426 "arxiv_id": "2308.12950", 427 "relevance": "Open Code LLM achieving comparable performance to Codex on HumanEval; supports the study's argument about model-agnostic findings." 428 }, 429 { 430 "title": "Why Johnny Can't Prompt: How Non-AI Experts Try (and Fail) to Design LLM Prompts", 431 "authors": ["J.D. Zamfirescu-Pereira", "Richmond Y. Wong", "Bjoern Hartmann", "Qian Yang"], 432 "year": 2023, 433 "doi": "10.1145/3544548.3581388", 434 "relevance": "Studies non-expert LLM prompting for non-code tasks (recipes); found similar issues with incorrect mental models and ineffective strategies." 435 }, 436 { 437 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 438 "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab"], 439 "year": 2022, 440 "doi": "10.48550/ARXIV.2206.15331", 441 "relevance": "Compares quality of Codex-generated code to student-written code; relevant to understanding Code LLM quality for educational contexts." 
442 }, 443 { 444 "title": "CodeCompose: A Large-Scale Industrial Deployment of AI-assisted Code Authoring", 445 "authors": ["Vijayaraghavan Murali", "Chandra Maddila"], 446 "year": 2023, 447 "arxiv_id": "2305.12050", 448 "relevance": "Large-scale industrial study of AI-assisted coding finding that it enhances expert programmer productivity; contrasts with beginner struggles." 449 } 450 ] 451 }