scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26749B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Beginning Programmers and Code LLMs (Mis)read Each Other",
      6     "authors": [
      7       "Sydney Nguyen",
      8       "Hannah McLean Babe",
      9       "Yangtian Zi",
     10       "Arjun Guha",
     11       "Carolyn Jane Anderson",
     12       "Molly Q Feldman"
     13     ],
     14     "year": 2024,
     15     "venue": "International Conference on Human Factors in Computing Systems",
     16     "arxiv_id": "2401.15232",
     17     "doi": "10.1145/3613904.3642706"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims are supported: the study is large-scale (120 students, 3 institutions, 48 problems), and the finding that beginners struggle with prompts is documented by a 24% success rate and a 57% eventual success rate, together with supporting qualitative evidence.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper consistently uses correlational language ('correlated with success,' 'may also be linked to') rather than causal claims; quasi-causal claims about mental models affecting strategies are appropriately hedged.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The positionality section explicitly states 'our findings may not generalize to other settings (e.g., community colleges, K-12 education) or cultural contexts,' bounding generalization to selective U.S. higher education.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 10 discusses multiple alternative explanations, including ChatGPT's release between the pilot and main experiment as a possible explanation for the late-study improvement, and the Charlie mascot's anthropomorphic qualities potentially biasing perception reports.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 5.2 carefully distinguishes success rate, eventual success rate, and pass@1, explicitly explaining why pass@1 is preferred as it accounts for LLM non-determinism.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 10 'THREATS TO VALIDITY' is a dedicated limitations section.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats named include: ChatGPT release causing statistically significant performance improvement in late participants; non-homogeneous programming backgrounds; power dynamics from professor involvement; Charlie mascot anthropomorphism effects; novelty bias; self-selection bias.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The positionality section explicitly states scope is limited to three selective U.S. higher education institutions and results may not extend to community colleges, K-12, or other cultural contexts.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments disclose NSF funding (SES-2326173, SES-2326174, SES-2326175) and computing resources from Northeastern Research Computing and New England Research Cloud.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All author affiliations are listed in the header; the positionality section additionally discloses that some authors contribute to open-source Code LLM development and evaluation.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "NSF is the sole funder and is independent of Code LLM commercial outcomes; no Code LLM company funded the research.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No formal competing interests or financial interests declaration appears; the positionality section mentions LLM contributions but does not declare patents, equity, or consulting relationships.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.2 formally defines 'success rate,' 'eventual success rate,' and 'pass@1'; the introduction defines the three-step text-to-code process; 'near-novice' population is precisely characterized as CS1 completers with no subsequent CS courses.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The introduction explicitly states the paper differentiates in 'three key ways: scale, population, and experimental design,' and articulates three research questions (RQ1–RQ3) with expected contributions.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 provides extensive comparison with Prather et al., Kazemitabaar et al., Denny et al., and Vaithilingam et al., explicitly contrasting scale, experimental design choices, and what questions each prior study could not address.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "The Charlie platform source code is not released; only study data is available at the OSF repository. No code repository is mentioned.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Study data is publicly available: 'Data collected as part of this work is publicly available at https://doi.org/10.17605/OSF.IO/V2C4T' (footnote, page 2).",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, requirements files, or Dockerfiles are provided for the Charlie platform or analysis code.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step computational reproduction instructions are provided; Appendix A.2 describes qualitative coding methodology but not analysis reproduction.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Main results (pass@1 rates by group, eventual success rates) are reported as means without CIs; Table 12 reports regression coefficients with standard errors but primary descriptive outcomes lack CIs or error bars.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Welch t-tests, Kendall's τ correlations, ANOVAs with Tukey HSD post-hoc tests, and binomial mixed-effects models are used with α=0.05 significance threshold (Section 5.1).",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect sizes are reported as Kendall's τ and Pearson's r correlation coefficients throughout, and as raw mean differences in pass@1 (e.g., 0.17 vs. 0.24 for programming experience groups).",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Sample size (n=120) is justified only by comparison to prior work's smaller samples; no formal power analysis is conducted.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "SD is reported for timing (SD=10.6 min) and word changes (SD=11.34), but primary comparative outcomes (pass@1 rates by group) are reported as means only without variance measures.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "A function-signature-only pass@1 baseline (mean 0.0519) validates that problems require descriptions, and expert-authored prompts establish an upper bound for each problem's solvability.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": false,
    188           "answer": false,
    189           "justification": "This is a human user study, not a system benchmarking comparison; contemporary system baselines are not applicable.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is a human user study evaluating behavior, not a system with ablatable components.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple metrics are used: success rate, eventual success rate, pass@1, NASA-TLX workload scales, perception/trust surveys, and qualitative thematic codes.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "The entire study is human evaluation of Code LLM interactions; 120 participants' prompting behaviors are analyzed across sessions.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Not a prediction or ML training task; unit tests are fixed per problem to evaluate Code LLM output correctness, not a held-out ML evaluation set.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Table 4 reports pass@1 and eventual success rates for all 8 problem categories (Conditionals, Dictionaries, Lists, Loops, Math, Nested, Sorting, Strings) with student difficulty rankings.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 7 extensively discusses failure modes: syntax errors (§7.3.1), stochastic outputs producing different programs (§7.3.2), and repeated same output despite prompt edits (§7.3.3, affecting 72/120 students). Appendix B.2.2 manually analyzes all 20 initial descriptions for the least-solved problem.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Section 8.3 reports students do not observably improve at prompting within the study; §6.3 reports no significant differences for math courses, international status, household language, major type, or high school type.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "code-davinci-002 is specified as the study model throughout; StarCoder is named for pass@1 computation; HumanEval scores for both models are cited for capability comparison.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Figure 4 shows the exact prompt format: function signature + participant description formatted as a docstring sent via API, with the structure fully illustrated.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "'We generated output using best practices for hyperparameter and sampler settings [13]' cites Chen et al. but does not report actual temperature, top-p, or sampling parameters.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used; the system makes direct single-turn API calls to Codex.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Pass@1 computation using StarCoder (200 samples per prompt) is described in §5.2; qualitative transcript preprocessing (grammar correction, filler removal, anonymization) is described in §5.1; full codebook development is in Appendix A.2.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "Raw study data is publicly available at https://doi.org/10.17605/OSF.IO/V2C4T (footnote, page 2).",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 4 describes the Charlie interface, Zoom-based format, audio/video recording, experimental sequence, timing, compensation, and the semi-structured interview process in detail.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Recruitment is described: participants recruited March–July 2023 via interest forms distributed by faculty/staff (not the participant's own professor), with specific eligibility criteria and IRB approval (§4.3).",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline is documented: participant description → Codex API → test execution → pass/fail feedback; StarCoder for pass@1 post-hoc; qualitative coding process with codebook development described in Appendix A.2.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "The training data cutoff for code-davinci-002 is never stated; model performance is discussed but training data provenance is not.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper indirectly mitigates memorization by renaming functions with high signature-only pass@1, but does not formally discuss whether CS1 problems appear in Codex's training data.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No formal contamination analysis is performed; function renaming addresses name-level memorization but not general contamination of CS1-style programming problems in training data.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned anywhere in the paper.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": true,
    323           "justification": "'The pilot and main study received IRB approval.' (§4.3).",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Demographics are reported in Tables 1, 9, and 10: gender, race, first-generation status, household language, high school type, prior programming experience, and major.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Eligibility criteria stated: at least 18 years old, completed CS1 at their institution between Fall 2021 and Spring 2023, had not completed subsequent CS courses (§4.3).",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "'Participants were randomly assigned experimental lists, balanced by difficulty, using a modified Latin Square design.' (§4.4).",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "No blinding procedure is described; participants are aware they are interacting with an AI system, and researcher blinding is not mentioned. Automated testing removes subjectivity from the primary outcome but this is not framed as blinding.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No formal attrition report is provided; '5 missing question responses across the possible 960 interview datapoints' is noted incidentally but not as a formal attrition analysis.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "The paper notes sampling 200 Codex generations per prompt 'would be very expensive' without reporting actual API costs or inference latency for the study.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Computing resources from Northeastern Research Computing and New England Research Cloud are acknowledged but no specific compute budget or GPU hours are stated.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "CS1 completers achieve only 57% eventual success rate and 24% overall success rate when prompting Code LLMs for tasks at their skill level.",
    376       "evidence": "Figure 5a reports mean eventual success rate 57% and success rate 24% across 120 participants and 48 problems with automated correctness feedback.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Prior programming experience outside CS1 significantly predicts higher pass@1 rates (0.24 vs. 0.17, p=0.02).",
    381       "evidence": "Section 6.3 and Table 11 report Welch t-test result comparing students with additional experience vs. CS1-only (n≈40 vs. 80).",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "First-generation college students have reliably lower pass@1 rates than non-first-generation students (p=0.04).",
    386       "evidence": "Section 6.3 and Table 11: 23 first-gen participants averaged 0.17 vs. 0.23 for non-first-gen.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Students do not observably improve at prompt writing within the 75-minute study.",
    391       "evidence": "Section 8.3 compares first vs. last students on each problem (n=5 per cell) and finds no significant difference in success rates.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "The most common student mental model is a keyword-based dictionary lookup, which is incorrect and cannot explain LLM stochasticity.",
    396       "evidence": "Table 5 reports 46 students described keyword-based models; Section 8.1 explains why this model fails to account for observed LLM behavior.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "The Code LLM repeatedly generates the same incorrect code despite prompt edits in 11% of submissions, affecting 72/120 students.",
    401       "evidence": "Section 7.3.3 reports 104 submissions where the model repeated the same code despite changes, occurring in 36/48 problems and encountered by 72 students.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Students who correctly identified Charlie as an LLM had slightly higher pass@1 rates (0.27 vs. 0.22, p=0.03).",
    406       "evidence": "Section 8.1 reports this for only 19 students with the correct mental model; small sample warrants caution.",
    407       "supported": "weak"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "observational",
    412     "qualitative"
    413   ],
    414   "key_findings": "Beginning programmers who completed a single CS course struggle significantly with Code LLM prompting, achieving only 57% eventual success and 24% overall success on CS1-level tasks—even with automated correctness feedback removing the challenge of evaluating output. Students' dominant mental model (keyword-based dictionary lookup) is incorrect and drives ineffective strategies: primarily adding detail to failed prompts rather than fundamental restructuring. Prior programming experience predicts higher success, and first-generation college students perform significantly worse, raising equity concerns about Code LLM adoption. Students do not observably improve within the study, suggesting prompt engineering is a non-obvious skill requiring explicit instruction.",
    415   "red_flags": [
    416     {
    417       "flag": "Cross-model pass@1 substitution",
    418       "detail": "Pass@1 is computed using StarCoder rather than Codex (the study model) due to API cost. This introduces model-specific bias into the primary metric, as StarCoder may fail on prompts that Codex succeeds on, potentially underestimating student success."
    419     },
    420     {
    421       "flag": "ChatGPT mid-study confound",
    422       "detail": "ChatGPT launched between pilot and main experiment; the paper reports a statistically significant performance improvement for students who participated in the final month, indicating a temporal confound that makes the dataset non-uniform."
    423     },
    424     {
    425       "flag": "No pre-registration",
    426       "detail": "The study was not pre-registered, and the paper conducts many subgroup analyses (gender, race, language, major, first-gen status, math background, institution), raising the possibility that hypotheses were specified post hoc and that some significant results are artifacts of multiple comparisons."
    427     },
    428     {
    429       "flag": "Underpowered subgroup comparisons",
    430       "detail": "The key improvement analysis (§8.3) compares only 5 students per cell (first vs. last attempt on each problem). The first-generation finding also rests on a small subgroup (n=23) and a marginal p-value (p=0.04), so it should be treated as tentative."
    431     },
    432     {
    433       "flag": "No hyperparameters reported",
    434       "detail": "Temperature, top-p, and other Codex sampling parameters are not disclosed; only a citation to Chen et al. best practices is given, making exact replication of LLM outputs impossible."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    440       "relevance": "Key comparison for experienced programmer interaction modes; directly contrasted with beginner behavior throughout"
    441     },
    442     {
    443       "title": "Studying the effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    444       "relevance": "Most closely related prior work; compared in detail for scale, study design differences, and findings about novice LLM interaction"
    445     },
    446     {
    447       "title": "Promptly: Using Prompt Problems to Teach Learners How to Effectively Utilize AI Code Generators",
    448       "relevance": "Direct comparison study at smaller scale (54 students, 3 problems); used to benchmark contributions of this paper's larger scale"
    449     },
    450     {
    451       "title": "'It's Weird That it Knows What I Want': Usability and Interactions with Copilot for Novice Programmers",
    452       "relevance": "Related qualitative study of novice Copilot use; compared for experimental design and findings on student behaviors"
    453     },
    454     {
    455       "title": "Evaluating Large Language Models Trained on Code",
    456       "relevance": "Source of pass@1 methodology used as primary evaluation metric throughout this paper"
    457     },
    458     {
    459       "title": "StarCoder: may the source be with you!",
    460       "relevance": "Open model used for pass@1 computation as cost-effective alternative to Codex API"
    461     },
    462     {
    463       "title": "StudentEval: A Benchmark of Student-Written Prompts for Large Language Models of Code",
    464       "relevance": "Companion paper by overlapping authors using student-written prompts from this study to evaluate Code LLMs"
    465     },
    466     {
    467       "title": "Why Johnny Can't Prompt: How Non-AI Experts Try (and Fail) to Design LLM Prompts",
    468       "relevance": "Related study of non-expert prompting for non-code tasks; comparison for mental model and strategy findings"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "Directly informs CS educators and Code LLM product teams on whether AI coding tools democratize programming for non-experts, with equity implications for first-generation students"
    475     },
    476     "surprise_contrarian": {
    477       "score": 2,
    478       "justification": "Empirically challenges 'end of programming' and democratization narratives; the first-generation student performance gap is a notable unexpected finding"
    479     },
    480     "fear_safety": {
    481       "score": 1,
    482       "justification": "Raises equity concerns about Code LLMs potentially widening the digital divide, though framed constructively rather than alarmingly"
    483     },
    484     "drama_conflict": {
    485       "score": 1,
    486       "justification": "Mild tension with optimistic industry claims about AI democratizing programming, but paper is measured and academic in tone"
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Data available on OSF; the Charlie interface exists but is not publicly deployed for external replication"
    491     },
    492     "brand_recognition": {
    493       "score": 1,
    494       "justification": "No top-tier lab affiliation; Northeastern/Wellesley/Oberlin are respected but not prominent brands. OpenAI Codex involvement adds marginal recognition."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [],
    499     "top_points": 0,
    500     "total_points": 0,
    501     "total_comments": 0
    502   }
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs