scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26814B)
      1 {
      2   "paper": {
      3     "title": "Changes in Coding Behavior and Performance Since the Introduction of LLMs",
      4     "authors": ["Yufan Zhang", "Jaromir Savelka", "Seth Copen Goldstein", "Michael Conway"],
      5     "year": 2026,
      6     "venue": "LAK'26 (The 15th International Learning Analytics and Knowledge Conference)",
      7     "arxiv_id": "2601.11835",
      8     "doi": "10.1145/3785022.3785075"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, code archive, or supplementary material link is provided anywhere in the paper. The analysis scripts for computing edit distances, correlations, and generating figures are not released."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The dataset of 2,066 student submissions is not released. No download link or data repository is mentioned. This is understandable given student privacy concerns, but the data is not publicly available."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No environment specifications, library versions, or dependency files are mentioned. The paper uses Myers algorithm (git diff default) for edit distances but does not specify the analysis environment or tools used for statistical computations."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No reproduction instructions are provided. The paper describes the metrics (edit distance via Myers algorithm, number of submissions, scores) but does not provide scripts or step-by-step instructions to replicate the analysis."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The figures use box plots which show quartiles, but the paper's main quantitative claims (correlations, percentage changes) are reported as point estimates without confidence intervals. For example, corr=-0.16 and corr=0.240 are reported without CIs."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper reports p-values for correlations: 'negative correlation between IP Score and Average Edit Distance (corr=-0.16, p<0.001)' and 'moderate, positive correlation (corr=0.240, p<0.001)' in Section 5.3."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports correlation coefficients (corr=-0.16, corr=0.240) as effect sizes and provides contextual magnitude: 'for each additional 60 lines in Average Edit Distance, there is an associated 1% decrease in one's IP Score.' Percentage changes are also given (300% increase in median Average Edit Distance, 500% increase in Total Edit Distance)."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample consists of 721 enrollments across 10 semesters, but no justification for this sample size is provided and no power analysis is discussed. The sample size is dictated by the available institutional data rather than statistical considerations."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Box plots in Figures 4-13 show median, quartiles, and outliers for all metrics across semesters. The distributions are visually displayed with spread measures (IQR visible in box plots)."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The study uses a pre-ChatGPT vs. post-ChatGPT comparison (5 semesters through Fall 2022, 5 semesters from Spring 2023 onward), which serves as the baseline comparison. Pre-ChatGPT semesters are the natural control group."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The pre/post comparison is inherent to the study design. The baseline (pre-ChatGPT semesters Fall 2020 - Fall 2022) is the most appropriate comparison for this quasi-longitudinal design."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "This is an observational study analyzing naturally occurring data, not a system with components that can be ablated."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses multiple metrics: Number of Submissions, Total Edit Distance, Average Edit Distance, Task Score, IP Score, TP Score, and Average Submission Effect (Sections 4.1, 4.2, and 5.3)."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "Human evaluation of outputs is not relevant here. The study analyzes automatically recorded student submissions and auto-graded scores. The claims are about behavioral patterns, not about the quality of outputs requiring human judgment."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a predictive modeling study. There is no model being trained or evaluated on a test set. The study analyzes the full population of student submissions."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down per semester (10 semesters shown individually in Figures 4-13), providing detailed temporal granularity. Pre-ChatGPT and Post-ChatGPT breakdowns are also provided in the correlation analyses (Figures 12-13)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses where their analysis is limited: the ceiling effect in Task Score (Section 5.2: 'nearly all students who attempted the PageRank task ultimately received a perfect Task Score'), the disappearance of the negative correlation for TP Score Post-ChatGPT (Section 5.3), and the anomalous f20 COVID semester."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports several null or counter-hypothesis results: no discernible trend in IP Score (Section 5.2), TP Scores actually increased post-ChatGPT contrary to what might be expected (Section 5.2), and the disappearance of the negative edit distance-TP score correlation post-ChatGPT (Section 5.3)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims about increased edit distances, more submissions, and decreased average score improvement are supported by the results in Sections 5.1-5.3 and Figures 4-13. The abstract appropriately hedges: 'we cannot definitively attribute them to LLM misuse' and 'suggesting that both student productivity and learning have decreased.'"
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper frequently uses causal language ('LLMs are impacting our students,' 'negatively affecting their learning outcomes,' 'has changed how students interact') while acknowledging in the Limitations section that they 'do not know definitively to what extent, or if at all, any student utilized LLMs.' The observational design cannot support causal claims. The paper hedges in some places but makes causal attributions in others, particularly in the Discussion (Section 6) and Conclusion (Section 8)."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title 'Changes in Coding Behavior and Performance Since the Introduction of LLMs' is broad, but the study examines only one task (PageRank in Scala) in one course at one university. The Limitations section acknowledges Scala-specificity but the title, abstract, and conclusion generalize to 'students,' 'junior developers,' and workforce impacts without adequate bounding. Section 8 speculates about 'junior developers' producing 'less effective commits' based on one course's data."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The Limitations section (Section 7) discusses two specific alternative explanations: (1) students sharing solutions across semesters could explain changes, and (2) the inability to confirm LLM usage. The paper also discusses the COVID outlier for f20 and the ceiling effect in Task Scores as confounds."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "The study does not use any LLMs in its methodology. It observes student behavior changes coinciding with ChatGPT's release but does not itself use or test any LLM."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The study does not use prompting. It analyzes historical student submission data."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No LLM or machine learning model is used in the study methodology. The study performs statistical analysis of observational data."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. This is an observational analysis of student submissions."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3 describes the data processing: 936 total enrollments filtered to 721 who attempted the PageRank task, source code files concatenated (Bash and Scala into single text file per submission), edit distances computed via Myers algorithm (git diff default). The selection criteria for the PageRank task (unchanged over 10 semesters) are explained."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 7 is titled 'Limitations and Future Work' and contains substantive discussion of three specific limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 7 discusses specific threats: (1) inability to confirm LLM usage and students' incentive to under-report, (2) solution sharing across semesters as an alternative explanation for behavioral changes, (3) Scala-specific findings may not generalize to other languages. These are specific to this study."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "While the Limitations section mentions Scala and one course, the paper does not explicitly state what the results do NOT show. The Conclusion (Section 8) extrapolates to workforce impacts ('junior developers') and broader educational implications without explicitly bounding the scope. The paper would benefit from a clear statement like 'these results apply only to this specific course/task/population.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The raw student submission data is not available for independent verification. No data repository or supplementary materials are provided."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3 describes the data collection in detail: submissions from 10 semesters of the Cloud Computing course at CMU, Fall 2020 - Spring 2025, auto-graded PageRank task, students submit source code as often as they like before the deadline. The course structure, grading components, and team project setup are all described."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "This is not a recruitment-based study. The data comes from institutional records of all students enrolled in a university course. The population is defined by course enrollment, not active recruitment."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 3 documents the pipeline: 936 total enrollments → 721 who attempted PageRank task (filtering criterion: made at least one submission). Section 4.1 describes how source code files are concatenated and edit distances computed. The metrics derivation is documented in Sections 4.1 and 4.2."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This research is funded in part by the Carnegie Mellon-Accenture Center of Excellence in AI-Enabled Workforce Training (ACE-AI). The Cloud Computing course at CMU is sponsored in part by Amazon Web Service, Google Cloud Platform, and Microsoft Azure.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: three authors from Carnegie Mellon University and one from Udacity. The course being studied is at CMU where three of the four authors are affiliated."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funders (Accenture via ACE-AI, and cloud providers AWS/Google/Microsoft) do not have a direct financial interest in whether LLMs are shown to help or harm student learning. The funding is for the center and course infrastructure, not contingent on particular findings about LLM effects."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper. One author is affiliated with Udacity, an online education platform, which could have interests related to findings about AI and education, but no conflicts statement addresses this."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is an observational study of student coding behavior."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "No model is being evaluated on a benchmark. This is an observational study of student submissions."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No benchmark evaluation is conducted. The study analyzes institutional data about student behavior."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This is an analysis of existing institutional records (student code submissions), not a human subjects experiment. Students were not recruited or asked to participate in a study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "This is an analysis of existing course submission records. Students were not recruited as research subjects. However, the paper does not mention IRB approval or exemption status, which might be relevant given the analysis of student data."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "This is an analysis of institutional records, not a human subjects study. The paper does note that students are 'mostly graduate students' (Figure 3) from CMU's cloud computing course, which provides some population characterization."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "Not a human subjects study. The inclusion criterion for the analysis population (students who made at least one submission for the PageRank task) is documented in Section 3."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "This is an observational study, not an experiment with randomized conditions."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "This is an observational study analyzing historical data. Blinding is not applicable."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "This is an analysis of existing records, not a human subjects study with participants who could drop out of a study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No AI inference is performed as part of this study's methodology. It is an observational analysis of student data."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "The study performs statistical analysis of student submissions. No significant compute budget is involved."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Student coding behavior has changed significantly since Fall 2022: median Average Edit Distance increased by 300%, median Total Edit Distance increased by 500%, and median Number of Submissions increased by 50%.",
    287       "evidence": "Section 5.1 and Discussion Section 6, supported by Figures 4, 5, 6 showing distributions per semester. The increases are visually apparent in the box plots starting from s23.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "There is a statistically significant negative correlation between Average Edit Distance and IP Score (corr=-0.16, p<0.001).",
    292       "evidence": "Section 5.3 and Figure 12. The correlation is computed across all enrollments with the specific magnitude quantified: 'for each additional 60 lines in Average Edit Distance, there is an associated 1% decrease in one's IP Score.'",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Average Submission Effect (score improvement per submission) has decreased steadily since s24, suggesting students make less effective changes between submissions.",
    297       "evidence": "Section 5.3, Figure 10 and Figure 11. Figure 11 shows an increasing proportion of submissions with no score change or decreased scores in recent semesters.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The negative correlation between Average Edit Distance and TP Score that existed Pre-ChatGPT disappeared Post-ChatGPT.",
    302       "evidence": "Section 5.3 and Figure 13. The pre-ChatGPT data shows a negative slope while post-ChatGPT data appears flat.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "The behavioral changes are consistent with students using LLMs to generate code, though this cannot be definitively attributed to LLM usage.",
    307       "evidence": "Sections 5.1, 6, and 7. The paper bases this hypothesis on the observation that LLMs tend to introduce unnecessary edits when prompted to modify code. The Limitations section (Section 7) explicitly acknowledges they cannot confirm LLM usage.",
    308       "supported": "weak"
    309     }
    310   ],
    311   "methodology_tags": ["observational"],
    312   "key_findings": "This quasi-longitudinal study of 721 student enrollments across 10 semesters at CMU finds that coding behavior changed significantly after ChatGPT's release: edit distances increased dramatically (300-500% in medians), while score improvement per submission decreased. There is a statistically significant but small negative correlation (r=-0.16) between average edit distance and individual project scores. However, the authors cannot confirm whether students actually used LLMs, and task completion rates remained near-perfect, suggesting a ceiling effect that limits the sensitivity of performance metrics.",
    313   "red_flags": [
    314     {
    315       "flag": "Causal language without causal design",
    316       "detail": "Despite being an observational study with no ability to confirm LLM usage, the paper uses causal framing ('LLMs are impacting our students,' 'negatively affecting their learning outcomes,' 'changed how students interact'). The temporal correlation with ChatGPT's release is treated as suggestive of causation throughout, though alternative explanations (solution sharing, cohort differences, course evolution) are only briefly acknowledged."
    317     },
    318     {
    319       "flag": "Overgeneralization from narrow data",
    320       "detail": "The study examines one task (PageRank in Scala) in one course at one university, but the title, abstract, and conclusion generalize broadly to 'students,' 'junior developers,' and workforce productivity implications. The Conclusion speculates about industry effects ('increased requirements on companies' testing and quality assurance workflows') without evidence beyond this single classroom context."
    321     },
    322     {
    323       "flag": "Small effect size presented with alarm",
    324       "detail": "The negative correlation between Average Edit Distance and IP Score (r=-0.16) explains only about 2.6% of variance in IP scores. The paper contextualizes it as 'for each additional 60 lines in Average Edit Distance, there is an associated 1% decrease in one's IP Score,' which is very small, yet the abstract and conclusion frame findings as 'raising an alarm.'"
    325     },
    326     {
    327       "flag": "No data or code released",
    328       "detail": "Neither the student submission data nor the analysis code is released, making independent verification impossible. While student privacy is a valid concern, anonymized or aggregated data could have been provided."
    329     },
    330     {
    331       "flag": "Ceiling effect undermines core metric",
    332       "detail": "The paper acknowledges that nearly all students achieved perfect scores on the PageRank task and the median IP Score is 94%, creating a ceiling effect. This severely limits the ability to detect meaningful performance differences, yet behavioral conclusions about learning degradation are still drawn."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "The Influence of Artificial Intelligence Tools on Learning Outcomes in Computer Programming: A Systematic Review and Meta-Analysis",
    338       "authors": ["Manal Alanazi", "Ben Soh", "Halima Samra", "Alice Li"],
    339       "year": 2025,
    340       "doi": "10.3390/computers14050185",
    341       "relevance": "Meta-analysis of AI tools' impact on programming learning outcomes, directly relevant to understanding LLM effects on developer education."
    342     },
    343     {
    344       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    345       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    346       "year": 2025,
    347       "arxiv_id": "2507.09089",
    348       "relevance": "Studies the impact of AI tools on experienced developer productivity, finding that perceived productivity gains may not match actual gains."
    349     },
    350     {
    351       "title": "Beyond Code Generation: An Observational Study of ChatGPT Usage in Software Engineering Practice",
    352       "authors": ["Ranim Khojah", "Mazen Mohamad", "Philipp Leitner", "Francisco Gomes de Oliveira Neto"],
    353       "year": 2024,
    354       "doi": "10.1145/3660788",
    355       "relevance": "Observational study of ChatGPT usage patterns in professional software engineering practice."
    356     },
    357     {
    358       "title": "The impact of large language models on programming education and student learning outcomes",
    359       "authors": ["Gregor Jošt", "Viktor Taneski", "Sašo Karakatič"],
    360       "year": 2024,
    361       "relevance": "Studies the relationship between LLM reliance and final grades in programming education, finding a negative correlation."
    362     },
    363     {
    364       "title": "The Widening Gap: The Benefits and Harms of Generative AI for Novice Programmers",
    365       "authors": ["James Prather", "Brent Reeves", "Juho Leinonen", "Stephen MacNeil"],
    366       "year": 2024,
    367       "arxiv_id": "2405.17739",
    368       "relevance": "Examines how generative AI affects novice programmers, exploring both benefits and harms of over-reliance."
    369     },
    370     {
    371       "title": "An Empirical Study on Usage and Perceptions of LLMs in a Software Engineering Project",
    372       "authors": ["Sanka Rasnayaka", "Guanlin Wang", "Ridwan Shariffdeen", "Ganesh Neelakanta Iyer"],
    373       "year": 2024,
    374       "doi": "10.1145/3643795.3648379",
    375       "relevance": "Empirical study finding 40.5% of undergraduate teams used LLMs in software engineering projects."
    376     },
    377     {
    378       "title": "Would ChatGPT-facilitated programming mode impact college students' programming behaviors, performances, and perceptions? An empirical study",
    379       "authors": ["Dan Sun", "Azzeddine Boudouaia", "Chengcong Zhu", "Yan Li"],
    380       "year": 2024,
    381       "relevance": "Studies how ChatGPT affects student programming behaviors, performance, and perceptions including use patterns like 'reading feedback' and 'understanding code.'"
    382     },
    383     {
    384       "title": "Does ChatGPT Help With Introductory Programming? An Experiment of Students Using ChatGPT in CS1",
    385       "authors": ["Yuankai Xue", "Hanlin Chen", "Gina R. Bai", "Robert Tairas", "Yu Huang"],
    386       "year": 2024,
    387       "doi": "10.1145/3639474.3640076",
    388       "relevance": "Controlled experiment studying student ChatGPT usage vs. traditional resources in introductory programming."
    389     },
    390     {
    391       "title": "Codehelp: Using large language models with guardrails for scalable support in programming classes",
    392       "authors": ["Mark Liffiton", "Brad E Sheese", "Jaromir Savelka", "Paul Denny"],
    393       "year": 2023,
    394       "relevance": "Proposes guardrail-based LLM tutoring for programming education, relevant to understanding beneficial LLM usage."
    395     },
    396     {
    397       "title": "A Review on Vibe Coding: Fundamentals, State-of-the-art, Challenges and Future Directions",
    398       "authors": ["Partha Pratim Ray"],
    399       "year": 2025,
    400       "doi": "10.36227/techrxiv.174681482.27435614/v1",
    401       "relevance": "Reviews vibe coding practices where natural language directives orchestrate end-to-end software creation, relevant to understanding AI-assisted coding paradigms."
    402     }
    403   ]
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs