scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24759B)
      1 {
      2   "paper": {
      3     "title": "Teaching Programming in the Age of Generative AI: Insights from Literature, Pedagogical Proposals, and Student Perspectives",
      4     "authors": [
      5       "Clemente Rubio-Manzano",
      6       "Jazna Meza",
      7       "Rodolfo Fernández-Santibáñez",
      8       "Christian Vidal-Castro"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2507.00108",
     13     "doi": "10.48550/arXiv.2507.00108"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No code, tools, or repository links are provided anywhere in the paper. The proposed visual simulation approach is described conceptually but no implementation is released."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Under 'Availability of data and material' the paper states 'Contact the corresponding author for data requests.' While Table I contains individual survey responses, no formal dataset is released. The data underlying the word cloud and emotion analysis is not provided."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper uses computational tools for generating the word cloud (Figure 11) and emotion analysis (Figure 12) but does not specify what software, libraries, or environment was used for these analyses."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No reproduction instructions are provided for either the survey methodology or the computational analyses (word cloud, emotion detection)."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Survey results are reported as simple percentages (e.g., '86.1% indicated that code visualization was useful') with no confidence intervals or error bars, despite the small sample size (N=36) where uncertainty would be substantial."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No statistical significance tests are performed on any of the survey results. Claims about usefulness are based on raw percentages alone."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No effect sizes are reported. The paper reports only raw percentages from the survey without any formal effect size measures."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The sample size of 36 students is not justified. No power analysis is mentioned, and no acknowledgment is made that N=36 may be insufficient for drawing reliable conclusions."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "This is a single cross-sectional survey, not a repeated experiment. There are no multiple runs or trials for which variance would be relevant."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No comparison is made against alternative teaching approaches, prior surveys of student perceptions, or a control group that did not receive the visual simulation intervention."
     70       },
     71       "baselines_contemporary": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experimental baselines are included, so the question of whether baselines are contemporary is inapplicable."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "The pedagogical proposal is a single intervention (visual program simulation), not a multi-component system where components could be ablated."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The primary evaluation metric is a single yes/no question about perceived usefulness. While grades and free-text comments are collected, they are not used as formal evaluation metrics for the intervention's effectiveness."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "The paper does not produce system outputs that require human evaluation. The study itself IS a human survey, but this criterion asks about evaluating system outputs."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No test set is involved in this study. This is a survey-based pedagogical proposal, not a machine learning evaluation."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Figure 10 provides a gender breakdown of students who found visual representation useful versus those who did not. Table I provides individual-level responses with grades and comments."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section IV-B discusses three specific areas where the approach could be improved based on negative feedback: need for standardization, importance of considering different learning styles, and the need to introduce the strategy earlier in the course. Individual negative comments are also reported (e.g., Student 34, Student 36)."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that 16.7% of students did not find the visualization useful, and includes specific negative comments such as Student 34's critique ('If you show this to someone who has never studied programming in their life, they won't understand') and Student 36's difficulty comparing visualizations with code."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims are appropriately hedged: it promises to 'review the most relevant studies' (done in Section II), 'proposes enriching teaching and learning methodologies' (done in Sections III-IV), and presents 'preliminary context supporting the incorporation of visual simulations' (done in Section IV-B survey). No claims exceed what is presented."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper implies visual simulation 'foster[s] a deeper understanding among students' (abstract) and that VPS 'promotes meaningful learning' (Section III-A). These are causal claims, but the study design—a single-group post-hoc survey with no control group, no pre-post measurement, and no randomization—is inadequate for causal inference."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title ('Teaching Programming in the Age of Generative AI') and abstract frame claims broadly for all programming education, but the empirical evidence comes from 36 students in one OOP course at one Chilean university. The paper even generalizes to 'Java (or other languages)' without evidence from other languages."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No alternative explanations are considered for the survey results. Social desirability bias (students responding positively to their instructor's method), novelty effects, instructor quality, and possible self-selection bias among respondents are not discussed."
    132       },
    133       "proxy_outcome_distinction": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper measures student self-reported perceptions of usefulness via a yes/no question but frames results as evidence that visual simulation is effective for actual understanding. The distinction between 'students say it helped' and 'it actually improved comprehension' is not acknowledged."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No AI models are used in the paper's own experiments. The paper discusses LLMs conceptually but does not run any model-based experiments."
    144       },
    145       "prompts_provided": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No prompting is used in this study. The paper discusses LLM-based code generation conceptually but does not use prompting in its own methodology."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No hyperparameters are involved. This is a pedagogical proposal and survey study, not a computational experiment."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used in this paper."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The paper does not describe how student comments were preprocessed for the word cloud (Figure 11) or what tool/method was used for the emotion analysis (Figure 12). The transformation from raw comments to these visualizations is undocumented."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "There is no dedicated limitations or threats-to-validity section. The paper moves directly from survey results to 'Conclusions and Future Works' (Section V) without discussing limitations."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No threats to validity are discussed. Key threats such as the tiny sample size, lack of control group, potential instructor bias, voluntary response bias, and single-institution scope are not acknowledged."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show or what populations/settings are excluded from its claims."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Table I provides all 36 individual survey responses including yes/no answers, grades, and free-text comments. This allows independent verification of the reported results (and reveals inconsistencies in the reported percentages)."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper describes the three survey questions (Section IV-B) and states 36 students responded, but does not describe when the survey was administered, how (online/paper), whether it was voluntary, during or after the course, or any other details of the data collection procedure."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No recruitment methods are described. The paper says students 'who took the object-oriented programming course' but does not explain how they were recruited, whether participation was voluntary, or what proportion of the class responded."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No documentation of the data pipeline from raw survey responses to the word cloud (Figure 11) or emotion analysis (Figure 12). The tools, methods, and parameters used for these analyses are not described."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The Funding section explicitly states: 'This study was not supported by a grant.'"
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are clearly listed: Universidad del Bío-Bío and DuocUC, both in Concepción, Chile."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "The study is unfunded, so the question of funder independence is not applicable."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Under Competing Interests, the paper states: 'Authors declare that they have no competing interests.'"
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark. It is a pedagogical proposal with a student survey, so contamination concerns are not applicable."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No model evaluation is performed, so train/test overlap is not applicable."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluation is performed, so benchmark contamination is not applicable."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No pre-registration is mentioned. The study surveyed 36 students but there is no reference to any pre-registration platform."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "Under 'Ethics approval and consent to participate' the paper states 'Not applicable.' This is concerning since data was collected from students (a potentially vulnerable population), yet no ethics review was sought or mentioned."
    254       },
    255       "demographics_reported": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "Only gender is shown (Figure 10 pie chart). Key demographics for a CS education study—year of study, age, prior programming experience—are not reported. Participants are identified only as OOP students at a Chilean university."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No inclusion or exclusion criteria are stated. The paper says only that respondents were students 'who took the object-oriented programming course' with no further selection criteria or screening described."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "This is a cross-sectional survey, not an experimental study with treatment/control groups, so randomization is not applicable."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "This is a cross-sectional survey, not an experimental study, so blinding is not applicable."
    274       },
    275       "attrition_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper reports 36 respondents but does not state how many students were enrolled in the course. The response rate is unknown, making it impossible to assess selection/attrition bias."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a pedagogical proposal and survey paper, not a system with inference costs."
    286       },
    287       "compute_budget_stated": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a pedagogical proposal and survey paper with no significant compute requirements."
    291       }
    292     }
    293   },
    294   "scan_version": 3,
    295   "active_modules": [],
    296   "claims": [
    297     {
    298       "claim": "86.1% of students indicated that code visualization was useful for understanding object and reference concepts in the OOP course.",
    299       "evidence": "Section IV-B survey of 36 students, Figure 10, Table I. However, counting Table I directly yields 32/36 = 88.9% YES and 4/36 = 11.1% NO, which contradicts the reported 86.1% and 16.7% (which also do not sum to 100%).",
    300       "supported": "weak"
    301     },
    302     {
    303       "claim": "LLMs lack true understanding of the code they generate, operating on statistical correlations rather than semantic understanding.",
    304       "evidence": "Section I cites Shojaee et al. 2025 [4] ('The illusion of thinking') and Valmeekam et al. 2022 [5] ('Large language models still can't plan') to support this claim. No original evidence is presented.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "Visual program simulation (VPS) is an effective pedagogical tool for teaching programming that promotes meaningful learning.",
    309       "evidence": "Section III reviews prior literature on visualization in education (Mayer, Python Tutor, Marton et al.) and Section IV-B presents a preliminary survey. The survey has no control group, no pre-post measurement, and N=36.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Assessment in programming courses should shift from evaluating code functionality to evaluating student understanding of code execution.",
    314       "evidence": "Sections III-IV present this as a pedagogical proposal supported by literature on visualization and the argument that LLMs cannot truly understand code. No comparative evidence is provided showing this approach yields better learning outcomes.",
    315       "supported": "weak"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "qualitative",
    320     "theoretical"
    321   ],
    322   "key_findings": "This paper reviews advantages and disadvantages of LLMs in programming education and proposes visual program simulation (VPS) as a complementary assessment strategy focusing on code comprehension over code writing. A preliminary survey of 36 OOP students found that the majority reported visual representations were useful for understanding object and reference concepts, though the reported figure (86.1%) does not match the raw data in Table I (32/36 = 88.9%). The paper argues that since LLMs lack true code understanding, assessment should verify students' ability to mentally simulate program execution using box-and-arrow visual representations.",
    323   "red_flags": [
    324     {
    325       "flag": "Data inconsistency",
    326       "detail": "The paper reports 86.1% of students found visualization useful and 16.7% did not (summing to 102.8%), but counting Table I yields 32 YES and 4 NO out of 36 (88.9% and 11.1%). The reported percentages are internally inconsistent and do not match the published data."
    327     },
    328     {
    329       "flag": "Tiny uncontrolled sample",
    330       "detail": "N=36 students with no control group, no pre-post measurement, no randomization, and a single yes/no question as the primary metric. This design cannot support claims about pedagogical effectiveness."
    331     },
    332     {
    333       "flag": "Instructor bias risk",
    334       "detail": "Students were surveyed about the usefulness of their own instructor's teaching method. Social desirability bias is highly likely, especially since the only anonymity safeguard described is an acknowledgment noting that participation was anonymous."
    335     },
    336     {
    337       "flag": "Unknown response rate",
    338       "detail": "The paper does not report how many students were enrolled in the course versus how many responded (36). If the response rate is low, the results may suffer from severe self-selection bias."
    339     },
    340     {
    341       "flag": "No ethics review",
    342       "detail": "The paper explicitly states 'Not applicable' for ethics approval despite collecting data from students, a potentially vulnerable population whose grades could be influenced by the instructor-researchers."
    343     },
    344     {
    345       "flag": "Narrative literature review without systematic methodology",
    346       "detail": "Section II reviews only 5-6 studies on LLMs in programming education with no structured search protocol, inclusion/exclusion criteria, or quality assessment of sources. The review is selective and may not represent the full body of evidence."
    347     },
    348     {
    349       "flag": "Claims outrun evidence",
    350       "detail": "Broad claims about programming education ('Teaching Programming in the Age of Generative AI') are supported only by a tiny survey at one Chilean university in one Java OOP course. The paper generalizes to 'other languages' without evidence."
    351     },
    352     {
    353       "flag": "Sloppy reference",
    354       "detail": "Reference [3] lists the venue as 'Name of Journal' — an apparent placeholder that was never corrected, suggesting incomplete quality control in manuscript preparation."
    355     }
    356   ],
    357   "cited_papers": [
    358     {
    359       "title": "Programming education and learner motivation in the age of generative AI: student and educator perspectives",
    360       "authors": ["S. Boguslawski", "R. Deer", "M. G. Dawson"],
    361       "year": 2025,
    362       "relevance": "Directly studies the relationship between student motivation and LLM use in introductory programming courses."
    363     },
    364     {
    365       "title": "Generative AI in introductory programming",
    366       "authors": ["B. A. Becker", "M. Craig", "P. Denny", "H. Keuning", "N. Kiesler", "J. Leinonen", "A. Luxton-Reilly", "L. Malmi", "J. Prather", "K. Quille"],
    367       "year": 2023,
    368       "relevance": "Argues for the transformative potential of generative AI in introductory programming education with implications for CS curricula."
    369     },
    370     {
    371       "title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity",
    372       "authors": ["P. Shojaee", "I. Mirzadeh", "K. Alizadeh", "M. Horton", "S. Bengio", "M. Farajtabar"],
    373       "year": 2025,
    374       "arxiv_id": "2506.06941",
    375       "relevance": "Evaluates reasoning limitations of LLMs, directly relevant to claims about AI coding capability boundaries."
    376     },
    377     {
    378       "title": "Large language models still can't plan (a benchmark for LLMs on planning and reasoning about change)",
    379       "authors": ["K. Valmeekam", "A. Olmo", "S. Sreedharan", "S. Kambhampati"],
    380       "year": 2022,
    381       "relevance": "Benchmarks LLM planning and reasoning limitations, relevant to the argument that LLMs lack true code understanding."
    382     },
    383     {
    384       "title": "Teaching and learning computer programming using ChatGPT: A rapid review of literature amid the rise of generative AI technologies",
    385       "authors": ["M. B. Garcia"],
    386       "year": 2025,
    387       "relevance": "Reviews literature on using ChatGPT for teaching programming, directly within the survey scope of LLM impact on programming education."
    388     },
    389     {
    390       "title": "The impact of AI use in programming courses on critical thinking skills",
    391       "authors": ["C. J. S. F. Clarke", "A. Konak"],
    392       "year": 2025,
    393       "relevance": "Studies the effect of AI tools on critical thinking in programming courses, relevant to concerns about LLM impact on learning."
    394     },
    395     {
    396       "title": "Online Python Tutor: embeddable web-based program visualization for CS education",
    397       "authors": ["P. J. Guo"],
    398       "year": 2013,
    399       "relevance": "Foundational work on code visualization tools for programming education, directly relevant to the visual simulation approach discussed."
    400     }
    401   ],
    402   "engagement_factors": {
    403     "practical_relevance": {
    404       "score": 1,
    405       "justification": "The VPS pedagogical concept could interest CS educators, but no tool, software, or materials are released for practical use."
    406     },
    407     "surprise_contrarian": {
    408       "score": 0,
    409       "justification": "The proposal that visualization aids learning is well-established; nothing here challenges conventional wisdom."
    410     },
    411     "fear_safety": {
    412       "score": 0,
    413       "justification": "No AI risk or security concerns are raised; the paper discusses pedagogical methodology."
    414     },
    415     "drama_conflict": {
    416       "score": 0,
    417       "justification": "No controversy or provocative claims; the paper takes a measured pedagogical position."
    418     },
    419     "demo_ability": {
    420       "score": 0,
    421       "justification": "No code, demo, or tool is provided; the visual simulations are only hand-drawn figures in the paper."
    422     },
    423     "brand_recognition": {
    424       "score": 0,
    425       "justification": "Authors are from Universidad del Bío-Bío and DuocUC in Chile, not widely known in the AI research community."
    426     }
    427   }
    428 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs