scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (40509B)
      1 {
      2   "paper": {
      3     "title": "The Robots are Here: Navigating the Generative AI Revolution in Computing Education",
      4     "authors": [
      5       "James Prather",
      6       "Paul Denny",
      7       "Juho Leinonen",
      8       "Brett A. Becker",
      9       "Ibrahim Albluwi",
     10       "Michelle Craig",
     11       "Hieke Keuning",
     12       "Natalie Kiesler",
     13       "Tobias Kohn",
     14       "Andrew Luxton-Reilly",
     15       "Stephen MacNeil",
     16       "Andrew Petersen",
     17       "Raymond Pettit",
     18       "Brent N. Reeves",
     19       "Jaromir Savelka"
     20     ],
     21     "year": 2023,
     22     "venue": "ITiCSE 2023",
     23     "arxiv_id": "2310.00658",
     24     "doi": "10.1145/3587103.3594206"
     25   },
     26   "scan_version": 3,
     27   "active_modules": ["experimental_rigor", "data_leakage", "survey_methodology"],
     28   "methodology_tags": ["meta-analysis", "qualitative", "observational", "benchmark-eval"],
     29   "key_findings": "This large working group report synthesizes 71 papers on LLMs in computing education and finds that LLMs perform at or above average student level on introductory programming tasks, with GPT-4 dramatically outperforming Codex within two years. A survey of 228 respondents across 20 countries shows students and instructors agree GenAI use should have some restrictions, but disagree on the clarity of existing policies. Interviews with 22 educators reveal a shift toward process-over-product assessment, greater emphasis on code reading, and reduced weight on unsupervised homework. Available computing education benchmarks are heavily biased toward introductory Python problems, limiting the scope of current LLM evaluations.",
     30   "checklist": {
     31     "artifacts": {
     32       "code_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No analysis code, scripts, or tools are released. The paper releases benchmark problem descriptions and test cases on OSF (Section 8.3, footnote 5) but no code for the survey analysis, interview analysis, or benchmarking pipeline."
     36       },
     37       "data_released": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper releases the problem descriptions and test cases from the Finnie-Ansley et al. replication on OSF (https://osf.io/bu9h3/). The benchmarking also uses publicly available datasets (HumanEval, APPS, FalconCode). However, survey data and interview transcripts are not released."
     41       },
     42       "environment_specified": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No environment specifications, requirements files, or dependency information is provided for reproducing the benchmarking experiments or survey/interview analyses."
     46       },
     47       "reproduction_instructions": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No step-by-step reproduction instructions are provided. The benchmarking methodology is described in prose (Sections 8.3-8.4) but lacks concrete commands or scripts to reproduce results."
     51       }
     52     },
     53     "statistical_methodology": {
     54       "confidence_intervals_or_error_bars": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No confidence intervals or error bars are reported for any results. Survey responses are presented as percentages and Likert distributions. Benchmarking results are raw pass/fail counts. APPS results (Table 10) report averages without uncertainty bounds."
     58       },
     59       "significance_tests": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No statistical significance tests are used despite making comparative claims. Student vs. instructor survey responses are compared descriptively without formal tests. LLM performance comparisons across models (Table 9, Table 10) use only raw numbers."
     63       },
     64       "effect_sizes_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No effect sizes (Cohen's d, odds ratios, etc.) are reported. The benchmarking comparisons report raw percentages (e.g., GPT-4 at 51.5% vs GPT-3.5 at 39.2% on APPS) but without formal effect size measures."
     68       },
     69       "sample_size_justified": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No justification for sample sizes. The survey had 57 instructors and 171 students with no power analysis or sample size rationale. The interview sample of 22 educators is described but not justified. The paper acknowledges selection bias (Section 4.2.1) but does not discuss statistical adequacy."
     73       },
     74       "variance_reported": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No variance or standard deviation is reported. Benchmarking uses single-run results (temperature 0.9 with up to 10 attempts, temperature 0.0 for APPS). No spread measures across experimental runs are provided."
     78       }
     79     },
     80     "evaluation_design": {
     81       "baselines_included": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The replication study (Section 8.3) explicitly compares GPT-4, GPT-3.5-turbo, and Copilot against the original Codex results from Finnie-Ansley et al. Figure 3 directly compares original vs replication. The literature review compares quality metrics against Hellas et al. (Table 5)."
     85       },
     86       "baselines_contemporary": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The replication uses GPT-4 (state-of-the-art at the time), GPT-3.5-turbo (free ChatGPT model), and GitHub Copilot, all contemporary models. The comparison against Codex is the explicit purpose of the replication to show improvement over time."
     90       },
     91       "ablation_study": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "This is a multi-method working group report (literature review, survey, interviews, benchmarking), not a single system with components to ablate."
     95       },
     96       "multiple_metrics": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "For APPS (Table 10), both 'test case average' and 'strict accuracy' are reported. For FalconCode (Table 11), 'Full' and 'Clean' success rates are reported. The replication reports both attempt number and pass/fail. The literature review uses four quality metrics (Table 5)."
    100       },
    101       "human_evaluation": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The benchmarking evaluation is entirely automated via test cases. While the authors manually analyzed failure cases (e.g., FalconCode thematic analysis of failures, Table 12), this is error analysis rather than human evaluation of LLM outputs. Under the scan schema, automated pass/fail on test suites does not count as human evaluation, so the answer is NO."
    105       },
    106       "held_out_test_set": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The models are evaluated on established benchmark problems without any tuning. The replication uses exam problems from Finnie-Ansley et al. APPS and FalconCode are standard benchmarks. No model tuning or development decisions were made on these test sets."
    110       },
    111       "per_category_breakdown": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 9 breaks down replication results per problem. Table 10 breaks APPS results by difficulty level (introductory/interview/competition). Table 11 breaks FalconCode by assignment type (Skill/Lab/Project). Table 8 categorizes benchmark problems by course level."
    115       },
    116       "failure_cases_discussed": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section 8.3 discusses specific problems models failed on (T2-Q12 bar graph problem, Rainfall variants) with analysis of why. Section 8.4.2 contains detailed thematic analysis of GPT-4 failures on FalconCode (Table 12), identifying missing instructions, incorrect structure, and genuinely incorrect solutions."
    120       },
    121       "negative_results_reported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Multiple negative results reported: all models failed on T2-Q12; Copilot failed on 3 exam problems and 3 rainfall variants; GPT-4 only achieved 45.4% on FalconCode raw (Table 11); Section 8.4.2 documents extensive dataset quality issues. The APPS competition-level problems showed poor performance (28.7% for GPT-4)."
    125       }
    126     },
    127     "claims_and_evidence": {
    128       "abstract_claims_supported": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The abstract lists five specific contributions (literature review of 71 papers, survey across 20 countries, 22 educator interviews, ethics analysis via ACM Code of Ethics, LLM benchmarking). All five are delivered in corresponding sections (3, 4, 5, 6-7, 8). Claims are descriptive and matched by the paper's content."
    132       },
    133       "causal_claims_justified": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "The paper makes no causal claims of its own. It reports descriptive survey findings, qualitative interview themes, literature synthesis, and benchmarking comparisons. Performance comparisons between models ('GPT-4 outperforms Codex') are direct measurement comparisons, not causal claims."
    137       },
    138       "generalization_bounded": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Each component includes explicit scope limitations. Section 3.7 bounds the literature review to April-August 2023. Section 4.2.1 acknowledges selection bias in the survey sample. Section 5.8 notes geographic skew (55% US). Section 8.5.1 acknowledges the benchmarking is limited to code generation tasks and mostly introductory Python."
    142       },
    143       "alternative_explanations_discussed": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "For benchmarking, data contamination is discussed as an alternative explanation for high performance: 'There is a large chance that solutions for it have been included in training recent models' (Section 8.4.1). For the survey, selection bias toward engaged educators and high-achieving students is discussed (Section 4.2.1). For FalconCode, missing information rather than model failure explains many poor results (Section 8.4.2)."
    147       },
    148       "proxy_outcome_distinction": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 8.5.1 explicitly distinguishes between what test cases measure and what educators care about: 'testing generated code can be easily automated by running test cases, although this might not capture all aspects relevant to computing educators, such as code quality and suitability of the solution with regards to which concepts the student has learned so far.'"
    152       }
    153     },
    154     "setup_transparency": {
    155       "model_versions_specified": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The paper uses 'GPT-4', 'GPT-3.5-turbo', and 'GitHub Copilot' without specifying exact versions or snapshot dates (e.g., no 'gpt-4-0613'). Section 8.3 notes 'GPT-4' and 'GPT-3.5-turbo' without version identifiers. The original study used 'code-davinci-001' which is more specific."
    159       },
    160       "prompts_provided": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 8.4.1 provides the full system prompt used for APPS and FalconCode experiments, including the example input/output format. Section 8.3 describes the prompt format: 'problem description surrounded by triple-quotes as was done in the original study.' The prompt text is fully reproducible."
    164       },
    165       "hyperparameters_reported": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 8.3 states 'temperature value of 0.9 similar to the original study.' Section 8.4.1 reports 'temperature=0.0, max_tokens=4000, and default values for Top P (1), Frequency penalty (0), and Presence penalty (0).' Key generation parameters are fully specified."
    169       },
    170       "scaffolding_described": {
    171         "applies": false,
    172         "answer": false,
    173         "justification": "No agentic scaffolding is used. The benchmarking sends direct prompts to models and collects completions. Copilot is used via IDE plugin as a black box."
    174       },
    175       "data_preprocessing_documented": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The literature review pipeline is documented with counts at each stage (Figure 1): 19 seed papers → 513 snowballed → inclusion criteria applied → 46 included → 8 excluded after deep review → 38 → second snowballing → 71 final papers. For FalconCode, de-duplication steps are documented (661 → 422 by ID → 385 by prompt). Filtering criteria are stated at each stage (Section 3.1)."
    179       }
    180     },
    181     "limitations_and_scope": {
    182       "limitations_section_present": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Multiple dedicated limitations subsections are present: Section 3.7 (literature review limitations and threats to validity), Section 5.8 (interview limitations and threats to validity), and Section 8.5.1 (benchmarking limitations). Each contains substantive discussion."
    186       },
    187       "threats_to_validity_specific": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Specific threats are discussed: literature review was bounded to April-August 2023 and used only one step of snowballing (Section 3.7); interview sample was geographically skewed (55% US) with only 14% women (Section 5.8); Codex was deprecated during the study (Table 4); FalconCode dataset had missing starter code and data files causing artificial failures (Section 8.4.2)."
    191       },
    192       "scope_boundaries_stated": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "While limitations are discussed, the paper does not explicitly state what claims it is NOT making. The title ('Navigating the Generative AI Revolution in Computing Education') is broad, and the paper does not provide a structured statement of what the results do not show. Limitations are framed as caveats rather than explicit scope exclusions."
    196       }
    197     },
    198     "data_integrity": {
    199       "raw_data_available": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "Survey responses are not released. Interview audio and video recordings were deleted per ethics protocol (Section 5.1: 'we deleted all audio and video recordings'), and the interview transcripts are not released. Only the benchmarking problem descriptions and test cases are released on OSF. The primary datasets (survey n=228, interview transcripts) are not available for independent verification."
    203       },
    204       "data_collection_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Survey design and distribution described in Section 4.2 (SIGCSE mailing lists, snowball sampling). Interview methodology described in Section 5.1 (semi-structured, Zoom, transcription process). Literature review search described in Section 3.1 (databases, keywords, dates). Benchmarking data collection described in Sections 8.3-8.4."
    208       },
    209       "recruitment_methods_described": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Survey recruitment via SIGCSE mailing lists for instructors (n=57) and snowball sampling for students (n=171) is described in Section 4.2.1, including an attempt to mitigate response bias by including language targeting struggling students. Interview recruitment via purposeful sampling through author networks, SIGCSE mailing list, and survey volunteers is described in Section 5.1."
    213       },
    214       "data_pipeline_documented": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The literature review pipeline is fully documented in Figure 1 with counts at each stage and criteria for inclusion/exclusion. Survey analysis process (thematic analysis) is described in Section 4.2.3. Interview analysis pipeline (verbatim transcription → thematic analysis → theme review) is described in Section 5.1. FalconCode pipeline (extraction → de-duplication → generation → testing → analysis) is documented in Section 8.4.2."
    218       }
    219     },
    220     "conflicts_of_interest": {
    221       "funding_disclosed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding sources are disclosed. The acknowledgments section thanks interviewees but does not mention any grants, corporate sponsors, or funding agencies."
    225       },
    226       "affiliations_disclosed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "All 15 author affiliations are clearly listed on the first page, spanning universities in the USA, New Zealand, Ireland, Jordan, Canada, the Netherlands, Germany, and Austria."
    230       },
    231       "funder_independent_of_outcome": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No funder is disclosed, so independence cannot be assessed. The work appears to be unfunded academic research."
    235       },
    236       "financial_interests_declared": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No competing interests or financial interests statement is present in the paper."
    240       }
    241     },
    242     "contamination": {
    243       "training_cutoff_stated": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No training data cutoff dates are stated for GPT-4, GPT-3.5-turbo, or GitHub Copilot. The paper evaluates these models on benchmarks without specifying when their training data was collected."
    247       },
    248       "train_test_overlap_discussed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "For APPS, Section 8.4.1 discusses overlap: 'A major downside of the APPS dataset is that it is a public dataset with problems from popular online coding websites. There is a large chance that solutions for it have been included in training recent models.' The replication explicitly releases new data to facilitate future uncontaminated testing."
    252       },
    253       "benchmark_contamination_addressed": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "The APPS contamination risk is explicitly acknowledged (Section 8.4.1). For the replication, the authors note the dataset was not previously public and release it specifically to support future replication (Section 8.3). However, no formal contamination analysis (canary strings, membership inference) is performed."
    257       }
    258     },
    259     "human_studies": {
    260       "pre_registered": {
    261         "applies": true,
    262         "answer": false,
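    263         "justification": "No mention of pre-registration for the survey or interview study. The paper's OSF link (Section 8.3, footnote 5) hosts benchmark problem descriptions and test cases, not a pre-registration; no OSF Registries, AsPredicted, or other pre-registration record is referenced."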
    264       },
    265       "irb_or_ethics_approval": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "Section 5.1 states the interview protocol was 'submitted to the University of Toronto Research Ethics Board, who approved it prior to the study.' Audio and video recordings were deleted in accordance with this protocol."
    269       },
    270       "demographics_reported": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Section 4.3.1 reports detailed demographics: instructors from 12 countries, avg 18.2 years experience, 77.2% men, class sizes; students from 17 countries, year of study (48.5% first year), avg 4.6 courses, majors (42.7% CS). Table 6 shows interviewee countries."
    274       },
    275       "inclusion_exclusion_criteria": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "For interviews, Section 5.1 states: 'The most important criterion for inclusion was that educators would have concrete plans or views toward changing their current course structure, assessment, or classroom practices in light of LLMs.' Survey recruitment targeted computing education mailing lists (Section 4.2.1)."
    279       },
    280       "randomization_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Not an experimental study. The survey and interviews are observational/qualitative studies without treatment conditions requiring randomization."
    284       },
    285       "blinding_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Not applicable for cross-sectional surveys and qualitative interviews. No treatment conditions exist that would require blinding."
    289       },
    290       "attrition_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No attrition or dropout information is reported. The paper states final counts (57 instructors, 171 students, 22 interviewees) but does not report how many were initially contacted, started the survey, or declined interviews."
    294       }
    295     },
    296     "cost_and_practicality": {
    297       "inference_cost_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No API costs, token counts, or latency figures are reported for the benchmarking experiments, despite benchmarking GPT-4, GPT-3.5-turbo, and Copilot: the replication allows up to 10 attempts per problem (Section 8.3), and the APPS and FalconCode experiments cover hundreds of problems."
    301       },
    302       "compute_budget_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No total computational budget is stated. No information on API spend, GPU hours, or total wall-clock time for the benchmarking experiments."
    306       }
    307     },
    308     "experimental_rigor": {
    309       "seed_sensitivity_reported": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Single-run results are reported. For GPT-4/GPT-3.5, temperature 0.9 is used with up to 10 attempts, but no sensitivity analysis across different seeds or runs is performed. For APPS, temperature 0.0 is used with a single run per problem; this reduces sampling variability, but no check that outputs are stable across repeated runs is reported."
    313       },
    314       "number_of_runs_stated": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Section 8.3 states 'We generate up to ten solutions for each problem, stopping if the LLM creates a solution that passes the tests.' Section 8.4.1 implicitly states one run per problem at temperature=0.0. The attempt protocol is clearly specified."
    318       },
    319       "hyperparameter_search_budget": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No hyperparameter search was conducted. The replication uses temperature 0.9 from the original study, and APPS uses temperature 0.0, but no exploration of alternative settings is reported or justified beyond following the original study's choices."
    323       },
    324       "best_config_selection_justified": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "For the replication (Section 8.3), using temperature 0.9 is justified as matching 'the original study.' For APPS (Section 8.4.1), temperature=0.0 and max_tokens=4000 are stated with defaults. While the APPS temperature choice isn't explicitly justified, the replication's rationale is clear."
    328       },
    329       "multiple_comparison_correction": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "No statistical tests are performed, so no multiple comparison correction is needed."
    333       },
    334       "self_comparison_bias_addressed": {
    335         "applies": false,
    336         "answer": false,
    337         "justification": "The authors are not evaluating their own system. They evaluate third-party models (GPT-4, GPT-3.5, Copilot) on existing benchmarks. Self-comparison bias does not apply."
    338       },
    339       "compute_budget_vs_performance": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "GPT-4, GPT-3.5, and Copilot have vastly different computational costs, but performance is not reported as a function of compute. The models are compared on accuracy alone without discussing cost-performance tradeoffs."
    343       },
    344       "benchmark_construct_validity": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Section 8.2 provides extensive analysis of what the benchmarks actually measure, finding most problems cover only introductory Python (Table 8: 98.8% of HumanEval, 100% of FalconCode are introductory). Section 8.5 discusses that available datasets 'may not reflect the kinds of problems student programmers solve in upper year courses.'"
    348       },
    349       "scaffold_confound_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "GPT-4 and GPT-3.5 are tested via API with identical prompting, but Copilot is tested via a different interface (VS Code plugin with IDE completion). The paper describes the different interfaces (Section 8.3) but does not explicitly discuss how this scaffold difference could confound the model comparison."
    353       }
    354     },
    355     "data_leakage": {
    356       "temporal_leakage_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "Section 8.4.1 explicitly addresses temporal leakage for APPS: 'A major downside of the APPS dataset is that it is a public dataset with problems from popular online coding websites. There is a large chance that solutions for it have been included in training recent models.'"
    360       },
    361       "feature_leakage_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether the evaluation setup leaks answer information through context (e.g., whether problem descriptions contain hints that wouldn't be available in authentic educational settings)."
    365       },
    366       "non_independence_addressed": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No discussion of whether benchmark problems share structural similarities, come from the same sources, or have near-duplicate variants that could inflate performance estimates."
    370       },
    371       "leakage_detection_method": {
    372         "applies": true,
    373         "answer": false,
    374         "justification": "No concrete leakage detection method is used. The contamination concern for APPS is discussed conceptually but no canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are applied."
    375       }
    376     },
    377     "survey_methodology": {
    378       "prisma_or_structured_protocol": {
    379         "applies": true,
    380         "answer": true,
    381         "justification": "The literature review follows a structured scoping review protocol with defined search queries across three databases (ACM, Taylor & Francis, IEEE), explicit date filters (January 2021), inclusion/exclusion criteria (5 criteria listed in Section 3.1.2), two rounds of snowballing, dual evaluation with third-evaluator resolution, and a documented pipeline (Figure 1)."
    382       },
    383       "quality_assessment_of_sources": {
    384         "applies": true,
    385         "answer": true,
    386         "justification": "Table 5 assesses quality metrics of included papers using criteria adapted from Hellas et al.: clearly defined research question (Yes:44, No:18), research process described (Yes:55), results detail (Yes:57), threats to validity (Yes in separate section:38, Yes but not separate:15, No:18)."
    387       },
    388       "publication_bias_discussed": {
    389         "applies": true,
    390         "answer": false,
    391         "justification": "No discussion of publication bias. While the paper acknowledges including arXiv and grey literature 'driven by pragmatism' (Section 3.7), it does not discuss whether published papers in the review skew toward positive results about LLMs, nor does it use funnel plots or other bias detection methods."
    392       }
    393     }
    394   },
    395   "claims": [
    396     {
    397       "claim": "LLMs perform at a level equivalent to or better than average students on introductory code generation tasks.",
    398       "evidence": "Section 3.3.1 synthesizes findings from 35+ papers evaluating LLM performance, and the replication (Section 8.3, Figure 3) shows GPT-4 solving nearly all CS1 exam problems, performing as 'one of the top students in the class.'",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "GPT-4 dramatically outperforms Codex on CS1 exam problems within two years.",
    403       "evidence": "Section 8.3 and Table 9: GPT-4 solved all problems except T2-Q12, mostly on the first attempt. The original Codex study showed performance 'similar to students in the top quartile' while GPT-4 'would have been one of the top students in the class.' Figure 3 visually compares original vs replication.",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "Students and instructors are generally aligned in their belief that some restrictions should be placed on GenAI tool use in coursework.",
    408       "evidence": "Section 4.3.2 and Figure 2: Both groups show similar patterns on Likert responses about restrictions. However, they diverge on policy clarity — instructors find policies less clear than students.",
    409       "supported": "moderate"
    410     },
    411     {
    412       "claim": "Around half of instructors (50.8%) believe many or almost all of their students are using GenAI tools in ways the instructors would not approve of.",
    413       "evidence": "Section 4.3.4 reports this figure directly from the instructor survey (n=57). The question asked 'To what extent do you think students at your school are using GenAI tools in ways that you would not approve of?'",
    414       "supported": "moderate"
    415     },
    416     {
    417       "claim": "Available computing education benchmarks are heavily biased toward introductory Python problems.",
    418       "evidence": "Section 8.2, Table 8: 98.8% of HumanEval, 100% of FalconCode, and 83.3% of the CS2 dataset problems are classified as introductory level. Table 7 shows most datasets focus on Python. Manual tagging by experienced instructors confirmed the homogeneity.",
    419       "supported": "strong"
    420     },
    421     {
    422       "claim": "The most common concern about LLMs in computing education is that students will become over-reliant on them.",
    423       "evidence": "Section 3.6.1: 'The most common concern expressed by authors about student learning was the potential for students to become over-reliant on generative AI tools to solve problems [34, 82, 115, 118, 158] and assist in debugging code [116, 140, 187].' Corroborated by survey (Section 4.3) and interviews (Section 5).",
    424       "supported": "strong"
    425     },
    426     {
    427       "claim": "AI text detection tools are unreliable and produce many false positives.",
    428       "evidence": "Section 3.3.1 cites Orenstrakh et al. [134]: 'tools that assess whether a given text was generated by an LLM show a large number of false positives and should not be trusted blindly.' Noted to be 'even worse when evaluating code.'",
    429       "supported": "moderate"
    430     },
    431     {
    432       "claim": "Educators are shifting toward process-over-product assessment and increasing the weight of invigilated exams.",
    433       "evidence": "Section 5.4: Multiple interviewees describe reducing homework weight (one changed from 50% to 0%), adding coding interviews, oral exams (Jean Mehta), and reflection assignments. Section 5.4.3 describes the shift to evaluating learning processes. Based on 22 interviews.",
    434       "supported": "moderate"
    435     }
    436   ],
    437   "red_flags": [
    438     {
    439       "flag": "Selection bias in survey sample",
    440       "detail": "Instructors were recruited via SIGCSE mailing lists, likely overrepresenting educators highly engaged with computing education research. Students were recruited via snowball sampling through those instructors. Section 4.2.1 acknowledges this: 'the resulting sample likely results in a selection bias of instructors who are particularly invested in computing education.' The sample of 57 instructors and 171 students across 20 countries may not represent the global computing education community."
    441     },
    442     {
    443       "flag": "No statistical tests for comparative claims",
    444       "detail": "Student vs. instructor survey responses are compared descriptively (Section 4.3.2) with language like 'some important differences emerged' but without statistical tests. The paper cannot distinguish real differences from sampling noise. Similarly, benchmarking comparisons between models lack significance tests."
    445     },
    446     {
    447       "flag": "Interview sample geographic skew",
    448       "detail": "22 educators, 55% from USA, only 1 each from Asia, Oceania, and South America, none from Africa (Section 5.8, Table 6). Only 14% women. Findings about how 'educators are changing' are heavily weighted toward US perspectives."
    449     },
    450     {
    451       "flag": "Benchmark contamination unaddressed for replication",
    452       "detail": "While APPS contamination is discussed, the CS1 exam problems used in the Finnie-Ansley replication may have entered GPT-4's training data through academic dissemination, citations, or the original paper's availability. The high GPT-4 performance could be partly explained by memorization rather than genuine problem-solving capability."
    453     },
    454     {
    455       "flag": "Literature review explicitly not systematic",
    456       "detail": "Section 3.1 states 'We explicitly considered but decided not to perform a systematic review.' While a scoping review is a valid choice, the one-step snowballing from only 10 seed papers, combined with the acknowledged rapid pace of publication, means important work may be systematically missed."
    457     }
    458   ],
    459   "cited_papers": [
    460     {
    461       "title": "The Robots Are Coming: Exploring the Implications of OpenAI Codex on Introductory Programming",
    462       "authors": ["James Finnie-Ansley", "Paul Denny", "Brett A. Becker", "Andrew Luxton-Reilly", "James Prather"],
    463       "year": 2022,
    464       "doi": "10.1145/3511861.3511863",
    465       "relevance": "Seminal paper evaluating Codex on CS1 programming exercises, replicated in this working group report with newer models."
    466     },
    467     {
    468       "title": "Automatic Generation of Programming Exercises and Code Explanations Using Large Language Models",
    469       "authors": ["Sami Sarsa", "Paul Denny", "Arto Hellas", "Juho Leinonen"],
    470       "year": 2022,
    471       "doi": "10.1145/3501385.3543957",
    472       "relevance": "Early evaluation of LLMs for generating educational content (programming exercises and code explanations)."
    473     },
    474     {
    475       "title": "Studying the Effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    476       "authors": ["Majeed Kazemitabaar", "Justin Chow", "Carl Ka To Ma", "Barbara J. Ericson", "David Weintrop", "Tovi Grossman"],
    477       "year": 2023,
    478       "doi": "10.1145/3544548.3580919",
    479       "relevance": "Human study finding AI code generators improve novice productivity (1.15x progress, 0.57x time) without negative learning effects."
    480     },
    481     {
    482       "title": "Repairing Bugs in Python Assignments Using Large Language Models",
    483       "authors": ["Jialu Zhang", "José Cambronero", "Sumit Gulwani", "Vu Le", "Ruzica Piskac", "Gustavo Soares", "Gust Verbruggen"],
    484       "year": 2022,
    485       "arxiv_id": "2209.14876",
    486       "relevance": "Demonstrated 96.5% repair rate using Codex with few-shot examples and iterative prompting for student bug repair."
    487     },
    488     {
    489       "title": "Using Large Language Models to Enhance Programming Error Messages",
    490       "authors": ["Juho Leinonen", "Arto Hellas", "Sami Sarsa", "Brent Reeves", "Paul Denny", "James Prather", "Brett A. Becker"],
    491       "year": 2023,
    492       "doi": "10.1145/3545945.3569770",
    493       "relevance": "Evaluated Codex for enhancing programming error messages, finding success in 54% of cases."
    494     },
    495     {
    496       "title": "Lost at C: A User Study on the Security Implications of Large Language Model Code Assistants",
    497       "authors": ["Gustavo Sandoval", "Hammond Pearce", "Teo Nys", "Ramesh Karri", "Siddharth Garg", "Brendan Dolan-Gavitt"],
    498       "year": 2023,
    499       "relevance": "User study on security implications of LLM code assistants, finding no significant new security risks introduced."
    500     },
    501     {
    502       "title": "Detecting LLM-Generated Text in Computing Education: A Comparative Study for ChatGPT Cases",
    503       "authors": ["Michael Sheinman Orenstrakh", "Oscar Karnalim", "Carlos Anibal Suarez", "Michael Liut"],
    504       "year": 2023,
    505       "arxiv_id": "2307.07411",
    506       "relevance": "Evaluated AI text detection tools in computing education, finding high false positive rates especially for code."
    507     },
    508     {
    509       "title": "Generative AI for Programming Education: Benchmarking ChatGPT, GPT-4, and Human Tutors",
    510       "authors": ["Tung Phung", "Victor-Alexandru Pădurean", "José Cambronero", "Sumit Gulwani", "Tobias Kohn", "Rupak Majumdar", "Adish Singla", "Gustavo Soares"],
    511       "year": 2023,
    512       "relevance": "Benchmarked GPT-4 against human tutors for programming education, finding performance approaching human tutors."
    513     },
    514     {
    515       "title": "Practical and Ethical Challenges of Large Language Models in Education: A Systematic Literature Review",
    516       "authors": ["Lixiang Yan", "Lele Sha", "Linxuan Zhao", "Yuheng Li", "Roberto Martinez-Maldonado", "Guanliang Chen", "Xinyu Li", "Yueqiao Jin", "Dragan Gašević"],
    517       "year": 2023,
    518       "arxiv_id": "2303.13379",
    519       "relevance": "Systematic review of LLM challenges in education covering assessment, grading, and ethical concerns."
    520     },
    521     {
    522       "title": "Computing Education in the Era of Generative AI",
    523       "authors": ["Paul Denny", "James Prather", "Brett A. Becker", "James Finnie-Ansley", "Arto Hellas", "Juho Leinonen", "Andrew Luxton-Reilly", "Brent N. Reeves", "Eddie Antonio Santos", "Sami Sarsa"],
    524       "year": 2023,
    525       "arxiv_id": "2306.02608",
    526       "relevance": "Companion survey paper on how computing education should adapt to generative AI."
    527     },
    528     {
    529       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    530       "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"],
    531       "year": 2023,
    532       "relevance": "Identified programmer interaction patterns with code generators (exploration and acceleration modes)."
    533     },
    534     {
    535       "title": "Evaluating Large Language Models Trained on Code",
    536       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    537       "year": 2021,
    538       "arxiv_id": "2107.03374",
    539       "relevance": "Introduced Codex and the HumanEval benchmark, foundational for LLM code generation evaluation."
    540     },
    541     {
    542       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    543       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    544       "year": 2023,
    545       "relevance": "Demonstrated that existing benchmarks have insufficient test cases, allowing incorrect LLM solutions to pass — introduced EvalPlus."
    546     },
    547     {
    548       "title": "Programming Is Hard - Or at Least It Used to Be: Educational Opportunities and Challenges of AI Code Generation",
    549       "authors": ["Brett A. Becker", "James Prather", "Paul Denny", "Andrew Luxton-Reilly", "James Finnie-Ansley", "Eddie Antonio Santos"],
    550       "year": 2023,
    551       "relevance": "Position paper on educational opportunities and challenges of AI code generation in computing education."
    552     },
    553     {
    554       "title": "Measuring Coding Challenge Competence With APPS",
    555       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    556       "year": 2021,
    557       "relevance": "Introduced the APPS benchmark dataset (10,000 programming problems) used in this paper's benchmarking analysis."
    558     }
    559   ],
    560   "engagement_factors": {
    561     "practical_relevance": {
    562       "score": 3,
    563       "justification": "Immediately useful for computing educators with concrete advice sections for educators, students, and policy makers, plus a sample student handout (Appendix D)."
    564     },
    565     "surprise_contrarian": {
    566       "score": 1,
    567       "justification": "Generally confirms expected trends — LLMs are improving rapidly, educators are concerned about cheating, students are using the tools — rather than challenging conventional wisdom."
    568     },
    569     "fear_safety": {
    570       "score": 1,
    571       "justification": "Discusses academic integrity concerns and student over-reliance on AI tools, but does not raise novel AI risk or security concerns."
    572     },
    573     "drama_conflict": {
    574       "score": 1,
    575       "justification": "Some tension between 'ban' and 'embrace' perspectives from educators, but presented as balanced analysis rather than controversy."
    576     },
    577     "demo_ability": {
    578       "score": 0,
    579       "justification": "No code, tool, or demo to try. The released OSF data is benchmark problems and test cases, not an interactive artifact."
    580     },
    581     "brand_recognition": {
    582       "score": 2,
    583       "justification": "Evaluates GPT-4, ChatGPT, and GitHub Copilot (high brand recognition). Published at ITiCSE, a major computing education venue. Authors are well-known in computing education research."
    584     }
    585   }
    586 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs