scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25908B)
      1 {
      2   "paper": {
      3     "title": "Beyond the Hype: A Comprehensive Review of Current Trends in Generative AI Research, Teaching Practices, and Tools",
      4     "authors": [
      5       "James Prather",
      6       "Juho Leinonen",
      7       "Natalie Kiesler",
      8       "Jamie Gorson Benario",
      9       "Sam Lau",
     10       "Stephen MacNeil",
     11       "Narges Norouzi",
     12       "Simone Opel",
     13       "Vee Pettit",
     14       "Leo Porter",
     15       "Brent N. Reeves",
     16       "Jaromir Savelka",
     17       "David H. Smith IV",
     18       "Sven Strickroth",
     19       "Daniel Zingaro"
     20     ],
     21     "year": 2024,
     22     "venue": "ITiCSE-WGR 2024",
     23     "arxiv_id": "2412.14732",
     24     "doi": "10.1145/XXXXXXX"
     25   },
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No source code or analysis scripts are mentioned as released. The paper does not provide a repository URL for any code used in data analysis."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The systematic literature review data is published on OSF: https://osf.io/wpxjb/?view_only=cebf366db7f5423792b39de754972400 (stated in the Data Availability section)."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment or dependency specifications are provided. The paper does not describe any computational environment setup for its analyses."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided. While the SLR methodology is described, there are no scripts or instructions for reproducing the quantitative analyses."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper reports only descriptive statistics (percentages, medians, IQRs for some distributions). No confidence intervals or error bars are reported for the main survey results or comparisons."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper makes several comparative claims (e.g., educators underestimate developer GenAI usage, tools with guardrails produce more positive results) but uses no statistical significance tests to support these comparisons."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No effect sizes are reported. Comparisons are made using raw percentages (e.g., 55% vs 73%) without any formal effect size measures."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No power analysis or sample size justification is provided. The educator survey has N=76 complete responses out of 209 received, and the developer survey has N=39 (29 complete). No discussion of whether these samples are sufficient for the claims made."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Some measures of spread are reported: IQRs for participant population sizes (e.g., 'mixed-methods median=52.0 IQR=(24.0-105.0)'), given alongside medians for study characteristics in the SLR."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper compares its findings against prior surveys (e.g., Prather et al. 2023 working group survey with 57 respondents, Stack Overflow 2024 developer survey with 46,000 developers) to contextualize results."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Comparisons include the Stack Overflow 2024 developer survey (conducted May 2024) and the 2023 ITiCSE working group survey, both of which are recent and relevant."
     86       },
     87       "ablation_study": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Not applicable — this is a survey/SLR paper, not a system with components to ablate."
     91       },
     92       "multiple_metrics": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Not applicable — this is a survey/SLR paper, not a system being evaluated with metrics."
     96       },
     97       "human_evaluation": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Not applicable — the paper does not evaluate a system's outputs. It is a survey and literature review."
    101       },
    102       "held_out_test_set": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "Not applicable — no test set is involved in this survey/SLR study."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper provides extensive breakdowns: by tool type vs. outcomes (Table 7), by task type vs. outcomes (Table 8), by tool type and guidance vs. outcomes, and by evidence type vs. findings nature (Table 6)."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper discusses negative findings from the SLR (Table 6 shows 5 negative and 15 mixed results out of 71 studies), negative perceived outcomes from interviews (academic dishonesty, student unpreparedness), and scenarios where GenAI tools caused harm."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Negative results are reported throughout: 58% of code writing studies reported positive results (meaning 42% did not), hint generation had only 50% positive results, and multiple interview quotes describe negative outcomes including increased cheating and student unpreparedness."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims are descriptive and appropriately hedged: it describes summarizing literature, conducting surveys, and interviews. The results sections provide the data supporting these descriptive claims."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper makes several implicit causal claims, such as 'when students were not provided guidance and use a general purpose generative AI model, only 55% of studies found positive results' (Section 3.4), implying that guidance or guardrails cause better outcomes. These are cross-tabulations, not causal designs, and the paper does not acknowledge this limitation in interpreting the SLR findings."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title claims to be a 'comprehensive review of current trends' broadly, but the SLR is limited to computing education (not industry or general AI usage). The educator survey has N=76 (predominantly from the US, Germany, Canada) and the developer survey has N=39, yet the paper's conclusions ('Key takeaways') make broad claims like '80% of developers use GenAI tools in their professional roles' without bounding this to the sample."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The threats-to-validity section discusses several alternative explanations: publication bias inflating positive results in the SLR (Section 7.1), self-selection bias in both surveys (Section 7.2), self-reporting bias, and non-representative developer sample. The discussion also notes that educators who haven't explored GenAI may have opinions 'based less on facts than opinions.'"
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "Not applicable — the paper does not use LLMs as part of its methodology. It studies how others use GenAI tools."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "Not applicable — no prompting is used in the paper's own methodology."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "Not applicable — no model inference is performed as part of this study."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "Not applicable — no agentic scaffolding is used in this study's methodology."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The SLR filtering pipeline is well-documented with counts at each stage (1536 → 169 → 71, Figure 1) and explicit exclusion criteria (7 criteria listed in Section 2.2). Survey data filtering is also described: educator survey had 209 responses, 100 removed for incomplete/no consent, 33 test data removed, yielding N=76. Developer survey: 94 → 39 after filtering."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 7 'Threats to Validity' contains three substantive subsections: 7.1 Systematic Literature Review, 7.2 Educator and Developer Surveys, and 7.3 Interview Study."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The threats are specific to this study: SLR limitations include specific database choices, arXiv search being limited to title/abstract, and publication bias. Survey threats include specific mailing list recruitment potentially over-sampling engaged educators, consent form dropout, and the low N=39 for the developer survey. Interview threats include English-only recruitment bias."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While threats to validity are discussed, the paper does not explicitly state what it does NOT claim or what settings are excluded from its conclusions. The 'Key takeaways' box makes broad claims (e.g., '80% of developers use GenAI tools') without explicitly bounding them to the study's sample. The paper does not state e.g., 'these findings apply only to computing education contexts in Western universities' or similar scope restrictions."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The SLR data is published on OSF (https://osf.io/wpxjb/). However, the survey and interview data are not mentioned as being publicly available, so data availability is only partial. The answer is YES because the SLR data, one of the major data sources, is available."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Data collection is described in detail for all three methods: SLR search strings, databases, and dates (Section 2); survey distribution channels and dates (Sections 4.5, 4.6); and interview recruitment and process (Sections 4.7, 4.8)."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Recruitment channels are explicitly listed for both surveys (Section 4.5: specific mailing lists including ACM SIG, German CS society, ITiCSE participants, LinkedIn, departmental lists) and interviews (Section 4.8: selected based on literature review results, networking at ITiCSE 2024, email to mailing lists, with diversity criteria described)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented: SLR from search to final set (Figure 1, 1536→169→71 with criteria at each stage); survey from raw responses to final N (209→76 for educators, 94→39 for developers); and interview coding process (initial joint review, independent tagging, thematic analysis described in Section 4.9)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Funding is disclosed in the Acknowledgments section: Research Council of Finland (Academy Research Fellow grant 356114), U.S. National Science Foundation IUSE Award #2417374, and Google Award for Inclusion Research Program."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are prominently listed at the top of the paper. One author (Jamie Gorson Benario) is affiliated with Google, and Google is also listed as a funder via the 'Google Award for Inclusion Research Program.' All other authors are at academic institutions."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The Research Council of Finland, U.S. NSF, and Google Award for Inclusion Research are all independent of the survey outcomes. The paper does not evaluate Google's products specifically, and the funders have no financial stake in particular results about GenAI in education."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper. One author works at Google, which produces GenAI products discussed in the paper, but no competing interests declaration addresses this."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not applicable — the paper does not evaluate any pre-trained model's capability on a benchmark."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Not applicable — no model benchmarking is performed in this study."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Not applicable — no benchmark evaluation is conducted."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No pre-registration is mentioned for either the surveys or the interview study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "IRB approval is implicitly referenced: Section 4.8.2 states 'To comply with Institutional Review Board (IRB) protocol requirements, these recordings were deleted once the transcriptions were completed.' Consent forms were used for both surveys and interviews."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": true,
    264         "justification": "Demographics are reported for all three samples: educator survey (country, gender, institution type, teaching experience in Tables 10-13), developer survey (country, job title, company type in Table 14), and interview sample (categories: tool creators, educators studying GenAI, educators using GenAI, with selection criteria for diversity in Section 5.1.3)."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "For the SLR, seven explicit exclusion criteria are listed in Section 2.2. For the surveys, filtering criteria are described (consent form agreement, completeness). For interviews, three categories of interest are defined with descriptions of who qualifies for each category (Section 4.7)."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Not applicable — this is not an experimental study with random assignment. The surveys are cross-sectional and the interviews are purposive sampling."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Not applicable — this is a survey and interview study, not an experimental study where blinding would be feasible."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Attrition is reported for both surveys: educator survey received 209 responses, 100 dropped (no consent or incomplete), 33 test data removed, yielding N=76. Developer survey received 94 responses, yielding N=39 after filtering, with 29 fully complete. The consent form is identified as a potential cause of dropout (Section 7.2)."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "Not applicable — this is a survey/SLR paper, not a system with inference costs."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "Not applicable — this is a survey/SLR paper with no computational experiments."
    297       }
    298     }
    299   },
    300   "claims": [
    301     {
    302       "claim": "80% of surveyed developers use GenAI tools in their professional software development role.",
    303       "evidence": "DS-1 results: 31 out of 39 developers (79.5%) reported using GenAI tools (Section 5.7).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Educators underestimate how frequently professional developers use GenAI tools.",
    308       "evidence": "56.5% of developers report daily GenAI use, while only 30.7% of educators expected daily use (Figure 7, Section 5.7). However, no statistical test is performed on this comparison.",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "Studies using custom tools with pedagogical guardrails report more positive results than those using general-purpose GenAI without guidance.",
    313       "evidence": "Table 7 shows 73% (19/26) positive results for guardrailed tools without guidance vs. 55% (12/22) for general purpose tools without guidance (Section 3.4).",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "75% of educators believe the skills to create software have changed after the advent of GenAI.",
    318       "evidence": "ES-3: 57 out of 76 educators (75%) responded that skills have changed (Section 5.8.1).",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "Code comprehension tasks show more positive results with GenAI (80%) compared to code writing tasks (58%).",
    323       "evidence": "Table 8 shows 8/10 (80%) positive for code comprehension vs. 15/26 (58%) positive for writing code (Section 3.4). No statistical test of this difference is reported.",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "Only 35.5% of educators are actively incorporating GenAI into their courses despite 75% believing skills have changed.",
    328       "evidence": "ES-2: 27 out of 76 (35.5%) are incorporating GenAI (Section 5.2.1), while ES-3: 57 out of 76 (75%) believe skills have changed (Section 5.8.1).",
    329       "supported": "strong"
    330     }
    331   ],
    332   "methodology_tags": [
    333     "meta-analysis",
    334     "qualitative",
    335     "observational"
    336   ],
    337   "key_findings": "This working group report combines a systematic literature review of 71 papers with educator (N=76) and developer (N=39) surveys and 17 semi-structured interviews to understand GenAI integration in computing education. The SLR finds that custom tools with pedagogical guardrails produce more positive educational outcomes than unguided general-purpose tools. While 75% of educators believe programming skills are changing due to GenAI (shifting from code writing to code reading, higher-level thinking, and prompt engineering), only 35.5% have actively incorporated GenAI into their courses. Developers' GenAI usage (80% use it professionally) is more frequent than educators expect, indicating a gap between educator perceptions and actual industry practice.",
    338   "red_flags": [
    339     {
    340       "flag": "Very small developer sample",
    341       "detail": "The developer survey has only N=39 respondents (29 complete), yet the paper's key takeaways present findings as '80% of developers use GenAI tools' without prominent caveats about sample size. Many sub-questions have even smaller N (e.g., N=21 for usefulness ratings, N=18 for country data)."
    342     },
    343     {
    344       "flag": "High survey attrition",
    345       "detail": "The educator survey received 209 responses but only 76 were usable (133 excluded, about 64%). 100 responses were dropped due to incomplete data or no consent, and a further 33 were removed as test data. This suggests significant self-selection bias among completers."
    346     },
    347     {
    348       "flag": "No statistical tests for comparative claims",
    349       "detail": "The paper makes multiple comparative claims (guardrailed vs. general tools, code comprehension vs. code writing, educator expectations vs. developer reality) based solely on percentage comparisons with no significance tests, confidence intervals, or effect sizes."
    350     },
    351     {
    352       "flag": "Convenience sampling for both surveys",
    353       "detail": "Both surveys were distributed through author networks, mailing lists, and conference contacts. The developer survey was further limited because, as the authors acknowledge, they are 'computing education researchers who do not have access to developers within large tech companies worldwide.' This introduces strong selection bias."
    354     },
    355     {
    356       "flag": "Potential publication bias in SLR not quantified",
    357       "detail": "The paper acknowledges that publication bias likely inflates positive results in the SLR (Section 7.1) but does not attempt to quantify this bias or perform sensitivity analysis."
    358     }
    359   ],
    360   "cited_papers": [
    361     {
    362       "title": "Computing Education in the Era of Generative AI",
    363       "authors": ["Paul Denny", "James Prather", "Brett A. Becker"],
    364       "year": 2024,
    365       "relevance": "Foundational discussion of how GenAI is transforming computing education, providing context for understanding AI tool adoption in programming pedagogy."
    366     },
    367     {
    368       "title": "Studying the Effect of AI Code Generators on Supporting Novice Learners in Introductory Programming",
    369       "authors": ["Majeed Kazemitabaar"],
    370       "year": 2023,
    371       "doi": "10.1145/3544548.3580919",
    372       "relevance": "Empirical study on how AI code generators affect novice programmers, directly relevant to understanding GenAI impact on learning outcomes."
    373     },
    374     {
    375       "title": "CodeAid: Evaluating a Classroom Deployment of an LLM-based Programming Assistant that Balances Student and Educator Needs",
    376       "authors": ["Majeed Kazemitabaar"],
    377       "year": 2024,
    378       "doi": "10.1145/3613904.3642773",
    379       "relevance": "Evaluation of an LLM-based programming assistant deployed in a classroom, relevant to understanding agentic AI tools in education."
    380     },
    381     {
    382       "title": "Do Users Write More Insecure Code with AI Assistants?",
    383       "authors": ["Neil Perry"],
    384       "year": 2023,
    385       "relevance": "Empirical study on security implications of AI code assistants, relevant to evaluating risks and quality of AI-generated code."
    386     },
    387     {
    388       "title": "Lost at C: A User Study on the Security Implications of Large Language Model Code Assistants",
    389       "authors": ["Gustavo Sandoval"],
    390       "year": 2023,
    391       "relevance": "User study examining security implications of LLM code assistants, relevant to AI code generation safety evaluation."
    392     },
    393     {
    394       "title": "CodeHelp: Using Large Language Models with Guardrails for Scalable Support in Programming Classes",
    395       "authors": ["Mark Liffiton", "Brad E Sheese", "Jaromir Savelka", "Paul Denny"],
    396       "year": 2024,
    397       "doi": "10.1145/3631802.3631830",
    398       "relevance": "LLM-based programming tool with pedagogical guardrails, directly relevant to understanding safety and quality of AI-assisted coding tools."
    399     },
    400     {
    401       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    402       "authors": ["Shraddha Barke", "Michael B James", "Nadia Polikarpova"],
    403       "year": 2023,
    404       "doi": "10.1145/3586030",
    405       "relevance": "Empirical study of programmer interaction with AI code generation tools, relevant to understanding productivity and workflow impacts."
    406     },
    407     {
    408       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    409       "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"],
    410       "year": 2024,
    411       "doi": "10.1145/3597503.3608128",
    412       "relevance": "Large-scale survey on AI programming assistant usability, providing complementary data on developer experience with AI coding tools."
    413     },
    414     {
    415       "title": "It's Weird That it Knows What I Want: Usability and Interactions with Copilot for Novice Programmers",
    416       "authors": ["James Prather"],
    417       "year": 2023,
    418       "relevance": "Study on novice programmer interactions with GitHub Copilot, relevant to understanding how AI assistants affect programming learning."
    419     },
    420     {
    421       "title": "How Far Are We? The Triumphs and Trials of Generative AI in Learning Software Engineering",
    422       "authors": ["Rudrajit Choudhuri"],
    423       "year": 2024,
    424       "doi": "10.1145/3597503.3639201",
    425       "relevance": "Empirical investigation of GenAI effectiveness in software engineering education, directly relevant to the survey scope."
    426     },
    427     {
    428       "title": "How Beginning Programmers and Code LLMs (Mis)read Each Other",
    429       "authors": ["Sydney Nguyen"],
    430       "year": 2024,
    431       "doi": "10.1145/3613904.3642706",
    432       "relevance": "Study on mismatches between novice programmers and LLMs, relevant to understanding challenges in AI-assisted programming."
    433     }
    434   ]
    435 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs