scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27565B)
      1 {
      2   "paper": {
      3     "title": "Analysis of Student-LLM Interaction in a Software Engineering Project",
      4     "authors": [
      5       "Agrawal Naman",
      6       "Ridwan Shariffdeen",
      7       "Guanlin Wang",
      8       "Sanka Rasnayaka",
      9       "Ganesh Neelakanta Iyer"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2502.01273"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL, Zenodo archive, or any code release is mentioned in the paper. The analysis scripts, conversation data processing code, and complexity measurement tools are not provided."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The collected dataset of 730 code snippets, 62 ChatGPT conversations, and student annotations is not released. No download link or data repository is provided."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No environment specifications, dependency files, or library versions are mentioned. The tools used for analysis (Tree-sitter, VADER, complexity metrics) are named but no setup details are given."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the study design at a high level but does not include commands, scripts, or detailed procedures to replicate the analysis."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No confidence intervals or error bars are reported for any results. The paper presents density plots, histograms, and box plots but without uncertainty quantification. Figures show distributions but point estimates lack CI notation or ± values."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims (e.g., 'Copilot generated significantly more outliers than GPT', 'ChatGPT generates responses with lower computational complexity compared to CoPilot') but no statistical significance tests (t-tests, Mann-Whitney U, etc.) are reported. Comparisons are based solely on visual inspection of plots and raw percentages."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper provides raw percentages (e.g., '53.6% required minor intervention', '26% requiring no manual modification') but without formal effect size measures or baseline context sufficient for the reader to assess magnitude of differences."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The sample size of 126 students in 21 groups is stated but no justification for why this is adequate is given. No power analysis is discussed. The 62 ChatGPT conversations and 730 code snippets are described without justifying whether these counts are sufficient for the claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No standard deviations, interquartile ranges, or other variance measures are reported in numerical form. Box plots in figures show some distributional information visually, but no numerical variance or spread measures accompany the reported results."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The study compares ChatGPT-generated code against Copilot-generated code across complexity metrics, and also compares generated code against repository-integrated code. The comparison between the two LLM tools serves as a mutual baseline."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "ChatGPT and GitHub Copilot are contemporary tools that were actively used by students during the study period (2024 academic year). These represent current state-of-the-art LLM code generation tools."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "This is an observational study of student behavior, not a system with components that can be ablated. There is no system design with removable components."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The study uses four code complexity metrics: Total Lines of Code (LOC), Cyclomatic Complexity, Maximum Control Flow Graph (CFG) Depth, and Halstead Effort. It also uses Jaccard similarity for code comparison and VADER compound scores for sentiment analysis."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "There is no human evaluation of the quality of the LLM-generated code outputs. Students self-reported intervention levels (0, 1, 2) but there was no systematic expert review or human rating of code quality. The intervention levels are self-reported metadata, not a structured human evaluation of system outputs."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "This is an observational study analyzing student interaction data, not a benchmark evaluation. There is no train/test split concept applicable here."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Results are broken down by tool (ChatGPT vs Copilot), by milestone (MS1, MS2, MS3), by team (Table I, Table II), by code type (test vs functionality), and by intervention level. Per-team breakdowns are provided in Tables I and II."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses cases where AI-generated code was removed or refactored (teams 5, 8, 10, 13, 17 showed declining cumulative snippets), and notes that Copilot produced 'highly complex solutions, sometimes exceeding the average values by 40 to 50 times.' Mid-conversation frustration patterns are also discussed."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that 5 out of 21 teams did not use any LLMs despite having access, that Copilot occasionally produced unnecessarily complex code, that AI usage declined across milestones, and that sentiment analysis showed mid-conversation frustration. These are negative or mixed findings."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract's claims that ChatGPT generates lower complexity code (supported by Figure 2 and the density plots) and that conversational interaction improves code quality (supported by Figure 3 showing decreasing complexity over conversation) are backed by the reported results. The claim that students prefer ChatGPT over Copilot is the weakest of the three: 'preference' is inferred from usage patterns rather than stated preference, and the Table I usage data does not clearly favor ChatGPT (223 ChatGPT vs 507 Copilot snippets at MS3; 5 teams primarily using ChatGPT vs 6 primarily using Copilot)."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims such as 'conversational-based interaction helps improve the quality of the code generated' and 'ChatGPT's conversational interface enables users to iteratively refine prompts, resulting in more efficient code generation.' These are causal assertions from an observational study without controlling for confounds (e.g., self-selection: students who chose ChatGPT may differ from those who chose Copilot; task complexity differences are not controlled)."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The abstract states 'Early adoption of LLMs in software engineering is crucial to remain competitive' and 'the next generation of software engineers must acquire the necessary skills to interact with AI' — these are broad generalizations from a single-course study at one university with one specific project type (SPA). The title 'Analysis of Student-LLM Interaction in a Software Engineering Project' is appropriately scoped, but the conclusions extend well beyond the tested setting."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for its findings. For example, the lower complexity of ChatGPT code could be due to different task types being given to ChatGPT vs Copilot, or self-selection effects (students who prefer simpler code might choose ChatGPT). The declining usage across milestones could reflect project phase characteristics rather than student learning. The Threats to Validity section mentions self-reporting bias and VADER limitations but does not address alternative explanations for the core findings."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper refers to 'ChatGPT' and 'GitHub Copilot' without specifying model versions, snapshot dates, or API versions. No GPT model version (e.g., GPT-3.5, GPT-4, specific API version) is stated. The paper mentions that students used 'paid versions of ChatGPT via both the Chat and Playground interfaces' but does not say which underlying model was used."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper provides example prompts in Chat Listings 1, 2, and 3, showing actual prompt text used by students. While not all 318 messages are reproduced, representative examples with actual text are provided. However, this is a study of student prompting behavior, not a system that uses fixed prompts."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No temperature, top-p, max tokens, or other API settings are reported for either ChatGPT or Copilot. The paper does not mention what settings students used when interacting with the LLMs."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "The study evaluates students using ChatGPT and Copilot as third-party tools (black boxes). There is no custom agentic scaffolding built by the authors."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section II describes the data collection procedure: students tagged LLM-generated code with generator used, intervention level, and conversation links. The code extraction process follows their prior work [3]. The analysis pipeline for conversations is described: Tree-sitter extraction, Jaccard similarity calculation with LCS threshold of 90%, and the filtering approach for conversation analysis (excluding reused code, omitting zero-similarity conversations)."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section IV 'Threats to Validity' is a dedicated section discussing limitations of the study."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Threats to Validity section identifies specific threats: (1) voluntary self-reports may include underreporting or selective disclosure, (2) Copilot chat feature was not widely used limiting comparison, (3) VADER misclassifies technical terms as neutral, resulting in many near-zero scores. These are specific to this study rather than generic boilerplate."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. There are no explicit boundaries on generalization — no statement about the results being limited to this specific course, project type (SPA), university (NUS), or student population. The conclusions section makes broad recommendations without qualifying their scope."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The raw conversation logs, code snippets, student annotations, and team repository data are not released or made available for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section II describes the data collection: students tagged LLM-generated code with generator, intervention level, and conversation links at each milestone. Collection was cumulative across three milestones. ChatGPT access was through organizational accounts enabling monitoring. The process follows their prior work [3] with addition of conversation link collection."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The participants are 126 undergraduate students enrolled in a software engineering course at the National University of Singapore. The recruitment is implicit — they are course participants, not separately recruited. The study was conducted within the course with DERC approval. This is a convenience sample of an entire course cohort."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is described: students tag code at each milestone → code snippets extracted with metadata → conversations collected via organizational ChatGPT access → complexity metrics computed (LOC, cyclomatic complexity, CFG depth, Halstead effort) → similarity computed using Tree-sitter + LCS + Jaccard → sentiment analyzed with VADER. However, specific counts of filtered/removed data at each stage are not always provided."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or grant acknowledgment is mentioned in the paper. The university funded ChatGPT and Copilot access but no explicit funding disclosure section or acknowledgments section is present."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are listed as affiliated with School of Computing, National University of Singapore, with email addresses provided. They are evaluating tools (ChatGPT, Copilot) they have no commercial affiliation with."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so independence cannot be assessed. The paper mentions the university funded ChatGPT and Copilot access for the study but does not disclose whether there was any external research funding or state that the work was unfunded."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This study analyzes student interaction patterns with LLMs in an educational context. It does not evaluate model capability on any benchmark. The focus is on how students use the tools, not on testing model knowledge."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Not applicable — the study does not evaluate a pre-trained model on any benchmark. It is an observational study of student-LLM interaction patterns."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "Not applicable — no benchmark evaluation is conducted. The study examines student behavior and code characteristics, not model benchmark performance."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No pre-registration link or mention of pre-registration is found in the paper."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Section II states: 'In compliance with institutional guidelines, approval for our research was obtained from the Departmental Ethics Review Committee (DERC) before conducting the study.'"
    249       },
    250       "demographics_reported": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The paper states '126 undergraduate students' and '21 groups' but provides no demographic details such as year of study, prior programming experience, gender distribution, age, or other characteristics. The students are not characterized beyond being undergraduates in a software engineering course."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No inclusion or exclusion criteria for participants are stated. The implicit criterion is course enrollment, but no screening or selection criteria are described. Five teams that did not use LLMs are mentioned but it is unclear whether their data was excluded from all analyses."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "This is an observational study, not an experimental study with treatment conditions. Students were not randomly assigned to use ChatGPT or Copilot — they had access to both and chose which to use. Randomization is not applicable."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "This is an observational study, not an experimental study. There are no treatment conditions to blind. Students naturally chose which tools to use."
    269       },
    270       "attrition_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "The paper mentions 126 students in 21 groups but does not report whether all students completed the semester, whether any dropped out, or how many students in each team actually engaged with LLMs. Five teams did not use LLMs but attrition within teams is not discussed."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is an observational educational study, not a system or method with inference costs. The paper studies how students use existing tools, not a new method being proposed."
    281       },
    282       "compute_budget_stated": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is an observational educational study. The analysis involves standard complexity metric computation and sentiment analysis, not significant computational work requiring budget disclosure."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Students prefer ChatGPT over Copilot for code generation in software engineering projects.",
    292       "evidence": "Table I shows cumulative model usage, but ChatGPT total (223 snippets at MS3) is actually lower than Copilot (507 snippets at MS3). More teams used Copilot (6 primarily Copilot vs 5 primarily ChatGPT). The abstract claim of preference is not clearly supported by the usage data.",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "ChatGPT generates responses with lower computational complexity compared to Copilot.",
    297       "evidence": "Figure 1 (density plots) and Figure 2 (comparison box plots) show that Copilot-generated code has higher Halstead Effort and more outliers across complexity metrics. Section III.B discusses this finding.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Conversational-based interaction helps improve the quality of the code generated compared to auto-generated code.",
    302       "evidence": "Figure 3 shows average code complexity decreasing across conversation turns. Chat Listing 1 provides a qualitative example. However, no statistical tests compare conversational vs auto-generated code quality, and the comparison conflates tool differences with interaction mode differences.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "Students demonstrate improved prompting skills over the semester.",
    307       "evidence": "Chat Listings 2 and 3 show qualitative examples of prompts becoming more detailed and project-specific from MS1 to MS3. Figure 5 shows increasing similarity between generated and integrated code across milestones. However, this is based on qualitative examples and indirect evidence (similarity scores), not systematic measurement of prompt quality.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Sentiment in student-ChatGPT conversations follows a pattern of initial positivity, mid-conversation decline, and eventual recovery.",
    312       "evidence": "Figure 8 shows VADER compound scores over conversation progression estimated using LOESS smoothing. Section III.E describes this pattern. The Threats section notes VADER misclassifies technical terms as neutral.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Of the total 582,117 lines of code across all teams, 40,482 lines (6.95%) were produced with an LLM's help.",
    317       "evidence": "Stated in Section I as a direct measurement from the study data.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "observational",
    323     "case-study"
    324   ],
    325   "key_findings": "The study analyzed 126 undergraduate students using ChatGPT and Copilot over a 13-week software engineering course, finding that LLM-generated code accounted for 6.95% of total code. Copilot-generated code was more complex (higher Halstead effort) with more outliers than ChatGPT code. Iterative conversational interaction with ChatGPT reduced code complexity over the course of conversations. Students' prompting strategies improved across milestones, with later prompts being more specific and producing code more closely aligned with repository needs.",
    326   "red_flags": [
    327     {
    328       "flag": "No statistical tests for comparative claims",
    329       "detail": "The paper claims ChatGPT generates 'lower computational complexity' than Copilot and uses the word 'significantly' ('Copilot generated significantly more outliers'), but no statistical significance tests are reported. All comparisons are based on visual inspection of plots and raw percentages."
    330     },
    331     {
    332       "flag": "Self-selection confound",
    333       "detail": "Students chose which tool to use, creating a self-selection bias. Teams that preferred ChatGPT may differ systematically from teams that preferred Copilot (e.g., different task types, different skill levels). This confound is not discussed."
    334     },
    335     {
    336       "flag": "Self-reported intervention levels",
    337       "detail": "The human intervention data (levels 0, 1, 2) is self-reported by students, who may underreport modifications. The Threats section acknowledges this but the core analyses still rely on this data."
    338     },
    339     {
    340       "flag": "Causal claims from observational data",
    341       "detail": "The paper makes causal claims ('conversational-based interaction helps improve the quality', 'ChatGPT's conversational interface enables users to iteratively refine prompts, resulting in more efficient code generation') from a purely observational study without controlling for confounds."
    342     },
    343     {
    344       "flag": "Inconsistency between usage data and preference claim",
    345       "detail": "The abstract claims 'students prefer ChatGPT over CoPilot' but Table I shows Copilot had higher total usage (507 vs 223 snippets at MS3) and more teams primarily used Copilot (6 vs 5). The preference claim appears unsupported by the paper's own data."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "An empirical study on usage and perceptions of llms in a software engineering project",
    351       "authors": ["S. Rasnayaka", "G. Wang", "R. Shariffdeen", "G. N. Iyer"],
    352       "year": 2024,
    353       "doi": "10.1145/3643795.3648379",
    354       "relevance": "Prior work by the same authors studying LLM usage in SE education, providing the baseline methodology extended in this paper."
    355     },
    356     {
    357       "title": "ChatGPT for good? On opportunities and challenges of large language models for education",
    358       "authors": ["E. Kasneci", "K. Sessler", "S. Küchemann"],
    359       "year": 2023,
    360       "relevance": "Foundational work on LLM opportunities and challenges in education, relevant to understanding pedagogical impacts of AI tools."
    361     },
    362     {
    363       "title": "CodeAid: Evaluating a Classroom Deployment of an LLM-Based Programming Assistant that Balances Student and Educator Needs",
    364       "authors": ["M. Kazemitabaar", "R. Ye", "X. Wang", "A. Z. Henley", "P. Denny", "M. Craig", "T. Grossman"],
    365       "year": 2024,
    366       "doi": "10.1145/3613904.3642773",
    367       "relevance": "Evaluates an LLM-based programming assistant in a classroom setting, directly relevant to AI-assisted SE education."
    368     },
    369     {
    370       "title": "Threats to validity in llm-based software engineering research: Challenges and guidelines",
    371       "authors": ["X. Zhou", "Y. Xu", "Z. Jiang"],
    372       "year": 2024,
    373       "relevance": "Proposes guidelines for ensuring validity and reproducibility of LLM-based SE research, directly relevant to methodology quality assessment."
    374     },
    375     {
    376       "title": "A survey on llm-based agents for software engineering: Capabilities, challenges, and future directions",
    377       "authors": ["J. Liu", "K. Wang", "Y. Chen", "X. Peng", "Z. Chen", "L. Zhang", "Y. Lou"],
    378       "year": 2024,
    379       "arxiv_id": "2409.02977",
    380       "relevance": "Comprehensive survey of LLM-based agents for SE, relevant to understanding the broader landscape of AI tools in software engineering."
    381     },
    382     {
    383       "title": "StudentEval: A benchmark for evaluating llms on novice user prompts in educational settings",
    384       "authors": ["H. M. Babe", "S. Nguyen", "Y. Zi", "A. Guha", "M. Q. Feldman", "C. J. Anderson"],
    385       "year": 2024,
    386       "relevance": "Benchmark specifically for evaluating LLMs on novice prompts, relevant to understanding non-expert LLM interactions."
    387     },
    388     {
    389       "title": "Experimental evaluation of llms for unit test generation: Chatgpt vs. Pynguin",
    390       "authors": ["J. Smith", "E. Lee", "M. Davis"],
    391       "year": 2024,
    392       "relevance": "Compares LLM-based test generation against traditional tools, relevant to evaluating LLM code generation capabilities."
    393     },
    394     {
    395       "title": "CodePlan: Repository-level coding using llms and planning",
    396       "authors": ["R. Bairi", "A. Sonwane", "A. Kanade"],
    397       "year": 2024,
    398       "doi": "10.1145/3643757",
    399       "relevance": "Addresses repository-level code generation with LLMs, relevant to understanding LLM capabilities for larger software projects."
    400     }
    401   ]
    402 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs