scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25441B)
      1 {
      2   "paper": {
      3     "title": "AI-powered peer review process: An approach to enhance computer science students' engagement with code review in industry-based subjects",
      4     "authors": ["Eduardo Araujo Oliveira", "Shannon Rios", "Zhuoxuan Jiang"],
      5     "year": 2023,
      6     "venue": "ASCILITE 2023",
      7     "doi": "10.14742/apubs.2023.482"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, code archive, or link to the GitHub Actions workflow implementation is provided in the paper. The paper describes the GitHub Actions integration but does not release it."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset download link or data repository is provided. The review data, student reflections, and defect counts are reported only in aggregate in the paper."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency lists, or setup instructions are provided. The paper mentions gpt3.5-turbo and GitHub Actions but provides no technical setup details for reproduction."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are included. The paper describes the process at a high level (Figure 3) but does not provide enough detail to replicate the GitHub Actions integration or the prompt configuration."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported. The paper reports means and standard deviations but no CIs. Figures 4, 5, and 6 show bar charts without error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "A t-test is reported with p-value=0.01 for the difference in number of issues identified between genAI and peer reviews (Results section, 'Overall number and types of code defects')."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal effect sizes (Cohen's d, odds ratio, etc.) are reported. The paper provides means (26.713 vs. 9.512 for issues identified) which gives some context, but no standardized effect size measure is calculated despite running a t-test."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No power analysis or justification for the sample size of 80 students (16 groups) is provided. The paper does not discuss whether this sample is sufficient for the statistical tests performed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations are reported alongside means: 'an average of 26.713 (SD=27.412) issues identified compared to 9.512 (SD=3.013)' and 'a mean of 9.165 (SD=14) by genAI compared to a mean of 5.667 (SD 2.16) in peer reviews.'"
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The study compares genAI-powered code review against peer-to-peer code review as a baseline using a crossover design (Group A and Group B swap conditions between weeks 8 and 10)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Peer-to-peer code review is the standard practice in the educational context being studied. It is the contemporary and relevant baseline for this comparison."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "The system has essentially one component (the genAI-powered review via GitHub Actions with a prompt). There are no distinct subcomponents to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: number of reviews performed, number of issues identified, types of issues (documentation, visual representation, structure, logic, etc.), severity of issues (trivial vs. critical), and number of issues fixed."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Human evaluation is included through student self-reflection reports (400-word individual reports) and qualitative analysis of student feedback about the genAI review process. However, the severity classification of defects was assessed by students, not independently verified by the researchers."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is an educational intervention study, not a machine learning evaluation. There is no train/test split concept applicable here."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by issue type (documentation, visual representation, structure, new functionality, resource, check, interface, logic defects) in Figure 5, and by severity in Figure 6."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses where genAI failed: it performed poorly at identifying structural and logic issues compared to peer review (Figure 5), and students reported that genAI feedback was sometimes 'very generic' (student quotes in Results section). The paper also notes genAI lacked project context."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that peer reviewers outperformed genAI at identifying structural and logic issues, and that genAI feedback was sometimes generic or inaccurate. Student quotes critical of genAI are included."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that genAI 'significantly increased students' engagement in code review' and 'could also identify a larger number of code issues in short times, leading to more fixes.' Both are supported by the results: 70% of reviews used genAI, a t-test (p=0.01) showed more issues identified, and more issues were fixed."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims ('genAI-powered reviewing process significantly increased students' engagement') but the crossover design has confounds: the intervention was not blinded, participation was voluntary (self-selection bias), and engagement was incentivized by assessment marks (2 points for code reviews). The higher number of genAI reviews could reflect ease-of-use rather than genuine engagement. No attempt is made to control for these confounds."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract and conclusion make broad claims about 'CS students' engagement with code review' and implications for 'higher education settings' generally. However, the study involves only 80 Master's students at one university, in one capstone subject, using gpt3.5-turbo. These boundary conditions are not prominently stated when making claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss key alternative explanations: the novelty effect of using AI, the incentive structure (assessment marks for code reviews), self-selection bias (only 80 of 170 students participated), or whether the higher volume of genAI reviews reflects lower effort per review rather than genuine engagement. The discussion section offers some nuance about genAI limitations but does not systematically consider confounds."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'gpt3.5-turbo' as the model used, identified as 'OpenAI's v2 GPT API (gpt3.5-turbo)' in the Methods section. No snapshot date is given, and the model served under the gpt-3.5-turbo identifier changes over time, so this is borderline; however, the paper does at least name the specific API model."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 2b shows the prompt created to automate the code review process with generative AI. The actual prompt text appears to be provided in the figure, making it possible to see what was sent to the model."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for the GPT API calls. These settings significantly affect output quality and consistency."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The GitHub Actions integration workflow is described in the Methods section and Figure 3, showing the trigger mechanism (pull request approval), the code review process flow, and how results are stored in the GitHub repository."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not document how raw data (GitHub review outputs, spreadsheet entries, student reflections) was processed into the aggregate statistics reported. The classification of issues into defect categories and severity levels is not described in terms of methodology — only that it was 'extracted from data provided by students and from responses generated by genAI.'"
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The Discussion section includes a paragraph beginning 'This study has two main limitations' that discusses (1) the lack of collaborative learning in genAI reviews and (2) the generic/inaccurate nature of genAI feedback."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The two stated limitations are specific to the genAI tool's capabilities (lack of context, generic feedback) but do not address threats to the study's validity: self-selection bias (only 80/170 students participated), assessment incentives confounding engagement, novelty effects, lack of blinding, or the small number of groups (16). These are methodological threats, not tool limitations."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. There is no statement bounding the generalizability to this specific setting (one university, one subject, Master's students, gpt-3.5-turbo). The conclusion makes broad claims about 'CS students' engagement' without scoping caveats."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (individual review outputs, student reflections, defect classifications) is made available. Only aggregate statistics are reported in the paper."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The Data Collection section describes the process: students were randomly assigned to two groups with a crossover design, peer reviews used spreadsheets on the LMS, genAI reviews were documented on GitHub, and self-reflection reports were submitted at week 12."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The Participants section states: 'Participants were recruited via Canvas Learning Management System (LMS) and provided informed consent (Ethics approval #24272). The sample included a total of 80 (out of 170) students enrolled in a Software Project subject.'"
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not document how it went from raw review data to the reported statistics. How were defects classified into categories? Who classified them? How were student reflections selected for quotation? The paper states 'This classification was extracted from data provided by students and from responses generated by genAI' but does not explain the extraction process."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of funding at all."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are disclosed: all three authors are from The University of Melbourne, which is the institution where the study was conducted."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed or apparent. The study appears to be conducted as part of normal teaching activities at the University of Melbourne without external funding."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses GPT-3.5-turbo as a tool in an educational intervention; the study measures student engagement outcomes, not model knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This is not a benchmark evaluation study, so train/test overlap does not apply. The study measures the effect of an AI tool on student engagement, not the model's performance on a test set."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed, so benchmark contamination does not apply. The study is an educational intervention, not a model capability evaluation."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry) is found in the paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Ethics approval is mentioned: 'Participants were recruited via Canvas Learning Management System (LMS) and provided informed consent (Ethics approval #24272).'"
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Basic demographics are reported: 80 students total (out of 170 enrolled), 56 males and 24 females, all enrolled in the MIT program's Software Project capstone subject at the University of Melbourne, working in teams of five."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria are stated beyond enrollment in the subject. The paper does not explain why only 80 out of 170 students participated or whether any students were excluded after consent."
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The paper states 'Participants were randomly assigned to two groups' but provides no detail about the randomization procedure: how randomization was done, at what level (individual or team), or what tool was used. Given that students work in teams of 5, it is unclear whether randomization was at the team or individual level."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No blinding is described. Students clearly knew which condition they were in (using a spreadsheet for peer review vs. GitHub Actions for genAI review). No discussion of whether this knowledge could affect outcomes."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "Attrition is not reported. The paper states 80 students participated but does not indicate whether all 80 completed both weeks of data collection, or whether any groups dropped out. The number of reviews performed decreased from week 8 (19 total) to week 10 (17 total) with no explanation."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs or inference latency for the GPT-3.5-turbo calls are reported. The paper mentions genAI completed reviews 'in just a few seconds' (student quote) but provides no systematic cost data."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, API spend, or resource usage is reported for running the 25 genAI-powered code reviews (out of 36 reviews in total)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "GenAI-powered code review significantly increased students' engagement compared to peer review, with 70% of all reviews conducted using genAI.",
    286       "evidence": "Results section reports 25 of 36 total reviews used genAI (13/19 in week 8, 12/17 in week 10). Some teams performed multiple genAI reviews voluntarily. However, this is a crossover design where both groups eventually use both methods, making the 70% figure partly a design artifact.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "GenAI-powered reviews identified significantly more code issues than peer reviews (mean 26.7 vs 9.5, p=0.01).",
    291       "evidence": "A t-test with p=0.01 is reported for the difference in issues identified. Means and SDs are provided (26.713 SD=27.412 for genAI vs. 9.512 SD=3.013 for peer). The very large SD for genAI (larger than the mean) suggests high variability.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "GenAI-powered reviews led to more fixes than peer reviews (mean 9.165 vs 5.667 issues fixed).",
    296       "evidence": "Means and SDs reported (9.165 SD=14 for genAI, 5.667 SD=2.16 for peer). No significance test is reported for this comparison. The SD of 14 for genAI is larger than the mean, indicating extremely high variability.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Human reviewers demonstrate superior proficiency in identifying logical and structural issues compared to genAI.",
    301       "evidence": "Figure 5 shows peer review identified more structural and logic issues on average. No significance test is reported for this specific comparison. The observation is qualitatively supported by student reflections about genAI lacking project context.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "GenAI could identify more documentation, visual representation, and check defects than peer review.",
    306       "evidence": "Figure 5 shows the category breakdown. No significance test for individual categories. The claim is supported only by visual comparison of bar charts.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["rct", "qualitative"],
    311   "key_findings": "This study compared AI-powered code review (using GPT-3.5-turbo via GitHub Actions) with peer-to-peer code review among 80 CS master's students in a capstone subject. GenAI-powered reviews identified significantly more issues overall (mean 26.7 vs. 9.5, p=0.01) but performed worse at identifying structural and logical issues. Students reported genAI feedback was sometimes generic but appreciated its speed. The crossover design is a strength, but the study has significant confounds including self-selection bias, assessment incentives, lack of blinding, and very high variance in genAI results.",
    312   "red_flags": [
    313     {
    314       "flag": "Self-selection bias",
    315       "detail": "Only 80 out of 170 enrolled students participated. The paper does not analyze whether participating students differ systematically from non-participants, which could strongly bias engagement metrics."
    316     },
    317     {
    318       "flag": "Assessment incentives confound engagement",
    319       "detail": "Code reviews were worth 2 marks out of 15 per sprint. Students were incentivized to perform reviews for grades, not just engagement. The higher rate of genAI reviews may reflect that they were easier to perform (automated via GitHub Actions) rather than genuine engagement improvement."
    320     },
    321     {
    322       "flag": "Very high variance in genAI results",
    323       "detail": "The SD for genAI issues identified (27.412) exceeds the mean (26.713), and the SD for issues fixed (14) far exceeds its mean (9.165). This suggests the genAI results are highly inconsistent, with some teams getting many issues and others very few, undermining the aggregate comparison."
    324     },
    325     {
    326       "flag": "Crossover design reported as one-way comparison",
    327       "detail": "Despite using a crossover design (Group A and B swap conditions), the analysis pools all genAI reviews vs. all peer reviews rather than analyzing within-group changes across weeks. This wastes the power of the crossover design and does not control for time/learning effects."
    328     },
    329     {
    330       "flag": "Severity assessed by students without verification",
    331       "detail": "The paper states 'the severity of the issue identified was evaluated by students and the researchers did not verify the students' assessment.' This means the accuracy and severity data are unverified self-reports."
    332     },
    333     {
    334       "flag": "No significance tests for most comparisons",
    335       "detail": "Only one t-test (p=0.01) is reported for overall issue count. No significance tests for issues fixed, category breakdowns, severity comparisons, or the engagement claim (70% genAI). Multiple claims rest on visual inspection of bar charts."
    336     }
    337   ],
    338   "cited_papers": [
    339     {
    340       "title": "Expectations, outcomes, and challenges of modern code review",
    341       "authors": ["A. Bacchelli", "C. Bird"],
    342       "year": 2013,
    343       "doi": "10.1109/ICSE.2013.6606617",
    344       "relevance": "Foundational study on code review practices in software engineering, relevant to understanding baseline code review effectiveness."
    345     },
    346     {
    347       "title": "A review of peer code review in higher education",
    348       "authors": ["T. D. Indriasari", "A. Luxton-Reilly", "P. Denny"],
    349       "year": 2020,
    350       "doi": "10.1145/3403935",
    351       "relevance": "Survey of peer code review in education, directly relevant to evaluating automated code review as an alternative."
    352     },
    353     {
    354       "title": "Impersonating chatbots in a code review exercise to teach software engineering best practices",
    355       "authors": ["J. C. Farah", "B. Spaenlehauer", "V. Sharma", "M. J. Rodriguez-Triana", "S. Ingram", "D. Gillet"],
    356       "year": 2022,
    357       "doi": "10.1109/EDUCON52537.2022.9766793",
    358       "relevance": "Prior work on using chatbots for code review in education, a direct predecessor to the genAI approach evaluated in this paper."
    359     },
    360     {
    361       "title": "Is there a Need for Automated Code Review to be Used in Teaching?: From the perspective of students",
    362       "authors": ["C. Kaufmann", "J. Pavão", "H. Wahl"],
    363       "year": 2022,
    364       "doi": "10.23919/CISTI54924.2022.9820030",
    365       "relevance": "Studies student perspectives on automated code review in education, directly relevant to the engagement question in this paper."
    366     },
    367     {
    368       "title": "Competencies for Code Review",
    369       "authors": ["P. Wurzel Gonçalves", "G. Calikli", "A. Serebrenik", "A. Bacchelli"],
    370       "year": 2023,
    371       "doi": "10.1145/3579471",
    372       "relevance": "Identifies competencies needed for effective code review, relevant to understanding what skills AI-powered review develops or bypasses."
    373     },
    374     {
    375       "title": "Teaching code review management using branch based workflows",
    376       "authors": ["S. Krusche", "M. Berisha", "B. Bruegge"],
    377       "year": 2016,
    378       "doi": "10.1145/2889160.2889191",
    379       "relevance": "Prior work on teaching code review practices to students, relevant to the educational context of this study."
    380     },
    381     {
    382       "title": "Software engineering at google",
    383       "authors": ["F. Henderson"],
    384       "year": 2020,
    385       "relevance": "Describes industrial code review practices at scale, providing context for how professional code review compares to the educational genAI approach."
    386     }
    387   ]
    388 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs