scan.json (28424B)
1 { 2 "paper": { 3 "title": "Human and Machine: How Software Engineers Perceive and Engage with AI-Assisted Code Reviews Compared to Their Peers", 4 "authors": ["Adam Alami", "Neil A. Ernst"], 5 "year": 2025, 6 "venue": "IEEE/ACM International Conference on Cooperative and Human Aspects of Software Engineering (CHASE)", 7 "arxiv_id": "2501.02092", 8 "doi": "10.1109/CHASE66643.2025.00016" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No analysis code or scripts are released. A shared documents package at Zenodo (doi:10.5281/zenodo.14000259) contains code snippets, reviews, questionnaires, and member checking responses, but no analysis code." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "A shared documents package is released at Zenodo (Sect. III-E) containing code snippets and reviews submitted by participants, the interview pre-screening questionnaire, ChatGPT-generated reviews, member checking questionnaire and responses, and the interview guide. Interview transcripts are promised on acceptance but not yet available." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No specification of analysis tools or their versions. The paper mentions using Otter.ai for transcription and Zoom for interviews but provides no version information or reproducibility details for the qualitative analysis environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step instructions for reproducing the analysis. The methods section (Sect. III) describes the general process at a high level but does not provide specific instructions a researcher could follow to replicate the coding and analysis."
31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": false, 36 "answer": false, 37 "justification": "This is a qualitative interview study with no quantitative results requiring confidence intervals or error bars." 38 }, 39 "significance_tests": { 40 "applies": false, 41 "answer": false, 42 "justification": "This is a qualitative study that makes no statistical comparisons. All findings are thematic, derived from qualitative coding." 43 }, 44 "effect_sizes_reported": { 45 "applies": false, 46 "answer": false, 47 "justification": "No quantitative effect sizes are relevant to this qualitative interview study." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": true, 52 "justification": "The authors monitored data saturation throughout the analysis (Sect. III-C): 'we managed to observe when our Pattern Codes reoccur strongly in the data, hence reaching saturation.' They documented the saturation process and cite standard qualitative methodological references for this approach." 53 }, 54 "variance_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "No quantitative variance measures are relevant to this qualitative study." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The study design is inherently comparative: it juxtaposes software engineers' engagement with human peer reviews against LLM-generated reviews of the same code, providing a built-in comparison condition." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The comparison uses ChatGPT 4.0 (2024), which was contemporary at the time of the study (interviews conducted September 2024)." 70 }, 71 "ablation_study": { 72 "applies": false, 73 "answer": false, 74 "justification": "This is a qualitative interview study, not a system with components to ablate." 
75 }, 76 "multiple_metrics": { 77 "applies": false, 78 "answer": false, 79 "justification": "This is a qualitative study that does not use quantitative metrics. Findings are organized by thematic dimensions (cognitive, emotional, behavioral) rather than measured with metrics." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "The entire study centers on human evaluation: 20 software engineers evaluated and compared LLM-generated code reviews against peer-generated reviews, providing detailed qualitative assessments of both." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "Not applicable to a qualitative interview study — there are no test sets or validation splits." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Findings are broken down across multiple dimensions: cognitive, emotional, and behavioral engagement (Sect. IV-B), feedback content and delivery (Sect. IV-A), reviewer context (Sect. IV-C), and human-AI collaboration preferences (Sect. IV-D). Table III documents pattern codes and first cycle codes." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper discusses where LLM-assisted review falls short: higher cognitive load (P14), lack of codebase context (P18, P20), trust issues (P20), and excessive verbosity (P5's signal-to-noise concern). These are substantive failure cases of the AI integration." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Several negative findings about LLM-assisted review are reported: higher cognitive load ('ChatGPT review will take more time and effort to analyze,' P14), trust constraints ('I wouldn't trust ChatGPT,' P20), and lack of context ('AI might not be completely aware of what the context of the code you are writing,' P18)." 
105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract's claims — multi-dimensional engagement (cognitive, emotional, behavioral), less emotional regulation needed with LLM, higher cognitive load with LLM feedback, similar sense-making process, trust and context constraints — are all supported by findings in Section IV with participant quotes." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper uses causal language: 'The introduction of LLM-assisted review impacts some of these attributes' (abstract), 'feedback delivery itself triggers negative emotional responses' (Sect. V-B). The within-subjects comparison provides some support, but participants were not blinded to review source (they knew which was ChatGPT), order was not counterbalanced (peer reviews always first), and the study design cannot control for confounds like novelty or expectation effects." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title claims to study 'How Software Engineers Perceive and Engage with AI-Assisted Code Reviews' broadly, but the study involves only 20 Prolific-recruited participants in an artificial anonymous setting. While the paper uses hedging language ('in our sample') within the text, the title and implications (Sect. V) generalize broadly to 'SE' without adequate bounding." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section VII discusses specific alternative explanations: anonymity may have altered review behavior ('developers may align the tone of their feedback according to status, relationships, and team culture'), the artificial setting may not capture real-world complexity, and participants may have modified their approach given the research context." 
127 }, 128 "proxy_outcome_distinction": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper explicitly defines its construct: 'Engagement, in the context of our study, is the ways and the extent to which software engineers actively interact with, respond to, and incorporate feedback from the review process' (Sect. I). Self-reported perceptions are clearly framed as the measurement approach for this construct, and the paper does not overstate what interview data can show." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper refers to 'ChatGPT 4.0' throughout without specifying an API version, snapshot date, or model identifier (e.g., gpt-4-0613). 'ChatGPT 4.0' is ambiguous — it could refer to GPT-4 or GPT-4o, and model behavior changes across versions." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": true, 143 "justification": "The exact prompt is provided in Sect. III-B: 'You are an expert of [the programming language]. Provide a thorough review of the attached code.' The programming language fill values are documented in Table I for each participant. The full generated reviews are also shared in the Zenodo package." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": false, 148 "justification": "No mention of temperature, top-p, max tokens, or other API settings used when generating ChatGPT reviews." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "No agentic scaffolding was used. ChatGPT was prompted directly with a single prompt per code submission." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "The full pipeline from recruitment to analysis is documented: 500 initial participants → 353 after pre-screening → 76 after pre-selection → 20 interviewed (Sect. III-A). 
The analysis pipeline (First Cycle coding → Pattern Codes → reliability check → member checking) is also described in detail (Sect. III-C)." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section VII 'Limitations and Trade-offs' is a dedicated section discussing anonymity constraints, artificial setting, and participant behavior modification." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "The limitations are specific to this study: Prolific's anonymity requirement differs from real-world code review where 'developers are colleagues and interact directly on a daily basis'; the controlled environment 'may not capture the inherent complexity of a real-world setting'; participants may have 'modified their approach to the review compared to a professional setting.'" 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "The limitations section discusses threats but does not explicitly state what the results do NOT show or what populations/settings are excluded. There is no statement bounding the scope of the findings (e.g., not applicable to in-house teams, open-source communities, specific industries, etc.)." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": false, 182 "justification": "Interview transcripts (the core qualitative data) are not available — the paper states 'Interview transcripts will be published on acceptance' (Sect. III-E footnote 3). Supporting materials are available at Zenodo but the raw analytical data cannot be independently verified." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Data collection is described in detail (Sect. 
III-B): semi-structured interviews via Zoom, 40-60 minutes each, 17h12min total audio, 271 pages verbatim transcription using Otter.ai, conducted in first and second weeks of September 2024." 188 }, 189 "recruitment_methods_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "Recruitment is thoroughly documented (Sect. III-A): Prolific platform used, iterative pre-screening with programming tasks and critical-thinking questions following Alami et al.'s guidelines, AI-content detection applied, compensation specified (£0.50 for surveys, £60.00 for interviews). Potential Prolific platform bias is acknowledged." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The pipeline from recruitment (500 cap) → pre-screening (353 qualified) → pre-selection (76 agreed) → interviews (20 completed) → transcription → First Cycle coding → Pattern Codes → reliability check → member checking is fully documented with counts at each stage (Sects. III-A through III-C)." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Funding is disclosed in the Acknowledgment: 'This study was funded by the department of computer science at Aalborg University; research funding for tenure-track assistant professors.'" 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: Adam Alami at University of Southern Denmark, Neil Ernst at University of Victoria. Neither is affiliated with OpenAI or any AI tool company being discussed." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": true, 214 "justification": "The funder (Aalborg University, academic research funding) has no financial interest in whether LLM-assisted code review is perceived positively or negatively." 
215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": false, 225 "answer": false, 226 "justification": "This is an interview study investigating human perceptions, not evaluating a pre-trained model's capability on a benchmark." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": false, 230 "answer": false, 231 "justification": "This is an interview study investigating human perceptions, not evaluating a pre-trained model on a benchmark." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": false, 235 "answer": false, 236 "justification": "This is an interview study investigating human perceptions, not evaluating a pre-trained model on a benchmark." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": true, 242 "answer": false, 243 "justification": "No mention of pre-registration (e.g., OSF, AsPredicted). The study design and analysis plan were not committed to in advance." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section III-D states: 'Ethical approvals, as per the authors' university requirements, were obtained prior to the study commencing.'" 249 }, 250 "demographics_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Table I provides detailed demographics for all 20 participants: current role, experience level, gender, industry sector, programming language, assigned reviewers, and country." 
254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section III-A describes a detailed multi-stage screening process: programming task, critical-thinking question, problem-solving scenario, manual quality assessment, AI-content detection check, following Alami et al.'s prescreening guidelines." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "This is a qualitative interview study, not an experimental study with randomized conditions. All participants experienced both peer and LLM reviews in the same order." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "This is a qualitative observational study. Blinding was not feasible given the study design — participants needed to know the source of reviews to discuss their perceptions of human vs. LLM feedback." 269 }, 270 "attrition_reported": { 271 "applies": true, 272 "answer": false, 273 "justification": "76 participants agreed to interview but only 20 were interviewed. The paper does not explain the reduction from 76 to 20 or describe any attrition/dropout. For member checking, 19/20 responded (reported), but the initial attrition is unexplained." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "This is a qualitative interview study. There is no system whose inference cost is relevant." 281 }, 282 "compute_budget_stated": { 283 "applies": false, 284 "answer": false, 285 "justification": "This is a qualitative interview study with no significant computational component." 
286 } 287 } 288 }, 289 "scan_version": 3, 290 "active_modules": [], 291 "claims": [ 292 { 293 "claim": "Engagement in code review is multi-dimensional, spanning cognitive, emotional, and behavioral responses.", 294 "evidence": "Section IV-B presents evidence across all three dimensions with participant quotes: cognitive load varies with feedback clarity (P7, P14), emotional responses range from inspiration (P2) to feeling attacked (P6), and behavioral responses include reflection and revision (P2) or seeking clarification with defensiveness (P6).", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "LLM-assisted review is less emotionally taxing than peer review due to its consistent positive tone.", 299 "evidence": "Multiple participants report preferring ChatGPT's tone: 'ChatGPT is also polite. So as long as it's polite, I don't really mind' (P8), 'I don't think it ever will be harsh... it just made it easier to kind of accept what it was telling me' (P10), 'ChatGPT in that regards is better. I reacted positively to its tone' (P6). Reported in Sect. IV-A and IV-B.", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "Cognitive load is higher when processing LLM-generated feedback compared to peer feedback.", 304 "evidence": "P14 states: 'ChatGPT review will take more time and effort to analyze and to review, and others are more straightforward' (Sect. IV-A). P5 raises signal-to-noise concerns about LLM verbosity (Sect. IV-D). However, this is based on a small number of participant accounts.", 305 "supported": "weak" 306 }, 307 { 308 "claim": "Software engineers use a similar sense-making process to evaluate and adopt feedback regardless of whether it comes from peers or LLMs.", 309 "evidence": "P7 directly states: 'decision making, I still follow the same process I use with my peers' (Sect. IV-B). P5 describes line-by-line evaluation for both sources. 
However, the paper also notes that LLM feedback adoption is constrained by trust and context that differ from peer review.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "LLM feedback adoption is constrained by trust in LLM abilities and lack of codebase context.", 314 "evidence": "P20: 'if you go a bit deeper in more advanced topics, it doesn't really know what to do clearly... So I wouldn't trust ChatGPT' (Sect. IV-C). P18: 'AI might not be completely aware of what the context of the code you are writing' (Sect. IV-B). Multiple participants cite these constraints.", 315 "supported": "moderate" 316 }, 317 { 318 "claim": "Most participants prefer combining peer and LLM reviews rather than replacing human reviewers entirely.", 319 "evidence": "P13 suggests: 'I would use ChatGPT for a pre-review check... then put it out for review and then let a person look at it' (Sect. IV-B). P18 suggests combining to mitigate lack of context. The paper states 'most of our participants suggested combining both peers' and LLM's reviews' (Sect. IV-D).", 320 "supported": "moderate" 321 } 322 ], 323 "methodology_tags": ["qualitative"], 324 "key_findings": "Interview study of 20 software engineers reveals that code review engagement is multi-dimensional (cognitive, emotional, behavioral). LLM-assisted reviews reduce emotional burden through consistent positive tone but increase cognitive load due to verbosity and excessive detail. Engineers apply similar sense-making processes to both human and LLM feedback, but LLM adoption is constrained by trust and lack of codebase context. Most participants prefer combining peer and LLM reviews rather than full replacement of human reviewers.", 325 "red_flags": [ 326 { 327 "flag": "Small sample with broad claims", 328 "detail": "N=20 Prolific-recruited participants with broad title and implications about 'software engineers' generally. The implications section (Sect. 
V) proposes organizational changes (EI training, feedback constructiveness programs) based on 20 interviews in an artificial setting." 329 }, 330 { 331 "flag": "No blinding to review source", 332 "detail": "Participants knew which reviews came from ChatGPT and which from peers. This introduces expectation bias — participants may have responded differently to LLM reviews based on preconceptions about AI rather than the actual review content." 333 }, 334 { 335 "flag": "Order effect not controlled", 336 "detail": "All participants first discussed peer reviews, then were shown the ChatGPT review. No counterbalancing was used. Order effects (novelty, fatigue, anchoring to peer reviews) could systematically bias the comparison." 337 }, 338 { 339 "flag": "Prolific recruitment may not represent professional software engineers", 340 "detail": "Prolific participants who accept £60 interview studies may differ systematically from working software engineers. While the pre-screening was thorough, the authors acknowledge Prolific 'does not verify nor evaluate self-reported skills' (Sect. III-A)." 341 }, 342 { 343 "flag": "Artificial setting acknowledged but not mitigated", 344 "detail": "Anonymous reviews between strangers differ fundamentally from real code review where developers have relationships, shared context, and team dynamics. The authors acknowledge this in Sect. VII but the study design cannot capture these critical aspects of professional code review." 345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "Large language models for software engineering: Survey and open problems", 350 "authors": ["A. Fan", "B. Gokkaya", "M. Harman", "M. Lyubarskiy", "S. Sengupta", "S. Yoo", "J. M. Zhang"], 351 "year": 2023, 352 "relevance": "Comprehensive survey on LLMs for software engineering covering code generation, debugging, design, and code review." 353 }, 354 { 355 "title": "Information seeking using AI assistants", 356 "authors": ["E. Al Haque", "C. Brown", "T. D. 
LaToza", "B. Johnson"], 357 "year": 2024, 358 "arxiv_id": "2408.04032", 359 "relevance": "Identifies challenges with AI-generated content including excessive politeness, excessive detail, and lack of trustworthiness." 360 }, 361 { 362 "title": "Autonomy is an acquired taste: Exploring developer preferences for GitHub bots", 363 "authors": ["A. Ghorbani", "N. Cassee", "D. Robinson", "A. Alami", "N. A. Ernst", "A. Serebrenik", "A. Wąsowski"], 364 "year": 2023, 365 "relevance": "Studies developer preferences for bot interactions in software engineering, finding varying trust levels and need for customizable interactions." 366 }, 367 { 368 "title": "Automating code review activities by large-scale pre-training", 369 "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan", "S. Jannu", "G. Jenks"], 370 "year": 2022, 371 "relevance": "Technical work on automating code review using pre-trained models, creating datasets of pull requests and code problems." 372 }, 373 { 374 "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges", 375 "authors": ["J. T. Liang", "C. Yang", "B. A. Myers"], 376 "year": 2024, 377 "relevance": "Large-scale usability study of AI programming assistants finding developers get frustrated when AI lacks codebase context." 378 }, 379 { 380 "title": "Human-AI collaboration in software engineering: Lessons learned from a hands-on workshop", 381 "authors": ["M. Hamza", "D. Siemon", "M. A. Akbar", "T. Rahman"], 382 "year": 2024, 383 "relevance": "Studies human-AI collaboration expectations in SE, finding engineers expect a 'collaborative partner' not merely a tool." 384 }, 385 { 386 "title": "LLaMA-Reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning", 387 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], 388 "year": 2023, 389 "relevance": "Technical work on fine-tuning LLMs specifically for code review automation." 
390 }, 391 { 392 "title": "Using pre-trained models to boost code review automation", 393 "authors": ["R. Tufano", "S. Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"], 394 "year": 2022, 395 "relevance": "Explores using pre-trained neural models to automate aspects of the code review process." 396 }, 397 { 398 "title": "Cobots in knowledge work: Human–AI collaboration in managerial professions", 399 "authors": ["K. Sowa", "A. Przegalinska", "L. Ciechanowski"], 400 "year": 2021, 401 "relevance": "Studies human-AI collaboration in knowledge work, finding productivity increases when collaboration is enhanced." 402 }, 403 { 404 "title": "Explainable software bot contributions: Case study of automated bug fixes", 405 "authors": ["M. Monperrus"], 406 "year": 2019, 407 "relevance": "Found that without explanation, it was difficult to get humans to accept automated code changes from bots." 408 }, 409 { 410 "title": "An empirical investigation of relevant changes and automation needs in modern code review", 411 "authors": ["S. Panichella", "N. Zaugg"], 412 "year": 2020, 413 "doi": "10.1007/s10664-020-09870-3", 414 "relevance": "Points out that code review tasks frequently change, and statically trained LLMs may miss evolving requirements." 415 }, 416 { 417 "title": "Constructive code review: Managing the impact of interpersonal conflicts in practice", 418 "authors": ["P. Wurzel Goncalves", "J. SV Goncalves", "A. Bacchelli"], 419 "year": 2024, 420 "relevance": "Reports that negative feedback in code review causes interpersonal conflicts, directly relevant to the emotional engagement findings." 421 } 422 ], 423 "engagement_factors": { 424 "practical_relevance": { 425 "score": 1, 426 "justification": "Provides insights for team leads on integrating AI code review, but no concrete tool, technique, or implementation guidance." 
427 }, 428 "surprise_contrarian": { 429 "score": 1, 430 "justification": "The finding that LLM reviews reduce emotional load but increase cognitive load is mildly interesting but not deeply counterintuitive." 431 }, 432 "fear_safety": { 433 "score": 0, 434 "justification": "No AI risk, security, or safety concerns are raised." 435 }, 436 "drama_conflict": { 437 "score": 0, 438 "justification": "No controversy or provocative claims — findings are nuanced and balanced." 439 }, 440 "demo_ability": { 441 "score": 0, 442 "justification": "No code, tool, or demo to try — this is a qualitative interview study." 443 }, 444 "brand_recognition": { 445 "score": 1, 446 "justification": "Mentions ChatGPT and GitHub Copilot but the study itself is from lesser-known academic institutions." 447 } 448 } 449 }