scan-v5.json (27819B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Human and Machine: How Software Engineers Perceive and Engage with AI-Assisted Code Reviews Compared to Their Peers", 6 "authors": [ 7 "Adam Alami", 8 "Neil A. Ernst" 9 ], 10 "year": 2025, 11 "venue": "IEEE/ACM International Conference on Cooperative and Human Aspects of Software Engineering", 12 "arxiv_id": "2501.02092", 13 "doi": "10.1109/CHASE66643.2025.00016" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All major abstract claims (multi-dimensional engagement, less emotional burden with LLM, higher cognitive load, similar sense-making, trust constraints) are substantiated by specific interview quotes and pattern codes in the findings section.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": false, 26 "justification": "The paper frames findings causally ('LLM-assisted review impacts engagement attributes', 'constructive feedback reduces cognitive load') but the qualitative interview design can only establish perceptions, not causation; no causal inference mechanism is present.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": false, 32 "justification": "Implications propose sweeping organizational changes (EI training in all SE programs, AI personalization features across the industry) based on 20 Prolific-recruited participants in an artificial anonymous setting, without bounding recommendations to the study's limited scope.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper does not systematically consider alternatives to its main narrative; for example, higher cognitive load with LLM could reflect novelty effects or the specific generic prompt used rather than intrinsic LLM properties, but 
this is not explored.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "Claims consistently match measurement granularity — findings are framed as engineers' self-reported perceptions and experiences, not as objective measures of productivity or code quality.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section VII 'LIMITATIONS AND TRADE-OFFS' is a dedicated section discussing specific study constraints.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Specific threats are identified: anonymous review setup differs from real workplaces with established relationships and status hierarchies; artificial controlled setting may not capture real-world complexity; participants may have modified behavior due to research context.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": false, 64 "justification": "While limitations are acknowledged, the implications section makes broad recommendations without explicit scope boundaries on what the results do NOT show (e.g., longitudinal adoption patterns, team-level effects, productivity outcomes are all unaddressed).", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Acknowledgment section states: 'This study was funded by the department of computer science at Aalborg University; research funding for tenure-track assistant professors.'", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations are clearly stated on the title page: Mærsk Mc-Kinney Møller Institute (University of Southern Denmark) and Department of Computer Science (University of 
Victoria).", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "The funder is a university computer science department with no commercial interest in the outcome of a study comparing human vs. LLM code reviews.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, consulting) anywhere in the paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "'Engagement' is explicitly defined as 'the ways and the extent to which software engineers actively interact with, respond to, and incorporate feedback from the review process'; cognitive, emotional, and behavioral dimensions are each elaborated in the findings.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper explicitly states it contributes by identifying engagement dimensions in code review (cognitive, emotional, behavioral) and how LLM introduction influences these attributes.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section II provides substantive engagement with prior work on human-tool integration, static analysis bot acceptance, LLM usability challenges, and developer bot preferences, positioning the contribution relative to existing literature.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": false, 120 "answer": false, 121 "justification": "Qualitative interview study with no computational analysis pipeline; no software code to release. 
Interview materials shared via Zenodo but no analysis scripts exist.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "A Zenodo package (doi.org/10.5281/zenodo.14000259) contains interview guide, code snippets, and ChatGPT-generated reviews, but interview transcripts — the primary data — are stated to be published only 'on acceptance,' indicating incomplete availability at submission.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "ChatGPT 4.0 is identified as the LLM used for generating reviews but no API version snapshot, deployment date, or model configuration details are provided.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "The interview guide is shared and the procedure is described in prose, but step-by-step instructions sufficient to replicate the full study (Prolific configuration, pre-screening thresholds, coder training) are absent.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": false, 146 "answer": false, 147 "justification": "Qualitative study; no quantitative results requiring confidence intervals are reported.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": false, 152 "answer": false, 153 "justification": "Qualitative interview study; statistical significance tests are not applicable.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": false, 158 "answer": false, 159 "justification": "Qualitative study; effect sizes are not applicable.", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "The paper monitors data saturation but provides no formal justification for why n=20 is sufficient, nor discussion of what subgroup 
analyses or comparisons the sample cannot support.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": false, 170 "answer": false, 171 "justification": "Qualitative study; variance measures are not applicable.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Within-subjects design: each engineer evaluates both human peer reviews and LLM-generated reviews of the same code, enabling direct comparison.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "ChatGPT 4.0 was the most capable widely-accessible LLM at the time of data collection (August–September 2024).", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": false, 190 "answer": false, 191 "justification": "Qualitative interview study; ablation study is not applicable.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "Engagement assessed across three dimensions (cognitive, emotional, behavioral) plus sense-making process and reviewer context as additional analytical lenses.", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": true, 203 "justification": "The entire study is human evaluation — 20 engineers evaluate and reflect on both human-written and LLM-generated review feedback on their own code.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": false, 208 "answer": false, 209 "justification": "Not a prediction task; held-out test sets are not applicable to this qualitative interview study.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Findings are systematically broken down by engagement dimension (cognitive, emotional, behavioral), sense-making, reviewer context (seniority, familiarity, 
LLM trust), and human-AI collaboration preferences.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "The paper explicitly discusses LLM review failure modes: excessive verbosity increasing cognitive load, lack of codebase-specific context, and trust deficits that constrain adoption.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "The paper honestly reports constraints on LLM adoption (trust issues, lack of context, poor signal-to-noise ratio) and negative emotional responses to harsh peer feedback as balanced findings.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": false, 235 "justification": "'ChatGPT 4.0' is mentioned but without a snapshot date, API version, or deployment identifier — this is a marketing name that does not uniquely identify the model used.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": true, 241 "justification": "The exact ChatGPT prompt is explicitly provided: 'You are an expert of [the programming language]. 
Provide a thorough review of the attached code.'", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": false, 247 "justification": "No hyperparameters (temperature, top-p, context window) are reported for the ChatGPT usage.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "ChatGPT is used directly as a black-box tool for generating reviews; no agentic scaffolding is involved.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "The qualitative analysis procedure is described in detail: First Cycle inductive coding, Second Cycle pattern coding synthesis, peer reliability check by second author, and iterative saturation monitoring.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": false, 267 "justification": "Primary data (interview transcripts, 271 pages) are stated to be published 'on acceptance' — conditional availability; the Zenodo package does not confirm transcripts are currently accessible.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Data collection is thoroughly described: Prolific recruitment, two-phase pre-screening process, code submission, reviewer assignment, semi-structured interview structure (Table II), Zoom recording, and Otter.ai transcription.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "Recruitment via Prolific is described in detail including iterative pre-screening with programming task, critical-thinking question, AI-detection check, manual quality evaluation, and the specific numbers at each stage (500→353→76→20).", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 
284 "answer": true, 285 "justification": "Full pipeline documented: audio recording → Otter.ai transcription → First Cycle coding → Second Cycle pattern coding → second-author review → saturation monitoring → member checking with 19/20 responses.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "This study investigates human perceptions, not model benchmark performance; training cutoff contamination concerns are not applicable.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": false, 298 "answer": false, 299 "justification": "Not applicable; the study does not evaluate model capabilities on benchmarks.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": false, 304 "answer": false, 305 "justification": "No benchmark evaluation; contamination concerns are not applicable to this qualitative interview study.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": true, 312 "answer": false, 313 "justification": "There is no mention of pre-registration of study design, hypotheses, or analysis plan anywhere in the paper.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": true, 318 "answer": true, 319 "justification": "Section III-D explicitly states: 'Ethical approvals, as per the authors' university requirements, were obtained prior to the study commencing.'", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": true, 324 "answer": true, 325 "justification": "Table I provides demographics for all 20 participants: role, experience level, gender, industry sector, programming language, assigned reviewers, and country.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": true, 330 "answer": true, 331 "justification": "Pre-screening criteria are described: programming task, 
critical-thinking question, problem-solving scenario, AI-generated content detection, and manual quality evaluation to filter genuine software engineers.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "This is not a randomized experiment; reviewer assignment aimed for diversity in experience and demographics but was not a formal randomization procedure.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "Blinding is not applicable to this semi-structured qualitative interview design.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": true, 348 "answer": false, 349 "justification": "76 participants agreed to interviews in pre-selection but only 20 were interviewed; the paper does not document how the final 20 were selected from 76 willing participants or whether any dropped out.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": false, 356 "answer": false, 357 "justification": "Inference cost for ChatGPT reviews is not relevant to this study's focus on human perceptions of feedback.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": false, 362 "answer": false, 363 "justification": "No computational budget is relevant to this qualitative interview study.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Engagement in code review is multi-dimensional, spanning cognitive, emotional, and behavioral responses.", 372 "evidence": "Interview data from 20 engineers reveals distinct patterns: cognitive effort in processing feedback, emotional reactions ranging from inspiration to feeling attacked, and behavioral responses including seeking clarification or implementing changes.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "LLM-assisted code review is less emotionally 
taxing than peer review due to ChatGPT's consistent professional tone.", 377 "evidence": "Multiple participants contrast ChatGPT's polite tone favorably with potentially harsh human feedback; P8: 'ChatGPT would never be picky'; P10: 'it just made it easier to kind of accept what it was telling me.'", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "LLM-generated reviews require higher cognitive load to process than human peer reviews.", 382 "evidence": "P14: 'ChatGPT review will take more time and effort to analyze and review'; multiple participants describe LLM feedback as verbose and requiring more mental effort to evaluate.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Software engineers apply a similar sense-making process to evaluate feedback from both peers and LLMs.", 387 "evidence": "P7: 'decision making, I still follow the same process I use with my peers'; P5 describes identical line-by-line evaluation regardless of feedback source.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "LLM adoption in code review is constrained by trust deficits and lack of codebase-specific context.", 392 "evidence": "P20: 'I wouldn't trust ChatGPT' for advanced topics; P18 notes 'AI might not be completely aware of what the context of the code you are writing … what are your future goals.'", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Most engineers prefer combining peer and LLM reviews rather than replacing one with the other.", 397 "evidence": "P13 would use ChatGPT for pre-review check but still wants a human reviewer; P18 suggests combining both to compensate for LLM's lack of context.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Constructive feedback delivery reduces cognitive load and negative emotional responses, improving behavioral engagement.", 402 "evidence": "P7 found P6's professional tone 'easy to digest'; P2 showed positive behavioral engagement (revisiting code, implementing changes) after constructive feedback, 
contrasted with P6's defensive response to 'brutal' feedback.", 403 "supported": "moderate" 404 } 405 ], 406 "methodology_tags": [ 407 "qualitative", 408 "observational" 409 ], 410 "key_findings": "Code review engagement is multi-dimensional (cognitive, emotional, behavioral), with peer reviews triggering stronger emotional responses — both positive (inspiration, support) and negative (feeling attacked, resignation) — requiring active emotional regulation strategies like mindfulness and non-confrontational values. LLM-assisted reviews (ChatGPT 4.0) reduce emotional burden through consistent professional tone but increase cognitive load due to verbosity and excessive detail. Engineers apply the same sense-making process to evaluate both human and LLM feedback, but LLM adoption is constrained by trust deficits and lack of codebase-specific context. Most engineers prefer a hybrid model combining LLM pre-review with human peer review rather than full replacement, valuing the human expertise and relational dimension that LLMs cannot replicate.", 411 "red_flags": [ 412 { 413 "flag": "Small n with broad implications", 414 "detail": "n=20 Prolific-recruited participants in an artificial setting, yet implications recommend organization-wide EI training and AI personalization features across the SE industry without bounding to the study's limited scope." 415 }, 416 { 417 "flag": "Artificial anonymous study setting", 418 "detail": "Reviews were conducted among strangers maintaining anonymity, fundamentally differing from real workplaces where established relationships, status hierarchies, and team culture shape feedback dynamics — the paper acknowledges this but the implications section does not account for it." 
419 }, 420 { 421 "flag": "Single LLM, single generic prompt", 422 "detail": "Only ChatGPT 4.0 with one generic prompt ('You are an expert… Provide a thorough review') was tested; the higher cognitive load finding may reflect this prompt's verbosity rather than LLMs in general." 423 }, 424 { 425 "flag": "No pre-registration", 426 "detail": "This human subjects study with inductive qualitative analysis was not pre-registered, leaving the analytical framework open to post-hoc refinement and theme selection without a documented prior plan." 427 }, 428 { 429 "flag": "Unexplained participant attrition", 430 "detail": "76 participants agreed to interviews in pre-selection but only 20 were actually interviewed; the selection or dropout process accounting for this 74% reduction is not documented." 431 }, 432 { 433 "flag": "Conditional data availability", 434 "detail": "Primary data (271 pages of interview transcripts) is stated to be published 'on acceptance' rather than being currently available, preventing independent verification of the qualitative coding at submission time." 435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "Large Language Models for Software Engineering: Survey and Open Problems", 440 "relevance": "Surveys LLM applications in SE including code review and generation; directly contextualizes the LLM code review integration investigated in this study." 441 }, 442 { 443 "title": "Autonomy is an Acquired Taste: Exploring Developer Preferences for GitHub Bots", 444 "relevance": "Studies developer preferences for automated/bot interactions in SE workflows; directly parallel to this study's findings on LLM interaction preferences and customization needs." 445 }, 446 { 447 "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges", 448 "relevance": "Large-scale empirical study of developer experiences with AI coding tools; provides comparative context for the usability and trust findings reported here." 
449 }, 450 { 451 "title": "Information Seeking Using AI Assistants", 452 "relevance": "Studies human-AI interaction in coding including challenges of AI being overly polite and lacking trustworthiness — findings directly echoed in this paper." 453 }, 454 { 455 "title": "Human-AI Collaboration in Software Engineering: Lessons Learned from a Hands-on Workshop", 456 "relevance": "Explores SE engineers' expectations of AI as a 'collaborative partner,' complementing this study's engagement and adoption findings." 457 }, 458 { 459 "title": "Constructive Code Review: Managing the Impact of Interpersonal Conflicts in Practice", 460 "relevance": "Examines interpersonal conflict in code review and constructive feedback management, directly informing the emotional engagement findings and implication on feedback constructiveness." 461 }, 462 { 463 "title": "An Empirical Investigation of Relevant Changes and Automation Needs in Modern Code Review", 464 "relevance": "Studies code review processes and automation needs, providing baseline understanding of where LLMs fit in the review workflow." 465 }, 466 { 467 "title": "Are You a Real Software Engineer? Best Practices in Online Recruitment for Software Engineering Studies", 468 "relevance": "Methodology paper on Prolific recruitment for SE studies; the present study explicitly follows its prescreening recommendations." 469 } 470 ], 471 "engagement_factors": { 472 "practical_relevance": { 473 "score": 2, 474 "justification": "Findings on the cognitive-emotional trade-off in LLM code review are directly actionable for engineering teams and tool designers deciding how to integrate AI review tools." 475 }, 476 "surprise_contrarian": { 477 "score": 2, 478 "justification": "The finding that LLM reviews reduce emotional burden but increase cognitive load challenges the naive assumption that AI efficiency uniformly benefits collaborative SE processes." 
479 }, 480 "fear_safety": { 481 "score": 1, 482 "justification": "Mild concern about the human and social costs of replacing peer reviews with AI, but the paper is not focused on safety risks." 483 }, 484 "drama_conflict": { 485 "score": 1, 486 "justification": "Some tension in the human-vs-machine framing and the emotional costs of peer feedback, but the paper takes a balanced and non-controversial stance throughout." 487 }, 488 "demo_ability": { 489 "score": 1, 490 "justification": "The ChatGPT prompt is shareable but the full study methodology involves elaborate Prolific recruitment, code submission, and multi-phase interviews that are not easily replicable by readers." 491 }, 492 "brand_recognition": { 493 "score": 1, 494 "justification": "No famous lab affiliation; IEEE/ACM CHASE is a specialized venue. ChatGPT brand recognition in the study provides some hookability but not strongly." 495 } 496 }, 497 "hn_data": { 498 "threads": [ 499 { 500 "hn_id": "46389626", 501 "title": "Fisher Information in Kinetic Theory", 502 "points": 1, 503 "comments": 0, 504 "url": "https://news.ycombinator.com/item?id=46389626" 505 } 506 ], 507 "top_points": 1, 508 "total_points": 1, 509 "total_comments": 0 510 } 511 }