scan.json (24996B)
1 { 2 "paper": { 3 "title": "Quo Vadis, Code Review? Exploring the Future of Code Review", 4 "authors": [ 5 "Michael Dorner", 6 "Andreas Bauer", 7 "Darja Šmite", 8 "Lukas Thode", 9 "Daniel Méndez", 10 "Ricardo Britto", 11 "Stephan Lukasczyk", 12 "Ehsan Zabardast", 13 "Michael Kormann" 14 ], 15 "year": 2025, 16 "venue": "arXiv.org", 17 "arxiv_id": "2508.06879", 18 "doi": "10.48550/arXiv.2508.06879" 19 }, 20 "scan_version": 3, 21 "active_modules": [], 22 "methodology_tags": ["qualitative", "observational"], 23 "key_findings": "A survey of 100 professional developers from five companies finds that 77% expect to spend the same or more time on code review in five years, with broader artifact coverage anticipated. Almost all respondents expect LLMs to become active participants in code review, either as reviewers or code authors. The authors identify three long-term risks: erosion of human understanding, accountability, and trust in collaborative software engineering.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Section 3 states: 'All anonymized data and analysis scripts are publicly available on GitHub: github.com/michaeldorner/quo-vadis-code-review.'" 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section 3 states all anonymized data is publicly available on GitHub alongside the analysis scripts." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No environment specifications, dependency lists, or software version requirements are mentioned in the paper." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "While a GitHub repository is provided, the paper itself contains no step-by-step reproduction instructions, README description, or guidance on how to replicate the analysis." 
45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Results are reported as simple percentages (47%, 30%, 23%) with no confidence intervals or error bars despite the small sample size of 100." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper compares its distribution of review hours against two external datasets (Microsoft 2013, Stack Overflow 2019) and notes the distribution is 'consistently lower,' but performs no statistical test to assess whether the difference is significant." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No effect sizes are reported. Comparisons are made only through visual inspection of cumulative distribution plots and raw percentages." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The target of ~25 developers per company is stated as a quota but not justified. No power analysis or rationale for why 100 total participants is sufficient for the claims made." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Only a median of 'approximately three hours per week' is reported for review time. No standard deviation, interquartile range, or other spread measure is provided." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper contextualizes its findings against two external datasets: Bosu et al. (2017) surveying 416 Microsoft developers in 2013, and the 2019 Stack Overflow Developer Survey with 49,790 respondents (Section 4.1, Figure 1)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": false, 83 "justification": "The reference datasets are from 2013 (Microsoft) and 2019 (Stack Overflow), both 6+ years old. 
The paper acknowledges 'these datasets are older' but does not justify why no more recent reference data was used." 84 }, 85 "ablation_study": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a survey study with no system or components to ablate." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The survey examines multiple dimensions: weekly review hours (Section 4.1), expected future time investment, types of artifacts reviewed today vs. in five years (Section 4.2), and qualitative expectations about AI involvement (Section 4.3)." 94 }, 95 "human_evaluation": { 96 "applies": false, 97 "answer": false, 98 "justification": "No system outputs to evaluate. The study itself is a survey collecting human perspectives, not evaluating a system's outputs." 99 }, 100 "held_out_test_set": { 101 "applies": false, 102 "answer": false, 103 "justification": "Not applicable to a survey study — there is no train/test split." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Figure 2 provides per-artifact breakdowns (production code, test code, configuration files, documentation, GUI-based test code) comparing today vs. five years from now. Table 1 provides per-company participant counts." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper discusses opposing views: fundamental opposition to AI in code review ('As long as AI is involved, it is likely to be a more tedious process'), warnings about degraded code quality ('more generated code of a worsen quality'), and the risk of both author and reviewer being AI ('the AI doesn't catch the mistake because it generated it')." 
114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper reports that 23% of respondents expect to spend less time on code review (contrary to the main trend), and Section 5 is entirely devoted to negative implications (erosion of understanding, accountability, trust)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims are supported: 'similar or greater effort' matches 77% expecting same/more time (Section 4.1), 'broader range of artifacts' matches Figure 2 (Section 4.2), and 'almost all expect LLMs to become active participants' is supported by the thematic analysis (Section 4.3)." 126 }, 127 "causal_claims_justified": { 128 "applies": false, 129 "answer": false, 130 "justification": "The paper makes no causal claims. All empirical findings are descriptive (what developers report and expect). The discussion section uses hedged language: 'could lead to,' 'might happen,' 'we fear,' 'may have long-term implications.'" 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly states: 'we are cautious about extrapolating this insight to the entire software industry' (Section 4.1), 'Though not statistically representative' (Section 2.2), and 'all findings reflect the perspectives of individual developers; the companies serve only as contextual backgrounds' (Section 2.2)." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "Section 4.1 briefly mentions 'Possible explanations may include a decline in the perceived importance of code review, differences in domain-specific practices, or inaccuracies in self-reported estimates' for the lower review hours. 
However, there is no systematic discussion of confounds such as social desirability bias, self-selection bias, or the reliability of 5-year predictions." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper measures self-reported expectations about the future and uses them to discuss what code review will actually look like. The gap between developer predictions and actual future outcomes is not acknowledged — self-reported expectations are an unreliable proxy for actual future practice changes." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": false, 151 "answer": false, 152 "justification": "No AI models are used in this study. It is a survey of human developers." 153 }, 154 "prompts_provided": { 155 "applies": false, 156 "answer": false, 157 "justification": "No prompting of AI models is involved in this study." 158 }, 159 "hyperparameters_reported": { 160 "applies": false, 161 "answer": false, 162 "justification": "No models or experiments with hyperparameters are involved." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used in this study." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 2.5 describes the analysis pipeline: quantitative responses analyzed with descriptive statistics, open-ended responses examined using thematic analysis with inductive single-cycle descriptive coding by the first author, reviewed by the second author, with disagreements resolved collaboratively." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "There is no dedicated limitations or threats-to-validity section. 
Caveats are scattered throughout (e.g., 'not statistically representative' in Section 2.2, 'cautious about extrapolating' in Section 4.1), but there is no substantive consolidated discussion of study limitations." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No threats to validity are systematically discussed. The scattered caveats are relatively generic ('not statistically representative,' 'varying participation per company due to voluntary participation'). Specific threats like social desirability bias, self-selection, or the reliability of 5-year predictions are not addressed." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "While the paper hedges about generalizability, it does not explicitly state what the results do NOT show. The title ('Exploring the Future of Code Review') and discussion sections make broad claims about erosion risks without bounding these to the 5-company sample. No equivalent of 'what the evidence does not show.'" 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3 states: 'All anonymized data and analysis scripts are publicly available on GitHub: github.com/michaeldorner/quo-vadis-code-review.'" 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 2.4 describes data collection: company-specific online questionnaires shared through internal communication channels, conducted between December 2024 and November 2025, with voluntary and anonymous participation." 
202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "Section 2.1 describes quota sampling: five companies were purposively selected, then ~25 developers recruited from each via internal channels, with recruitment closing after eight weeks or once the quota was reached." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": false, 211 "justification": "The paper does not document whether any responses were excluded, filtered, or cleaned. There is no accounting of incomplete responses, nor explanation of how the raw survey data was processed into the reported statistics. The jump from 'we collected responses' to final numbers is undocumented." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "The Acknowledgement section states: 'This work was supported by the KKS Foundation through the SERT Project (Research Profile Grant 2018/010) at Blekinge Institute of Technology.'" 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are listed: Stephan Lukasczyk is at JetBrains Research, Michael Kormann is at SAP, and Ricardo Britto is at Ericsson. These are three of the five surveyed companies." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": true, 228 "justification": "The KKS Foundation (Knowledge Foundation) is a Swedish research funding body with no apparent commercial interest in code review practices or tools." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is provided. Three authors are affiliated with surveyed companies (JetBrains, SAP, Ericsson), which could influence how questions were framed or results interpreted, but this potential conflict is not acknowledged." 
234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "This is a human survey study that does not evaluate any pre-trained model on a benchmark." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "This is a human survey study that does not evaluate any pre-trained model on a benchmark." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "This is a human survey study that does not evaluate any pre-trained model on a benchmark." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": true, 256 "answer": false, 257 "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry) anywhere in the paper." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": true, 261 "answer": false, 262 "justification": "No mention of IRB or ethics board approval despite collecting data from 100 human participants across five companies." 263 }, 264 "demographics_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "The paper deliberately excluded demographic questions: 'We deliberately excluded demographic questions such as age, gender, or company affiliation to avoid any mapping between individual respondents and their employer.' No experience level, years of experience, or other participant characteristics are reported." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": true, 271 "answer": false, 272 "justification": "The only stated criterion is that participants be 'professional software developers' from the five selected companies. No formal inclusion/exclusion criteria, screening process, or minimum experience requirements are documented." 
273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "This is a cross-sectional survey, not an experimental study with treatment/control conditions. Randomization does not apply." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "This is a cross-sectional survey, not an experimental study. Blinding does not apply." 283 }, 284 "attrition_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "No response rate is reported. The paper does not state how many developers were invited vs. how many responded, nor whether any responses were excluded. Only the final participant counts per company are given (Table 1)." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": false, 293 "answer": false, 294 "justification": "This is a survey study with no computational method or inference step whose cost could be reported." 295 }, 296 "compute_budget_stated": { 297 "applies": false, 298 "answer": false, 299 "justification": "This is a survey study with no computational experiments." 300 } 301 } 302 }, 303 "claims": [ 304 { 305 "claim": "A majority of practitioners (77%) expect to spend the same or more time on code review in the coming years.", 306 "evidence": "Section 4.1 reports 47% expect more time, 30% expect the same, and 23% expect less, based on the 100-developer survey.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Practitioners currently spend a median of approximately three hours per week on code review, lower than older reference datasets showing a median of around four hours.", 311 "evidence": "Section 4.1 and Figure 1 compare the distribution against Bosu et al. (2017) Microsoft data and the 2019 Stack Overflow Developer Survey. 
The comparison is visual (cumulative distribution plot) with no formal statistical test.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Practitioners expect to review a broader range of artifacts in the future, with notable growth in GUI-based test code.", 316 "evidence": "Section 4.2 and Figure 2 show increases across all artifact categories (production code, test code, configuration files, documentation, GUI-based test code) and a decrease in 'none' responses.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Almost all practitioners expect LLMs to become active participants in code review.", 321 "evidence": "Section 4.3 reports this finding from thematic analysis of open-ended responses, but does not provide an exact percentage or count of how many respondents mentioned LLMs.", 322 "supported": "weak" 323 }, 324 { 325 "claim": "LLM involvement in code review risks eroding human understanding, accountability, and trust.", 326 "evidence": "Section 5 develops three erosion narratives supported by respondent quotes and references to related work (Tufano et al. 2025, Alami et al. 2025, Taivalsaari et al. 2025). The paper explicitly states these are 'conceptual interpretations informed by practitioner feedback and prior work rather than direct empirical findings.'", 327 "supported": "weak" 328 } 329 ], 330 "red_flags": [ 331 { 332 "flag": "Small, non-representative sample", 333 "detail": "N=100 developers from only 5 purposively selected companies, with uneven participation (9–31 per company). The paper acknowledges it is 'not statistically representative' yet draws broad conclusions about the future of code review generally." 334 }, 335 { 336 "flag": "No response rate reported", 337 "detail": "The paper does not state how many developers were invited at each company, making it impossible to assess self-selection bias. Developers who are more engaged with or opinionated about code review may be overrepresented." 
338 }, 339 { 340 "flag": "Author-company overlap not discussed as conflict", 341 "detail": "Three authors are affiliated with three of the five surveyed companies (JetBrains, SAP, Ericsson). This could influence survey design, distribution, or interpretation, but is not acknowledged as a potential conflict of interest." 342 }, 343 { 344 "flag": "No IRB or ethics approval", 345 "detail": "The study collected data from 100 human participants across five companies with no mention of ethics board approval, despite this being standard for human subjects research." 346 }, 347 { 348 "flag": "Deliberately excluded demographics", 349 "detail": "Demographics were excluded for anonymity, but this means there is no way to assess whether the sample is representative of professional developers or skewed toward particular experience levels, roles, or backgrounds." 350 }, 351 { 352 "flag": "Speculative discussion beyond empirical evidence", 353 "detail": "Section 5's erosion narratives (understanding, accountability, trust) go substantially beyond what the survey data shows. The paper acknowledges these are 'conceptual interpretations' but presents them prominently, including in the abstract, alongside the empirical findings." 354 } 355 ], 356 "cited_papers": [ 357 { 358 "title": "Tales from the Trenches: Expectations and Challenges from Practice for Code Review in the Generative AI Era", 359 "authors": ["N. Davila", "J. Melegati", "I. Wiese"], 360 "year": 2024, 361 "doi": "10.1109/MS.2024.3428439", 362 "relevance": "Gray literature review identifying practitioner interest in generative AI solutions for code review, directly relevant to AI-assisted software engineering practices." 363 }, 364 { 365 "title": "Deep Learning-based Code Reviews: A Paradigm Shift or a Double-Edged Sword?", 366 "authors": ["R. Tufano", "A. Martin-Lopez", "A. Tayeb", "O. Dabic", "S. Haiduc", "G. 
Bavota"], 367 "year": 2025, 368 "doi": "10.1109/ICSE55347.2025.00060", 369 "relevance": "Empirical study on how AI-assisted code reviews affect reviewer behavior, finding reviewers focus on annotated sections and miss issues elsewhere." 370 }, 371 { 372 "title": "Future of Software Development with Generative AI", 373 "authors": ["J. Sauvola", "S. Tarkoma", "M. Klemettinen", "J. Riekki", "D. Doermann"], 374 "year": 2024, 375 "doi": "10.1007/s10515-024-00426-z", 376 "relevance": "Discusses the broader future of software development with generative AI, including accountability concerns when delegating engineering tasks to AI." 377 }, 378 { 379 "title": "Human and Machine: How Software Engineers Perceive and Engage with AI-Assisted Code Reviews Compared to Their Peers", 380 "authors": ["A. Alami", "N. Ernst"], 381 "year": 2025, 382 "relevance": "Interview study showing developers experience AI code review comments as lacking contextual grounding and requiring greater cognitive effort to evaluate." 383 }, 384 { 385 "title": "Accountability in Code Review: The Role of Intrinsic Drivers and the Impact of LLMs", 386 "authors": ["A. Alami", "V. Jensen", "N. Ernst"], 387 "year": 2025, 388 "relevance": "Empirical study showing that LLM-assisted reviews reduce developers' perceived ownership of code quality and shift responsibility toward the tool." 389 }, 390 { 391 "title": "On the Future of Software Reuse in the Era of AI Native Software Engineering", 392 "authors": ["A. Taivalsaari", "T. Mikkonen", "C. Pautasso"], 393 "year": 2025, 394 "relevance": "Argues AI-generated code represents a new form of generative reuse where developers rely on artifacts they cannot fully understand, risking long-term maintainability." 395 }, 396 { 397 "title": "Modern Code Reviews - A Survey of Literature and Practice", 398 "authors": ["D. Badampudi", "M. Unterkalmsteiner", "R. 
Britto"], 399 "year": 2023, 400 "doi": "10.1145/3585004", 401 "relevance": "Comprehensive survey of modern code review literature and practice, providing baseline understanding of code review as currently practiced." 402 } 403 ], 404 "engagement_factors": { 405 "practical_relevance": { 406 "score": 1, 407 "justification": "The paper reports developer expectations about code review's future but provides no actionable tool, framework, or technique that practitioners can apply." 408 }, 409 "surprise_contrarian": { 410 "score": 1, 411 "justification": "The finding that developers expect to spend the same or more time on code review despite AI assistance mildly challenges the 'AI replaces everything' narrative." 412 }, 413 "fear_safety": { 414 "score": 1, 415 "justification": "The erosion narratives (understanding, accountability, trust) raise concerns about AI in software engineering but at a conceptual rather than demonstrated-threat level." 416 }, 417 "drama_conflict": { 418 "score": 1, 419 "justification": "The 'unsupervised software engineering' framing and erosion warnings create mild tension but the paper's tone is measured and academic." 420 }, 421 "demo_ability": { 422 "score": 1, 423 "justification": "Anonymized survey data and analysis scripts are released on GitHub, but there is no interactive demo or tool to try." 424 }, 425 "brand_recognition": { 426 "score": 2, 427 "justification": "The survey includes developers from SAP, Ericsson, and JetBrains — well-known brands in the software and telecommunications industries that may attract attention." 428 } 429 } 430 }