scan.json (20109B)
1 { 2 "paper": { 3 "title": "AI-powered Code Review with LLMs: Early Results", 4 "authors": [ 5 "Zeeshan Rasheed", 6 "Malik Abdul Sami", 7 "Muhammad Waseem", 8 "Kai-Kristian Kemell", 9 "Xiaofeng Wang", 10 "Anh Nguyen", 11 "Kari Systä", 12 "Pekka Abrahamsson" 13 ], 14 "year": 2024, 15 "venue": "CEUR Workshop Proceedings" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No repository URL, GitHub link, or any code artifact is provided anywhere in the paper. The multi-agent system is described but not released." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No dataset or test cases are released. The paper does not specify what code was used for testing the agents, nor is any data archive provided." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No environment specifications, dependency lists, or setup details are provided. The paper mentions GPT-4 and API calls but gives no technical environment details." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No reproduction instructions, README, or step-by-step setup is provided. The methodology section describes agent roles in general terms without actionable reproduction steps." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "No quantitative results are presented at all. The preliminary results section (Section 4) is entirely qualitative, so no confidence intervals or error bars exist." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "No statistical significance tests are reported. The paper makes comparative claims ('substantial improvement', 'better than traditional tools') but provides no statistical evidence." 
50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": false, 54 "justification": "No effect sizes or quantitative measures of improvement are reported. Claims of improvement are stated in purely qualitative terms." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No sample size is stated or justified. The paper refers to 'initial tests' and 'several test cases' without specifying how many code samples were reviewed." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No variance, standard deviation, or any measure of variability is reported. There are no quantitative results at all." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": false, 71 "justification": "No baselines are included. The paper claims the system is better than 'traditional static analysis tools' but provides no comparative evaluation against any specific tool." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": false, 76 "justification": "No baselines are used at all, so contemporaneity cannot be assessed." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": false, 81 "justification": "No ablation study is conducted. The system has four agents but there is no evaluation of their individual contributions or what happens when agents are removed." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": false, 86 "justification": "No metrics are used at all. Results are described only in qualitative prose without any quantitative evaluation metrics." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation is included. The paper describes agent outputs qualitatively but no developer study, expert review, or user evaluation was conducted. The paper explicitly defers this to future work." 
92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": false, 96 "justification": "No test set (held-out or otherwise) is described. The evaluation data is completely unspecified." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": false, 101 "justification": "No per-category or per-task breakdown is provided. Results are described per agent role in qualitative terms, but without any quantitative metrics broken down." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": false, 106 "justification": "No failure cases are discussed. The results section presents only positive outcomes with no error analysis or discussion of where the system fails." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": false, 111 "justification": "No negative results are reported. Every experimental observation described is positive, which is a red flag for a preliminary system with no quantitative evaluation." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": false, 118 "justification": "The abstract claims 'enhancing overall code quality' and the system's ability to 'anticipate potential future risks in the code' — these are not supported by any quantitative evidence. The results section provides only qualitative anecdotes." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": false, 123 "justification": "The paper makes causal claims such as 'showed a substantial improvement in identifying potential issues' (Section 6) without any controlled study or even quantitative measurement to justify causation." 
124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper makes broad claims about the system's ability to identify issues 'across different programming languages and AI application domains' (Section 4) without specifying which languages or domains were tested or bounding the generalization." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "No alternative explanations are discussed. The paper does not consider whether the observed behaviors could be explained by GPT-4's general capabilities rather than the multi-agent architecture, or any other confounds." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper mentions 'GPT-4 model' multiple times but never specifies an exact version, snapshot date, or API version (e.g., gpt-4-0613)." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper describes agents using 'prompt-based instructions' and 'prompt-driven instructions' but never provides the actual prompt text. The prompts are described only in general terms." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or any API configuration settings are mentioned." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": false, 155 "justification": "The paper describes a 'centralized coordination component' for inter-agent messaging but provides no detail on the workflow, retry logic, message format, memory management, or coordination protocol. The description is too vague to reproduce." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": false, 160 "justification": "No data preprocessing is documented. 
The paper does not describe how code inputs were prepared, formatted, or passed to the agents." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no limitations section or threats-to-validity section. The paper has Introduction, Related Work, Research Method, Preliminary Result, Future Work, and Conclusions sections only." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No threats to validity are discussed anywhere in the paper, neither generic nor specific." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "No scope boundaries are stated. The paper does not clarify what the results do not show or what settings were excluded from testing." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No raw data is available. Neither the code inputs nor the agent outputs from testing are provided." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": false, 189 "justification": "No data collection procedure is described. The paper mentions 'initial tests' and 'several test cases' but does not explain what code was tested, from where, or how it was selected." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants are involved in this study. The evaluation is purely automated using the multi-agent system on code." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": false, 199 "justification": "No data pipeline is documented. The path from code input to agent output to the reported qualitative findings is entirely opaque." 
200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Section 7 (Acknowledgment) states: 'We express our sincere gratitude to Business Finland for their generous support and funding of our project.'" 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "All author affiliations are listed in the header: Tampere University, University of Jyväskylä, University of Helsinki, Free University of Bozen-Bolzano, and University of South-Eastern Norway. None are affiliated with OpenAI or the evaluated product." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": true, 216 "justification": "Business Finland is a Finnish government innovation funding agency. It has no financial interest in whether GPT-4-based code review tools perform well." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "The paper does not evaluate GPT-4 on any benchmark. It uses GPT-4 as part of a code review tool, so benchmark contamination is not relevant." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "No benchmark evaluation is performed, so train/test overlap is not applicable." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "No benchmark is used; the paper presents qualitative observations from ad hoc testing." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants are involved in this study."
246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants are involved in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants are involved in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants are involved in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants are involved in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants are involved in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants are involved in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost, API cost, tokens consumed, or latency is reported despite the system making multiple GPT-4 API calls per code review." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No computational budget or total API spend is stated." 
288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "The multi-agent system demonstrated a strong capability in identifying a range of issues from minor bugs to significant code smells and inefficiencies across different programming languages and AI application domains.", 294 "evidence": "Section 4 presents qualitative descriptions of agent performance but provides no quantitative metrics, specific test cases, or systematic evaluation data.", 295 "supported": "unsupported" 296 }, 297 { 298 "claim": "The Bug Detection Agent detected issues that traditional static analysis tools either missed or reported with very limited explanation.", 299 "evidence": "Section 4.1 states this but provides no specific comparison against any named static analysis tool, no test cases, and no quantitative data.", 300 "supported": "unsupported" 301 }, 302 { 303 "claim": "The system showed a substantial improvement in identifying potential issues in code and providing actionable recommendations for optimization.", 304 "evidence": "Section 6 (Conclusions) makes this claim. No baseline comparison, no metrics, and no quantitative evidence is provided anywhere in the paper.", 305 "supported": "unsupported" 306 }, 307 { 308 "claim": "The coordinated interaction between agents produced consistent and meaningful outcomes, illustrating the potential of multi-agent LLM architectures to support automated code review.", 309 "evidence": "Section 4.1 describes this qualitatively but provides no evidence of consistency (e.g., agreement across runs, inter-rater reliability) or meaningfulness (e.g., developer ratings, accuracy metrics).", 310 "supported": "weak" 311 } 312 ], 313 "methodology_tags": [ 314 "case-study" 315 ], 316 "key_findings": "The paper proposes a multi-agent system with four GPT-4-based agents (Code Review, Bug Report, Code Smell, Code Optimization) for automated code review. 
Only qualitative preliminary results are presented, claiming the system can identify bugs, code smells, and suggest optimizations. No quantitative evaluation, baselines, metrics, or systematic testing is reported. The paper is essentially a system description with plans for future empirical validation.", 317 "red_flags": [ 318 { 319 "flag": "No quantitative evaluation", 320 "detail": "The paper presents only qualitative prose descriptions of agent performance with no metrics, no numbers, no tables, and no figures showing results. Every claim of effectiveness is unsupported by data." 321 }, 322 { 323 "flag": "No baselines or comparisons", 324 "detail": "The paper claims superiority over 'traditional static analysis tools' but does not compare against any specific tool, let alone provide controlled comparisons." 325 }, 326 { 327 "flag": "Entirely positive results", 328 "detail": "Every observation in Section 4 is positive. No failure cases, limitations, or negative results are mentioned. This is implausible for a preliminary system and suggests selective reporting." 329 }, 330 { 331 "flag": "Unspecified test data", 332 "detail": "The paper never states what code was used for testing, how many examples were evaluated, what languages were used, or how test cases were selected. The evaluation is completely opaque." 333 }, 334 { 335 "flag": "Claims far exceed evidence", 336 "detail": "The abstract and conclusions make broad claims about 'enhancing overall code quality' and 'anticipating potential future risks' that are not supported by any evidence in the paper." 337 }, 338 { 339 "flag": "Venue template artifact", 340 "detail": "The paper contains the text 'Woodstock 22: Symposium on the irreproducible science, June 07-11, 2022, Woodstock, NY' — this is a leftover from the CEUR-ART template and was never updated, raising questions about the paper's editorial care." 
341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "Large language models for software engineering: Survey and open problems", 346 "authors": ["A. Fan", "B. Gokkaya", "M. Harman", "M. Lyubarskiy", "S. Sengupta", "S. Yoo", "J. M. Zhang"], 347 "year": 2023, 348 "arxiv_id": "2310.03533", 349 "relevance": "Comprehensive survey of LLMs for software engineering, directly relevant to the survey scope." 350 }, 351 { 352 "title": "Evaluating large language models trained on code", 353 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 354 "year": 2021, 355 "arxiv_id": "2107.03374", 356 "relevance": "Introduces Codex/HumanEval, foundational benchmark evaluation for LLM code generation." 357 }, 358 { 359 "title": "CodePori: Large Scale Model for Autonomous Software Development by Using Multi-Agents", 360 "authors": ["Z. Rasheed", "M. Waseem", "M. Saari", "K. Systä", "P. Abrahamsson"], 361 "year": 2024, 362 "arxiv_id": "2402.01411", 363 "relevance": "Multi-agent LLM system for autonomous software development, directly relevant to agentic AI workflows." 364 }, 365 { 366 "title": "A systematic evaluation of large language models of code", 367 "authors": ["F. F. Xu", "U. Alon", "G. Neubig", "V. J. Hellendoorn"], 368 "year": 2022, 369 "relevance": "Systematic benchmark evaluation of code LLMs, relevant to methodology quality in LLM evaluations." 370 }, 371 { 372 "title": "CodeReviewer: Pre-Training for Automating Code Review Activities", 373 "authors": ["Z. Li", "S. Lu", "D. Guo"], 374 "year": 2022, 375 "arxiv_id": "2203.09095", 376 "relevance": "Pre-trained model for code review automation, directly relevant baseline for AI-assisted code review." 377 }, 378 { 379 "title": "Llama-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning", 380 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. 
Zuo"], 381 "year": 2023, 382 "relevance": "LLM-based code review automation using fine-tuning, relevant to AI-assisted code review evaluation." 383 }, 384 { 385 "title": "Using pre-trained models to boost code review automation", 386 "authors": ["R. Tufano", "S. Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"], 387 "year": 2022, 388 "relevance": "Pre-trained models applied to code review, relevant baseline for automated code review approaches." 389 }, 390 { 391 "title": "Modern code review: a case study at Google", 392 "authors": ["C. Sadowski", "E. Söderberg", "L. Church", "M. Sipko", "A. Bacchelli"], 393 "year": 2018, 394 "relevance": "Case study of code review practices at scale, provides context for AI-assisted code review research." 395 } 396 ] 397 }