scan.json (24216B)
1 { 2 "paper": { 3 "title": "CodeCoR: An LLM-Based Self-Reflective Multi-Agent Framework for Code Generation", 4 "authors": ["Ruwei Pan", "Hongyu Zhang", "Chao Liu"], 5 "year": 2025, 6 "venue": "ACM (preprint)", 7 "arxiv_id": "2501.07811", 8 "doi": "10.1145/nnnnnnn.nnnnnnn" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "Section 7 (Data Availability) provides a link to source code, experimental data, and prompts: https://anonymous.4open.science/r/CodeCoR-3EFC. Note this is an anonymized link, which may be temporary for review." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses publicly available benchmarks (HumanEval, HumanEval-ET, MBPP, MBPP-ET) and states experimental data is available at their anonymous repository (Section 7)." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No requirements.txt, Dockerfile, or detailed environment setup with library versions is mentioned in the paper. The paper mentions using Python and psutil but does not provide sufficient detail to recreate the environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper points to an anonymous repository but does not include step-by-step reproduction instructions in the paper itself. No README with commands or a 'Reproducing Results' section is described." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results in Tables 2-6 are reported as point estimates (e.g., '86.6%') with no confidence intervals or error bars, despite the paper stating they ran 10 rounds of experiments (Section 5.4)." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims CodeCoR 'significantly outperforms' baselines multiple times but provides no statistical significance tests (no p-values, t-tests, or other tests). Differences are compared by raw numbers only." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports percentage improvements with baselines: e.g., CodeCoR achieves 86.6% vs MapCoder's 80.5% on HumanEval, and provides average Pass@1 of 77.8% vs MapCoder's 72.8% (Section 4.5). This provides sufficient context to judge magnitude." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for why HumanEval (164 problems), MBPP, or the first 10 problems for cost analysis were chosen. No power analysis is discussed." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper states they 'conducted 10 rounds of experiments for each trial and averaged the results' (Section 5.4), but no standard deviations, variance, or spread measures are reported anywhere in the tables." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "Table 1 lists 22 baseline models including code LLMs (InCoder, CodeGeeX, StarCoder, etc.) and multi-agent frameworks (MapCoder, CodeCoT, MetaGPT, ChatDev). Comprehensive comparisons in Tables 2 and 6." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Baselines include MapCoder (2024), CodeCoT (2023), MetaGPT (2023), ChatDev (2024), and GPT-4 — all recent and competitive at time of writing." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 4 (RQ2) presents ablation results removing each of the four major components: w/o Prompt Agent, w/o Test Agent, w/o Repair Agent, and w/o Pruning Method, demonstrating each component's contribution." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper uses three metrics: Pass@1, Edit Distance, and BLEU score (Section 4.4, Tables 2-3)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "No human evaluation of the generated code is included. All evaluation is automated via test case execution and automated metrics (Pass@1, Edit Distance, BLEU). Human evaluation could assess code readability and maintainability." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "The benchmarks HumanEval and MBPP have standard test suites that are separate from the task descriptions. The paper evaluates against these standard test cases." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down per dataset (HumanEval, HumanEval-ET, MBPP, MBPP-ET) in Tables 2 and 4, allowing readers to see variation across different benchmark types." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Figure 6 (Section 5.2) shows a specific failure case where code without the Repair Agent contains a semantic error (unnecessary conditional check), and discusses how the Repair Agent fixes it." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The ablation study (Table 4) shows that removing components degrades performance, which constitutes negative results. Section 5.3 shows that performance varies across repair rounds, with best at 3 rounds and worse at others." 
105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims an average Pass@1 of 77.8% and that CodeCoR 'significantly outperforms existing baselines (e.g., CodeCoT and MapCoder).' Table 2 confirms these numbers: the four CodeCoR scores average to (86.6+80.5+79.2+65.2)/4 = 77.875%, consistent with the reported 77.8% up to rounding of the per-dataset scores." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims through ablation studies (Table 4, Section 4.6): removing each component reduces performance, demonstrating each component's contribution via controlled single-variable manipulation." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title claims a general 'Framework for Code Generation' but all experiments are on Python-only benchmarks (HumanEval, MBPP). The paper does not bound its claims to Python or to the specific function-level code generation tasks tested. The broad title and claims about 'code generation' generally are not bounded to the tested setting." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "Section 5.4 (Threats to Validity) discusses generic methodological concerns (execution environment variability, experimenter bias, metric suitability) but does not discuss specific alternative explanations for why CodeCoR outperforms baselines — e.g., whether the improvement comes from simply making more API calls or generating more candidates." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper uses 'GPT-3.5-turbo', 'GPT-4', and 'CodeLlama (34B)' without specifying exact model versions or snapshot dates (e.g., no 'gpt-3.5-turbo-0613' or API version). Marketing names without version identifiers do not count."
134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Figure 3 shows example prompts and outputs for all four agents, and Figure 5 shows the pruning prompts with the actual workflow. Section 7 states that 'concrete examples of prompts and generated code' are available in the repository." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported anywhere in the paper, despite using LLM APIs for all four agents." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The multi-agent scaffolding is described in detail in Section 3 (Methodology), including the four agents, their interactions, the five-phase workflow (Figure 2), pruning methods (Section 3.3, Figures 4-5), and the overall algorithm (Algorithm 1)." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4.2 describes the four datasets used and their characteristics. The paper uses standard benchmarks without modification, and Section 4.3 notes that 'results of the baselines are obtained from previous studies under the same experimental setting.'" 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 5.4 'Threats to Validity' is a dedicated subsection discussing four categories of validity threats with multiple bullet points." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "The threats in Section 5.4 are largely generic: 'minor fluctuations in execution environments could still introduce variability,' 'experimenter bias and errors remain potential threats,' 'the suitability of these metrics could be questioned.' 
These are boilerplate concerns, not specific threats to this particular study." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what results do NOT show. The final bullet in Section 5.4 acknowledges 'specific datasets and settings used might limit the broader applicability' but does not specify what was not tested (e.g., non-Python languages, complex multi-file projects, real-world software engineering tasks)." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 7 states that 'source code, experimental data, and concrete examples of prompts and generated code' are available at the anonymous repository. The underlying benchmarks (HumanEval, MBPP) are also publicly available." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4.2 describes the four datasets. Section 4.3 explains that baseline results were obtained from previous studies. Section 4.7 (RQ3) describes using psutil on the first 10 HumanEval problems on a dedicated server." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants. The paper uses standard code generation benchmarks." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "Algorithm 1 documents the full pipeline from task description input through all five phases to final code output. The flow of data between agents is clearly documented in Section 3 and Figures 2, 4, 5." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants or sponsors." 
200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All three authors are listed as affiliated with Chongqing University, China. Their email addresses ({panruwei,hyzhang,liuchao}@cqu.edu.cn) are provided." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure is a concern — the paper does not state whether it is unfunded or simply omits this information." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses GPT-3.5-turbo, GPT-4, and CodeLlama on HumanEval and MBPP benchmarks but does not state the training data cutoff dates for any of these models." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "HumanEval (published 2021) and MBPP (published 2021) are well-known public benchmarks. GPT-3.5-turbo and GPT-4 were trained after these benchmarks were published, creating contamination risk. The paper does not discuss this at all." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "HumanEval and MBPP have been publicly available since 2021, well before the training cutoffs of GPT-3.5-turbo, GPT-4, and CodeLlama. The paper does not address the risk that these models may have seen the benchmark problems during training." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants in this study. It is a benchmark evaluation of code generation systems." 
239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Table 5 (RQ3) reports runtime (123.69s), CPU usage (0.8%), memory (0.01 GB), disk I/O, and network I/O for CodeCoR vs baselines on the first 10 HumanEval problems. However, no API cost in dollars or tokens consumed is reported." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "While Table 5 reports per-problem cost metrics for 10 problems, the total computational budget for all experiments (total API calls, total tokens, total dollar cost) is not stated. The cost analysis covers only 10 problems, not the full experimental suite." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "CodeCoR achieves an average Pass@1 score of 77.8% across four datasets (HumanEval, HumanEval-ET, MBPP, MBPP-ET) with GPT-3.5-turbo, significantly outperforming MapCoder (72.8%) and CodeCoT.", 287 "evidence": "Table 2 shows CodeCoR scores of 86.6%, 80.5%, 79.2%, 65.2% on the four datasets respectively. MapCoder scores 80.5%, 77.4%, 78.9%, 54.4%. 
Average for CodeCoR is 77.8% vs MapCoder's 72.8%.", 288 "supported": "moderate" 289 }, 290 { 291 "claim": "CodeCoR achieves Pass@1 of 94.5% on HumanEval and 83.5% on HumanEval-ET when using GPT-4.", 292 "evidence": "Table 6 reports these numbers for GPT-4. CodeCoR outperforms MapCoder (93.9%, 82.9%) and all other methods listed.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Each major component (Prompt Agent, Test Agent, Repair Agent, Pruning Method) is necessary for CodeCoR's performance.", 297 "evidence": "Table 4 (RQ2) shows ablation results: removing Test Agent causes the largest drop (45.1% on HumanEval vs 86.6% full), removing Prompt Agent drops to 77.4%, removing Repair Agent to 75.6%, and removing Pruning to 77.4%.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "CodeCoR incurs lower runtime cost than other representative code generation frameworks.", 302 "evidence": "Table 5 shows CodeCoR at 123.69s vs MapCoder 166.45s, SCoT 251.79s, Self-Planning 242.92s, though CodeChain is faster at 121.80s. Measured on only the first 10 HumanEval problems.", 303 "supported": "weak" 304 }, 305 { 306 "claim": "The code generated by CodeCoR shows higher textual similarity (BLEU) and shorter edit distance to ground-truth code compared to baselines.", 307 "evidence": "Table 3 shows CodeCoR achieves mean BLEU of 0.314 vs MapCoder's 0.295 and mean edit distance of 272.70 vs MapCoder's 282.49.", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "CodeCoR is a self-reflective multi-agent framework for code generation using four LLM-based agents (Prompt, Coding, Test, Repair) with pruning methods to filter low-quality outputs at each stage. On four Python code generation benchmarks using GPT-3.5-turbo, CodeCoR achieves an average Pass@1 of 77.8%, outperforming MapCoder (72.8%) and CodeCoT. Ablation studies confirm each component's necessity, with the Test Agent being most critical. 
The framework also generalizes to GPT-4 (94.5% on HumanEval) and CodeLlama, though all evaluations are limited to Python function-level code generation.", 313 "red_flags": [ 314 { 315 "flag": "No statistical significance tests despite 'significantly outperforms' claims", 316 "detail": "The paper repeatedly claims CodeCoR 'significantly outperforms' baselines but provides no statistical tests (p-values, confidence intervals). The difference on MBPP between CodeCoR (79.2%) and MapCoder (78.9%) is only 0.3 percentage points, well within likely variance. Despite running 10 rounds, no standard deviations are reported." 317 }, 318 { 319 "flag": "Benchmark contamination not addressed", 320 "detail": "HumanEval and MBPP were published in 2021. GPT-3.5-turbo and GPT-4 were trained after this date and may have seen these benchmarks in training data. The paper does not acknowledge or address this contamination risk, which could inflate all results." 321 }, 322 { 323 "flag": "Cost analysis on only 10 problems", 324 "detail": "The cost comparison in Table 5 uses only the first 10 HumanEval problems, which is a tiny sample. No API cost in dollars or tokens is reported, making it impossible to assess practical cost at scale." 325 }, 326 { 327 "flag": "Baseline results taken from other papers", 328 "detail": "Section 4.3 states 'results of the baselines are obtained from previous studies under the same experimental setting.' This makes it impossible to verify identical conditions (same API endpoints, same model versions, same time period) across all comparisons." 329 }, 330 { 331 "flag": "No model version specification", 332 "detail": "GPT-3.5-turbo and GPT-4 are referenced without specific version identifiers. Model behavior changes across versions, making results non-reproducible. Baseline results from other papers may use different model snapshots." 
333 }, 334 { 335 "flag": "Generalization claims exceed evidence", 336 "detail": "The paper presents itself as a general 'Code Generation' framework but all evaluations are on Python-only, function-level benchmarks. No testing on other programming languages, multi-file projects, or real-world software engineering tasks." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving", 342 "authors": ["M. A. Islam", "M. E. Ali", "M. R. Parvez"], 343 "year": 2024, 344 "arxiv_id": "2405.11403", 345 "relevance": "State-of-the-art multi-agent code generation framework that CodeCoR directly compares against and claims to outperform." 346 }, 347 { 348 "title": "CodeCoT and Beyond: Learning to Program and Test like a Developer", 349 "authors": ["D. Huang", "Q. Bu", "H. Cui"], 350 "year": 2023, 351 "arxiv_id": "2308.08784", 352 "relevance": "Multi-agent code generation framework using chain-of-thought; one of CodeCoR's primary baselines." 353 }, 354 { 355 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 356 "authors": ["S. Hong", "X. Zheng", "J. Chen"], 357 "year": 2023, 358 "arxiv_id": "2308.00352", 359 "relevance": "Foundational multi-agent framework for software development with SOP-based coordination." 360 }, 361 { 362 "title": "ChatDev: Communicative Agents for Software Development", 363 "authors": ["C. Qian", "W. Liu", "H. Liu"], 364 "year": 2024, 365 "arxiv_id": "2307.07924", 366 "relevance": "Multi-agent software development framework using communicative agents with role specialization." 367 }, 368 { 369 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 370 "authors": ["N. Shinn", "F. Cassano", "A. Gopinath"], 371 "year": 2024, 372 "relevance": "Self-reflective agent framework using verbal feedback; baseline for code generation comparison." 373 }, 374 { 375 "title": "Self-Collaboration Code Generation via ChatGPT", 376 "authors": ["Y. Dong", "X. 
Jiang", "Z. Jin", "G. Li"], 377 "year": 2023, 378 "arxiv_id": "2304.07590", 379 "relevance": "Early multi-agent code generation framework with analyst-coder-tester role assignment." 380 }, 381 { 382 "title": "Evaluating Large Language Models Trained on Code", 383 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 384 "year": 2021, 385 "arxiv_id": "2107.03374", 386 "relevance": "Introduces HumanEval benchmark, the primary evaluation dataset used in this paper." 387 }, 388 { 389 "title": "Teaching Large Language Models to Self-Debug", 390 "authors": ["X. Chen", "M. Lin", "N. Schärli", "D. Zhou"], 391 "year": 2023, 392 "arxiv_id": "2304.05128", 393 "relevance": "Self-debugging approach for LLM code generation; baseline in evaluation." 394 }, 395 { 396 "title": "CodeChain: Towards Modular Code Generation through Chain of Self-Revisions with Representative Sub-Modules", 397 "authors": ["H. Le", "H. Chen", "A. Saha"], 398 "year": 2023, 399 "arxiv_id": "2310.08992", 400 "relevance": "Iterative self-revision approach to code generation; used as baseline for both accuracy and cost comparison." 401 }, 402 { 403 "title": "Code Llama: Open Foundation Models for Code", 404 "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"], 405 "year": 2024, 406 "arxiv_id": "2308.12950", 407 "relevance": "Open-source code LLM used to evaluate CodeCoR's generalizability beyond proprietary models." 408 }, 409 { 410 "title": "Is Self-Repair a Silver Bullet for Code Generation?", 411 "authors": ["T. X. Olausson", "J. P. Inala", "C. Wang"], 412 "year": 2023, 413 "relevance": "Examines limitations of self-repair in code generation, directly relevant to CodeCoR's repair agent design." 414 }, 415 { 416 "title": "INTERVENOR: Prompt the Coding Ability of Large Language Models with the Interactive Chain of Repairing", 417 "authors": ["H. Wang", "Z. Liu", "S. 
Wang"], 418 "year": 2023, 419 "arxiv_id": "2311.09868", 420 "relevance": "Interactive repair framework for LLM code generation; baseline in evaluation." 421 } 422 ] 423 }