scan.json (29888B)
1 { 2 "paper": { 3 "title": "Enhancing LLM Code Generation: A Systematic Evaluation of Multi-Agent Collaboration and Runtime Debugging for Improved Accuracy, Reliability, and Latency", 4 "authors": [ 5 "Nazmus Ashrafi", 6 "Salah Bouktif", 7 "Mohammed Mediani" 8 ], 9 "year": 2025, 10 "venue": "arXiv", 11 "arxiv_id": "2505.02133", 12 "doi": "10.48550/arXiv.2505.02133" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "Debugging-based approaches generally outperform multi-agent agentic workflows for LLM code generation across 19 models on HumanEval/HumanEval+. Combining a simple two-agent (Analyst-Coder) workflow with debugging yields a modest 0.68% mean accuracy improvement over debugging alone, but adding a more complex three-agent system (Analyst-Coder-Tester) reduces accuracy for most models. The benefit of combining approaches is greatest when the performance gap between debugging and agentic methods is small for a given model. Reduced agentic complexity generally produces more rigorous code as measured by HumanEval+ score retention.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "A GitHub repository is provided (https://github.com/nazmus-ashrafi/multiagent_vs_debugger) with agent prompts. Referenced in Section 3 and Section 4.3.1." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses HumanEval and HumanEval+, both publicly available benchmark datasets. No proprietary data was collected." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper does not describe library versions or dependencies needed to reproduce the experiments." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper or referenced. The GitHub repo contains prompts but no described workflow for replication." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Table 2 are point estimates (pass@1 scores) with no confidence intervals or error bars. The t-tests report p-values but no CIs on the accuracy differences." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": true, 50 "justification": "One-tailed paired t-tests are used (Section 4.2) to compare ACT+Debug vs ACT alone and ACT+Debug vs Debug alone, with explicit hypothesis formulation and p-value reporting." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Mean accuracy differences are reported with baseline context: e.g., ACT+Debug mean 64.82% vs ACT alone 57.16% vs Debug alone 63.86% (Section 4.2). Per-model improvements are given in Table 2 with specific percentage point changes." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification for why 19 LLMs were selected as the sample size. No power analysis is discussed. The choice appears driven by availability rather than statistical planning." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper explicitly uses n=1 (one sample per problem) per Section 4.1.1: 'we chose to generate only one sample per problem (n=1).' No variance across runs is reported." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Six approaches are compared: Basic, AC, ACT, Debugger, AC+Debugger, and ACT+Debugger (Section 4.1.3, Figure 8). 
These include individual components and combinations." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include the LDB debugger (2024) and self-collaboration framework (2023), both recent at time of writing. Models tested include GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3, all contemporary." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "RQ2 (Section 4.3) systematically removes/adds components: Basic → AC → ACT → Debugger → AC+Debug → ACT+Debug, isolating the contribution of each component. Figure 8 shows the composition of each segment." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper evaluates pass@1 accuracy, code rigorousness (HumanEval vs HumanEval+ accuracy drop, RQ3), and generation latency (RQ4, Table 3)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Evaluation is entirely automated via pass/fail on test suites (HumanEval and HumanEval+). No human evaluation of code quality, readability, or correctness is performed." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper explicitly separates 'visible test cases' (used during the framework for debugging) from 'hidden test cases, reserved for evaluating the final output' (Section 3)." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 provides per-model breakdowns for all 19 LLMs across all 6 approaches on both datasets. Figures 4, 5 give detailed per-provider analysis." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 4.2.3 discusses cases where ACT hurts performance (e.g., QwQ-Preview where AC/ACT performed significantly worse than Basic). Section 4.3.2 discusses specific failures like agentic complexity reducing accuracy for Llama, DeepSeek, etc." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Key negative findings reported: ACT+Debug does not significantly outperform Debug alone (Section 4.2.2); adding a Tester agent often reduces accuracy (Section 4.3.2); ACT+Debug shows highest robustness drop (Section 4.4.1, Figure 11)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims about 85% confidence level for combined vs ACT alone, and inability to show significance over Debug alone, are both supported by the t-test results in Section 4.2. Claims about 0.68% improvement match Table 2 analysis." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims like 'combining ACT and debugging improves accuracy' are supported by a controlled ablation design where components are systematically added/removed while other factors are held constant (same models, same datasets, same prompts). This single-variable manipulation is adequate for the causal claims made." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Improved Accuracy, Reliability, and Latency' broadly. The conclusion speaks of 'organizations seeking robust AI-driven coding solutions' and 'real-world programming scenarios.' However, results are only on HumanEval/HumanEval+ (Python, function-level tasks). No bounding to this narrow scope." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper offers one interpretation for each finding (e.g., debugging provides 'rich context') but does not consider confounds such as the additional API calls providing more tokens/attempts regardless of the specific approach, or whether prompt design rather than multi-agent structure drives the differences." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures pass@1 and frames it as 'functional accuracy,' and measures HE/HE+ drop as 'code rigorousness.' These claims match the granularity of the measurements without overclaiming broader code quality constructs." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "Table 1 lists model names like 'GPT-4o', 'Claude 3.5 Sonnet', 'DeepSeek-V3' without specific version IDs or snapshot dates. The paper only states 'All APIs were accessed in the month of December 2024' which is insufficient per the schema requirement for exact versions." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Agent prompts are available in the GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger), referenced in Sections 3 and 4.3.1: 'All agent prompts, including those used in the debugging process, are can be found in our GitHub repository.'" 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No temperature, top-p, max tokens, or other LLM sampling parameters are reported. Only iteration limits are stated (retriesCT=3, retriesD=4 or 10). The unreported sampling parameters significantly affect output quality." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The multi-agent scaffolding is described in detail in Section 3 with Figure 1 showing the architecture. Agent roles (Analyst, Coder, Tester), interaction flow, retry logic (retriesCT, retriesD), and the CFG-based debugging mechanism are all documented." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 3 describes how HumanEval is segmented into three components: task description, visible test cases (for execution within framework), and hidden test cases (for final evaluation)." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations or threats-to-validity section. Some caveats are mentioned inline (e.g., 'A limitation of this approach is its reliance on a limited set of visible test cases' in Section 3; 'which may not be ideal' regarding same prompts for all models), but these are scattered and not substantive." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "A few specific caveats exist (same prompts for all models, visible test case limitation, n=1 design) but these are brief inline mentions, not substantive discussion. No systematic analysis of what could invalidate the findings." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show. No mention of limitations to Python, function-level tasks, HumanEval-specific characteristics, or the narrow scope of the tested approaches. The conclusion speaks broadly about 'real-world programming scenarios.'" 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw experimental data (generated code, execution logs, per-problem pass/fail results) is released. Only aggregated pass@1 scores are shown in Table 2." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Data sources are well described: HumanEval (164 tasks, Section 4.1.2) and HumanEval+ (80x more tests). 
API access timing documented (December 2024, Table 1)." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. Data sources are standard public benchmarks (HumanEval, HumanEval+)." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The agent pipeline is described but the data pipeline from raw outputs to final metrics is not documented. No information on how many problems each model solved at each stage, how errors were handled, or what intermediate outputs looked like." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated: all three authors are from the Department of Computer Science and Software Engineering, United Arab Emirates University. They are not affiliated with any LLM provider being evaluated." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding source is disclosed, making it impossible to assess funder independence. University affiliation suggests academic funding, but without explicit disclosure this cannot be confirmed." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper states APIs were accessed in December 2024 but does not state training data cutoff dates for any of the 19 models tested." 
235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether HumanEval/HumanEval+ problems appear in the training data of the 19 models, despite HumanEval being published in 2021 and widely known to be contaminated in newer models." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "HumanEval was published in July 2021. All 19 models tested were trained well after this date and likely saw HumanEval solutions during training. This contamination risk is not addressed anywhere in the paper." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. All evaluation is automated benchmark-based." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Latency is reported in detail: Table 3 summarizes average time per approach (7.68 to 68.42 minutes), and Figure 13 shows per-model latency across approaches." 
289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No total API costs, GPU hours, or hardware specifications are provided. Running 19 models × 6 approaches × 2 datasets represents substantial compute, but the total budget is not quantified." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Only n=1 (one sample per problem) is generated. No multiple seeds or runs are conducted. Section 4.1.1: 'we chose to generate only one sample per problem (n=1).'" 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Explicitly stated in Section 4.1.1: 'we chose to generate only one sample per problem (n=1) in our experiments.' This is clear, even though it's a single run." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is described. The iteration limits (retriesCT=3, retriesD=4/10) appear to be set by design, but no search over LLM hyperparameters (temperature, etc.) is reported." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "All six configurations are reported transparently in Table 2 with full per-model results. The paper does not selectively show only the best configuration — all approaches and all models are presented." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Multiple t-tests are conducted (ACT+Debug vs ACT, ACT+Debug vs Debug) without correction for multiple comparisons. The already-lenient α=0.15 exacerbates this issue." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "No discussion of author-evaluation bias. 
The authors implement their own version of the ACT and LDB frameworks and compare them, but don't acknowledge that their implementations of baselines may systematically differ from the original authors' implementations." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": true, 330 "justification": "RQ4 (Section 4.5, Table 3) explicitly compares latency vs accuracy across approaches: 'The AC + Debugger configuration attains the highest average accuracy (61.7%) across both datasets while maintaining a reasonable execution time of 38.42 minutes.'" 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "No discussion of whether HumanEval/HumanEval+ actually measures the code generation quality the paper claims to evaluate. These are function-level Python tasks, but the paper claims implications for 'real-world programming scenarios' without questioning the benchmark's validity for that purpose." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "The same scaffolding (ACT framework, LDB-based debugger) is applied consistently across all 19 models. Model comparisons use identical prompts and identical agent configurations, controlling for the scaffold variable." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "HumanEval was published in 2021 and all tested models were trained after 2021. Solutions and discussions of HumanEval problems are widely available online. This temporal leakage is not addressed." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The visible test cases used during the debugging phase provide information about expected behavior. No discussion of whether this constitutes feature leakage relative to real-world usage conditions." 
353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether HumanEval problems or their variants appeared in model training data. Given the widespread use of HumanEval, near-duplicate problems likely exist in training corpora." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are used." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "The combined ACT+Debug approach significantly outperforms ACT alone at α=0.15 significance level", 369 "evidence": "One-tailed paired t-test across 19 LLMs on HumanEval: mean ACT+Debug 64.82% vs ACT 57.16% (Section 4.2.1, Figure 2). H0,1 rejected.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The combined ACT+Debug approach does not significantly outperform Debug alone", 374 "evidence": "One-tailed paired t-test: ACT+Debug 64.82% vs Debug 63.86%, improvement of 0.96%. H0,2 not rejected at α=0.15 (Section 4.2.2, Figure 3).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "A simple AC (Analyst-Coder) workflow combined with debugging achieves 0.68% mean accuracy improvement over debugging alone", 379 "evidence": "Table 2 results across 19 LLMs and two datasets. AC+Debug mean accuracy slightly exceeds Debug mean (Section 4.2.3).", 380 "supported": "weak" 381 }, 382 { 383 "claim": "Debugging-based approaches generally outperform agentic workflows", 384 "evidence": "Mean accuracy: Debug 63.86% vs ACT 57.16% on HumanEval across 19 LLMs. 
Debug improves over ACT by 6.7% on HumanEval and 7.36% on HumanEval+ (Section 4.3.2).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "More complex agentic configurations reduce accuracy when combined with debugging for most models", 389 "evidence": "ACT+Debug performs 1.22% worse than AC+Debug on HumanEval+ (Section 4.3.2). Table 2 shows multiple models where ACT+Debug underperforms AC+Debug.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Reduced agentic interaction generally leads to more rigorous code generation", 394 "evidence": "Basic approach shows smallest accuracy drop (90.83) between HumanEval and HumanEval+; ACT+Debug shows highest drop (137.74). AC+Debug achieves better HumanEval+ results than ACT+Debug (Section 4.4.1, Figure 11).", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "OpenAI models generally exhibit stable performance improvements when using combined approaches", 399 "evidence": "Figure 4 shows all three OpenAI models (GPT-3.5-turbo, GPT-4o-mini, GPT-4o) improve with combined approaches. But this is only 3 models (Section 4.2.3).", 400 "supported": "weak" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "Very lenient significance threshold", 406 "detail": "The study uses α=0.15 instead of the conventional 0.05. The justification (detecting small improvements) is really an argument for larger sample sizes, not for a more lenient threshold. Even at α=0.15, the combined approach fails to significantly outperform debugging alone." 407 }, 408 { 409 "flag": "Single-run evaluation (n=1)", 410 "detail": "Only one sample is generated per problem with no repeated runs. LLM outputs are stochastic; a single run provides no information about variance. The reported differences (especially 0.68%) could easily fall within run-to-run variation." 411 }, 412 { 413 "flag": "No contamination analysis", 414 "detail": "HumanEval was published in 2021 and is widely known to be contaminated in post-2021 models. 
All 19 tested models were trained after this date. The paper draws conclusions about model capabilities without addressing whether models have memorized solutions." 415 }, 416 { 417 "flag": "Marginal improvements without adequate statistical support", 418 "detail": "The headline finding (0.68% improvement from AC+Debug over Debug) is extremely small and not statistically significant even at α=0.15. Presenting this as a meaningful practical improvement is not well-supported." 419 }, 420 { 421 "flag": "No limitations section", 422 "detail": "The paper lacks a dedicated limitations section despite having significant methodological constraints (n=1, two benchmarks only, Python only, no hyperparameter reporting, HumanEval contamination)." 423 }, 424 { 425 "flag": "Cherry-picked model narratives", 426 "detail": "Section 4.2.3 selectively highlights OpenAI and Anthropic models as 'leading AI research companies' deserving deeper analysis. The narrative around OpenAI models 'generally performing well' is based on only 3 models and appears to be cherry-picked from the mixed overall results." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Evaluating Large Language Models Trained on Code", 432 "authors": ["M. Chen"], 433 "year": 2021, 434 "arxiv_id": "2107.03374", 435 "relevance": "Introduced HumanEval benchmark and pass@k metric, foundational to LLM code generation evaluation." 436 }, 437 { 438 "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation", 439 "authors": ["D. Huang", "J. M. Zhang", "M. Luck", "Q. Bu", "Y. Qing", "H. Cui"], 440 "year": 2023, 441 "arxiv_id": "2312.13010", 442 "relevance": "Multi-agent framework for code generation with test designer, test executor, and coder agents." 443 }, 444 { 445 "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving", 446 "authors": ["Md. A. Islam", "M. E. Ali", "M. R. 
Parvez"], 447 "year": 2024, 448 "arxiv_id": "2405.11403", 449 "relevance": "Multi-agent system with four specialized agents for competitive programming, demonstrating agentic code generation." 450 }, 451 { 452 "title": "Self-collaboration Code Generation via ChatGPT", 453 "authors": ["Y. Dong", "X. Jiang", "Z. Jin", "G. Li"], 454 "year": 2023, 455 "arxiv_id": "2304.07590", 456 "relevance": "Introduced the Analyst-Coder-Tester framework that this paper directly builds upon and evaluates." 457 }, 458 { 459 "title": "Debug like a Human: A Large Language Model Debugger via Verifying Runtime Execution Step-by-step", 460 "authors": ["L. Zhong", "Z. Wang", "J. Shang"], 461 "year": 2024, 462 "arxiv_id": "2402.16906", 463 "relevance": "The LDB debugger framework that forms the debugging component of this paper's approach." 464 }, 465 { 466 "title": "ChatDev: Communicative Agents for Software Development", 467 "authors": ["C. Qian"], 468 "year": 2023, 469 "arxiv_id": "2307.07924", 470 "relevance": "Multi-agent software development framework demonstrating role-playing agents for code generation." 471 }, 472 { 473 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 474 "authors": ["S. Hong"], 475 "year": 2023, 476 "arxiv_id": "2308.00352", 477 "relevance": "Multi-agent framework for software development using meta-programming approaches." 478 }, 479 { 480 "title": "CYCLE: Learning to Self-Refine the Code Generation", 481 "authors": ["Y. Ding", "M. J. Min", "G. Kaiser", "B. Ray"], 482 "year": 2024, 483 "doi": "10.1145/3649825", 484 "relevance": "Self-refinement approach for code generation using execution feedback, directly relevant to debugging strategies." 485 }, 486 { 487 "title": "RGD: Multi-LLM Based Agent Debugger via Refinement and Generation Guidance", 488 "authors": ["H. Jin", "Z. Sun", "H. 
Chen"], 489 "year": 2024, 490 "arxiv_id": "2410.01242", 491 "relevance": "Multi-LLM debugging framework with Guide, Debug, and Feedback agents for iterative code refinement." 492 }, 493 { 494 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 495 "authors": ["N. Shinn"], 496 "year": 2023, 497 "arxiv_id": "2303.11366", 498 "relevance": "Verbal reinforcement learning for LLM agents; combined with LDB achieves 98.2 on HumanEval, directly motivating this study." 499 }, 500 { 501 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 502 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 503 "relevance": "Introduced HumanEval+ with 80x more tests, used as the rigorousness benchmark in this study." 504 }, 505 { 506 "title": "SOEN-101: Code Generation by Emulating Software Process Models Using Large Language Model Agents", 507 "authors": ["F. Lin", "D. J. Kim", "T.-H. Chen"], 508 "year": 2024, 509 "arxiv_id": "2403.15852", 510 "relevance": "Multi-agent framework simulating software process models (Waterfall, TDD, Scrum) for code generation." 511 }, 512 { 513 "title": "From Code to Correctness: Closing the Last Mile of Code Generation with Hierarchical Debugging", 514 "authors": ["Y. Shi", "S. Wang", "C. Wan", "X. Gu"], 515 "year": 2024, 516 "arxiv_id": "2410.01215", 517 "relevance": "Hierarchical debugging tool (MGDebugger) that addresses code bugs at varying granularity levels." 518 }, 519 { 520 "title": "Teaching Large Language Models to Self-Debug", 521 "authors": ["X. Chen", "M. Lin", "N. Schärli", "D. Zhou"], 522 "year": 2023, 523 "arxiv_id": "2304.05128", 524 "relevance": "Foundational work on LLM self-debugging through iterative generation, explanation, and execution feedback." 525 } 526 ] 527 }