scan.json (28290B)
1 { 2 "paper": { 3 "title": "Collaborative Agents for Automated Program Repair in Ruby", 4 "authors": [ 5 "Nikta Akbarpour", 6 "Mahdieh Sadat Benis", 7 "Fatemeh Hendijani Fard", 8 "Ali Ouni", 9 "Mohamed Aymen Saied" 10 ], 11 "year": 2025, 12 "venue": "arXiv preprint (submitted to ACM conference)", 13 "arxiv_id": "2511.03925" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper provides a Figshare replication package link: https://figshare.com/s/829875edc8c876c50de5. Section 8 states 'Full replication package and experimental data are available at' this URL. The introduction also states 'Releasing all scripts and experimental results as open source to support replication and future research.'" 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The evaluation uses the publicly available xCodeEval benchmark (Khan et al. 2024). The replication package on Figshare includes experimental data. The benchmark dataset is publicly available." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "Section 3.3 mentions GPU hardware (NVIDIA Tesla V100, NVIDIA H100 SXM5) and 4-bit quantization, but there is no mention of a requirements.txt, Dockerfile, conda environment, or specific library versions. The environment specification is insufficient to recreate the setup." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "While the replication package is provided on Figshare, the paper itself does not include step-by-step reproduction instructions, a README description of how to run experiments, or specific commands. The paper only provides a link to the replication package without detailing what it contains or how to use it." 
36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": false, 42 "justification": "All results are reported as point estimates (e.g., 67.0% pass@1) without confidence intervals or error bars. No uncertainty quantification is provided." 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper claims RAMP 'outperforms' baselines based solely on comparing pass@1 numbers (e.g., 67.0% vs 61.7%) without any statistical significance tests. Section 3.4 explicitly states 'As this is a deterministic generation with no randomness, the results are comparable without requiring statistical tests,' but this justification is weak given the small sample sizes (34 questions in RQ1)." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper consistently reports percentage point differences with baseline context. For example, ablation studies report 'pass@1 dropping from 66.5% to 48.4% (−18.1 points)' and comparative results give both raw values (e.g., RAMP 67.0% vs LANTERN 61.7%). This provides enough context to understand effect magnitudes." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "The RQ1 comparison uses only 34 Ruby samples (10% of validation set). The paper acknowledges this was 'due to resource constraints' (Section 6) but does not justify whether 34 samples provide adequate statistical power for the claims made. No power analysis is presented." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "Section 3.4 states they use greedy decoding (deterministic, single sample), so there is no variance across runs. However, this means the paper reports single-run results with no measure of variability. No standard deviation or variance is reported for any experiment." 
63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Section 3.5 describes six baselines: LANTERN, ChatRepair, Self-Planning, Self-Collaboration, Few-Shot, and Zero-Shot prompting. Table 1 compares all methods on Ruby." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "LANTERN (2025), ChatRepair (2024), Self-Planning (2024), and Self-Collaboration (2024) are all recent approaches (RepairAgent (2025) is also cited, but it is not one of the six baselines listed in Section 3.5). The baselines are contemporary and represent the current state of the art in LLM-based APR." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": true, 79 "justification": "RQ2 (Table 2) presents ablation studies removing test generation, self-reflection, first reflection, all reflection, and adding specification inference. RQ3 (Table 4) ablates prompt components (I/O specs, time/memory limits, sample I/O). These ablations are conducted across two models." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper uses only pass@1 as its evaluation metric. Section 3.4 justifies this choice but acknowledges it 'may miss qualities such as efficiency and readability' (Section 6). No other metrics (e.g., exact match, edit distance, code quality) are reported." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": false, 89 "justification": "No human evaluation is included. The paper evaluates repairs solely through automated test execution. Given the paper's claims about practical repair utility, human evaluation of repair quality (e.g., readability, correctness beyond test passing) would be relevant." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "The xCodeEval benchmark provides hidden test cases that are used only for final evaluation (Section 3.2, Step 5). 
Generated tests are used during the iterative repair process, and hidden benchmark tests are reserved for final validation, providing a clear separation." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "RQ4 provides breakdowns by difficulty level (Figure 4 left), bug execution outcome type (Figure 5 right), and problem domain tags (Figure 5 left). Table 3 provides per-iteration breakdowns of test case outcomes." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "RQ4 discusses failure categories: RAMP struggles with TIME_LIMIT_EXCEEDED (40.0% success), advanced categories (binary search, bitmasks, matrices at 0%), and higher-difficulty problems. Section 4.4 and the Discussion section address these limitations." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper reports several negative results: SCoT performs worse than CoT (Table 4), specification-inference reflection reduces DeepSeekCoder performance (-2.7 points in Table 2), adding sample I/O pairs decreases performance (-2.7 points in Table 4), and Figure 3 left shows regressions where previously correct repairs become incorrect in later iterations." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims RAMP achieves 'pass@1 of 67% on Ruby, outperforming prior approaches' (supported by Table 1), converges within five iterations (supported by Figure 2 left and Figure 3 right), and that test generation and self-reflection are key drivers (supported by Table 2 ablations). All claims match the results." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper makes causal claims through ablation studies (e.g., 'removing test generation...substantially degrades performance'). 
The ablation design—controlled single-variable manipulation—is adequate for these causal claims. Each component is removed individually while holding others constant." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": false, 126 "justification": "The title says 'Automated Program Repair in Ruby' but the evaluation is on competitive-programming-style tasks from xCodeEval (short, single-file programs with well-defined I/O). While Section 6 acknowledges this limitation, the abstract and title do not bound the claims to competitive programming. The paper also claims RAMP provides 'new insights into multi-agent repair strategies' and is 'a practical solution for APR in Ruby' without bounding to the tested setting." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": true, 131 "justification": "The Discussion section considers alternative explanations: SCoT's lower performance may be due to 'rigid structures constrain[ing] the model's flexibility' (Section 5). The threats to validity section discusses prompt sensitivity, parameter choices, and benchmark representativeness as factors that could affect results. Section 4.2 discusses why Qwen Coder may be insensitive to ablations." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Section 3.3 specifies 'DeepSeek-Coder 6.7B-Instruct' and 'Qwen2.5-Coder-7B-Instruct' with specific parameter counts. These are specific open-source model versions with known architectures and weights, unlike proprietary API-based models where version ambiguity is problematic." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": false, 143 "justification": "Section 3.1 states 'Due to space limitations, the prompts used for each LLM-based agent are included in the replication package.' The prompts are not in the paper or appendix itself. 
While they may be in the Figshare package, the paper describes prompts only in natural language (e.g., 'the model is prompted to generate only a natural language explanation')." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 3.3 reports: temperature 0.8 for code generation, 0.1 for other agents, top_p=0.95, single candidate per bug, 4-bit quantization, and 11 iterations. For baselines, 'we use the hyperparameters specified in their original implementations.'" 149 }, 150 "scaffolding_described": { 151 "applies": true, 152 "answer": true, 153 "justification": "The multi-agent scaffolding is described in detail in Section 3.1: four agents (Feedback Integrator, Test Designer, Programmer, Test Executor), their roles, data flow between them (Figure 1), the iterative loop, and stopping conditions. Section 3.1.1-3.1.4 describes each agent's specific function." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 3.2 documents the benchmark selection: xCodeEval validation set, 5,068 samples across 11 languages, 343 Ruby samples. Section 4 explains the 10% sampling strategy for RQ1 ('sampled to preserve the original language and difficulty distribution') with 34 Ruby questions." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 6 'Threats to Validity' provides a dedicated, substantive section covering internal, external, construct, and conclusion validity threats across multiple paragraphs." 
166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 6 discusses specific threats: 'RAMP relies on LLM-generated test cases for intermediate feedback; these test cases may not always be correct,' 'the behavior of individual agents...is sensitive to prompt design,' 'We conducted experiments on the XCodeEval benchmark, which covers competitive-programming-style tasks. These problems may not fully reflect the complexity or diversity of real-world Ruby projects,' and 'RQ1 evaluations were performed on a 10% subset.'" 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 6 states: 'These problems may not fully reflect the complexity or diversity of real-world Ruby projects.' Section 2 explicitly scopes: 'our focus is on competitive-programming style tasks: short, single-file programs with well-defined I/O-based test cases.' Section 6 also notes: 'we did not systematically investigate' generalization to other languages." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 8 provides a Figshare link with 'full replication package and experimental data.' The underlying xCodeEval benchmark is also publicly available. Per-task identifiers are mentioned in Section 6." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 3.2 describes the data source: xCodeEval benchmark validation set with 5,068 samples across 11 languages, 343 Ruby samples. Each instance includes buggy code, problem description, unit tests, reference solution, execution outcome, problem tags, and difficulty level." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants are involved. The data comes from a standard benchmark (xCodeEval), so recruitment methods do not apply." 
193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The data pipeline is documented: xCodeEval validation set → Ruby subset (343 samples) → 10% subsample for RQ1 (34 samples, preserving language and difficulty distribution) → full set for RQ2-RQ4. Section 3.2 and Section 4 describe these steps." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": false, 204 "justification": "No funding or acknowledgments section is present in the paper. There is no mention of funding sources." 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: University of British Columbia (Canada), Ecole de Technologie Superieure (Canada), Laval University (Canada). These are academic institutions with no apparent conflict regarding the evaluated tools." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure means this criterion is not satisfied." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests statement or financial disclosure is present in the paper." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper uses DeepSeek-Coder 6.7B-Instruct and Qwen2.5-Coder-7B-Instruct but does not state the training data cutoff dates for either model. Since these models are evaluated on xCodeEval (a public benchmark), knowing the training cutoff is important for assessing contamination." 
227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": false, 231 "justification": "No discussion of whether the xCodeEval benchmark data may have been in the training data of DeepSeek-Coder or Qwen2.5-Coder. The xCodeEval paper was published in 2024, and the models were trained on code data that may include competitive programming problems." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": false, 236 "justification": "xCodeEval includes competitive programming problems that may have been available online before the models' training cutoffs. No contamination analysis is performed. This is particularly relevant because the models are code-specialized and likely trained on competitive programming data." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study. It is a benchmark evaluation of an automated program repair system." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved." 
274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": true, 280 "justification": "The Discussion section reports wall-clock time for all methods: RAMP takes ~6.6 hours, RAMP with early stopping ~5.1 hours, Self-Planning ~2.4 hours, Few-Shot ~6.4 hours, Zero-Shot is very fast, and LANTERN ~147 hours. GPU memory snapshots are also provided (24,229.25 MB GPU for full run). Figure 6 left visualizes the time-accuracy trade-off." 281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": true, 285 "justification": "Section 3.3 specifies hardware: NVIDIA Tesla V100 (32GB) for RAMP with DeepSeekCoder, NVIDIA H100 SXM5 (80GB) for Qwen experiments and baselines. The Discussion section reports GPU memory usage (24,229.25 MB / 29,269.25 MB), CPU utilization (4.5% / 10.4%), and RAM (1,748.66 MB / 2,215.67 MB)." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "RAMP achieves the highest pass@1 on Ruby (67.0%), outperforming LANTERN (61.7%), Self-Planning (56.0%), and other baselines.", 292 "evidence": "Table 1 shows pass@1 across all methods on the Ruby subset of xCodeEval (10% validation set, 34 samples). RAMP achieves 67.0% vs LANTERN 61.7%, Self-Planning 56.0%, Few-Shot 47.5%, Zero-Shot 24.1%, ChatRepair 17.6%, Self-Collaboration 0.0%.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "RAMP converges quickly, reaching peak performance within five iterations.", 297 "evidence": "Figure 2 left and Figure 3 right show cumulative pass@1 rising sharply in early iterations and plateauing by iteration 5. 
The text in Section 4.1 states 'performance rises from 55.0% at iteration 0 to 67.0% by iteration five, after which results plateau.'", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Test generation and self-reflection are key drivers of RAMP's performance, with removal causing up to 19.3 percentage point drops.", 302 "evidence": "Table 2 shows ablation results: removing test generation drops DeepSeekCoder pass@1 from 66.5% to 48.4% (−18.1 points); removing self-reflection drops it from 66.5% to 47.2% (−19.3 points). However, Qwen Coder shows minimal sensitivity (<1.5 points change).", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "RAMP is particularly effective at repairing WRONG_ANSWER (68.5%), COMPILATION_ERROR (66.7%), and RUNTIME_ERROR (60.4%) cases.", 307 "evidence": "Figure 5 right and Section 4.4 report pass@1 per execution outcome category on the full Ruby validation set (343 samples). WRONG_ANSWER: 68.5%, COMPILATION_ERROR: 66.7%, RUNTIME_ERROR: 60.4%, TIME_LIMIT_EXCEEDED: 40.0%.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "RAMP offers the best accuracy-time trade-off among evaluated methods for Ruby repair.", 312 "evidence": "Figure 6 left and the Discussion section show RAMP achieves 67% pass@1 in ~6.6 hours, while LANTERN achieves 61.7% in ~147 hours. Self-Planning is faster (~2.4h) but lower accuracy (56.0%). However, LANTERN's time includes multi-language processing, making the comparison not fully apples-to-apples.", 313 "supported": "moderate" 314 } 315 ], 316 "methodology_tags": [ 317 "benchmark-eval" 318 ], 319 "key_findings": "RAMP, a multi-agent framework for automated program repair in Ruby, achieves 67.0% pass@1 on the xCodeEval benchmark, outperforming prior methods including LANTERN (61.7%) on a 34-sample subset. Ablation studies reveal that test generation and self-reflection are critical for DeepSeekCoder (up to 19.3 percentage point impact) but have minimal effect on Qwen Coder. 
RAMP converges within five iterations and is most effective on WRONG_ANSWER and COMPILATION_ERROR cases, while struggling with resource-related failures and advanced algorithmic categories. The framework demonstrates practical portability by achieving competitive results on C++ with minimal adaptation.", 320 "red_flags": [ 321 { 322 "flag": "Very small comparison sample size", 323 "detail": "The head-to-head comparison with baselines (RQ1) uses only 34 Ruby samples (10% subset). At this sample size, a 5.3 percentage point difference between RAMP (67.0%) and LANTERN (61.7%) could easily be due to chance—this corresponds to roughly 1-2 additional problems solved. No statistical testing is performed." 324 }, 325 { 326 "flag": "No statistical significance testing", 327 "detail": "The paper claims RAMP 'substantially outperforms' baselines based on raw percentage comparisons without any significance tests. The justification that 'deterministic generation' eliminates the need for statistics ignores the sampling variability from the benchmark subset and the small sample size." 328 }, 329 { 330 "flag": "Inconsistent ablation results across models", 331 "detail": "Ablation results are highly model-dependent: test generation and self-reflection produce 18-19 point improvements for DeepSeekCoder but <1.5 points for Qwen Coder (Table 2). This suggests the claimed benefits of the multi-agent design may be model-specific rather than general, but the paper's conclusions don't adequately bound this." 332 }, 333 { 334 "flag": "Benchmark contamination risk unaddressed", 335 "detail": "DeepSeek-Coder and Qwen2.5-Coder are trained on large code corpora that likely include competitive programming problems. xCodeEval draws from such problems. No contamination analysis is performed despite this being a known risk for code LLMs evaluated on public benchmarks." 336 }, 337 { 338 "flag": "Single evaluation metric", 339 "detail": "Only pass@1 is reported. 
No metrics for code quality, readability, efficiency, or similarity to reference solutions are used. A repaired program that passes tests but is poorly written or inefficient would still count as a success." 340 }, 341 { 342 "flag": "Greedy decoding with no variance estimation", 343 "detail": "Using greedy (deterministic) decoding means results are based on a single run with no variance estimation. While deterministic, the results are fragile—temperature variations or different sampling could yield meaningfully different outcomes, especially on 34 samples." 344 } 345 ], 346 "cited_papers": [ 347 { 348 "title": "AutoCodeRover: Autonomous Program Improvement", 349 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 350 "year": 2024, 351 "doi": "10.1145/3650212.3680384", 352 "relevance": "Autonomous multi-agent framework for repository-level program repair using AST navigation and fault localization from GitHub issues." 353 }, 354 { 355 "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair", 356 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 357 "year": 2025, 358 "doi": "10.1109/ICSE55347.2025.00157", 359 "relevance": "LLM-based autonomous agent for program repair, representative of the newest wave of multi-agent APR systems." 360 }, 361 { 362 "title": "MAGIS: LLM-based multi-agent framework for GitHub issue ReSolution", 363 "authors": ["Wei Tao", "Yucheng Zhou", "Yanlin Wang", "Wenqiang Zhang", "Hongyu Zhang", "Yu Cheng"], 364 "year": 2025, 365 "relevance": "Multi-agent LLM framework targeting GitHub issue resolution, relevant to agentic software engineering workflows." 
366 }, 367 { 368 "title": "SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement", 369 "authors": ["Antonis Antoniades", "Albert Örwall", "Kexun Zhang", "Yuxi Xie", "Anirudh Goyal", "William Yang Wang"], 370 "year": 2025, 371 "relevance": "Combines MCTS with multi-agent iterative refinement for software engineering tasks, advancing agentic search strategies." 372 }, 373 { 374 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 375 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 376 "year": 2025, 377 "relevance": "Open platform for AI software engineering agents, relevant to understanding agentic coding tool infrastructure." 378 }, 379 { 380 "title": "Demystifying LLM-Based Software Engineering Agents", 381 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 382 "year": 2025, 383 "doi": "10.1145/3715754", 384 "relevance": "Analysis of LLM-based SE agents, directly relevant to understanding the agentic AI paradigm for software tasks." 385 }, 386 { 387 "title": "A Survey of LLM-based Automated Program Repair: Taxonomies, Design Paradigms, and Applications", 388 "authors": ["Boyang Yang", "Zijian Cai", "Fengling Liu", "Bach Le", "Lingming Zhang"], 389 "year": 2025, 390 "relevance": "Comprehensive survey of LLM-based APR covering taxonomy and design paradigms, relevant as a survey-of-surveys candidate." 391 }, 392 { 393 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt", 394 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 395 "year": 2024, 396 "relevance": "ChatRepair demonstrates conversational LLM-based repair with cost reporting, a key baseline for agentic repair approaches." 
397 }, 398 { 399 "title": "FixAgent: A Unified Debugging Approach via LLM-Based Multi-Agent Synergy", 400 "authors": ["Cheryl Lee", "Chunqiu Steven Xia", "Jen-tse Huang"], 401 "year": 2024, 402 "arxiv_id": "2404.17153", 403 "relevance": "Multi-agent debugging framework aligning with cognitive models, relevant to understanding collaborative agent designs for SE." 404 }, 405 { 406 "title": "Reflexion: Language agents with verbal reinforcement learning", 407 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 408 "year": 2023, 409 "relevance": "Foundational work on self-reflection in language agents, directly relevant to the reflection mechanism used in RAMP." 410 }, 411 { 412 "title": "A systematic literature review on large language models for automated program repair", 413 "authors": ["Quanjun Zhang", "Chunrong Fang", "Yang Xie"], 414 "year": 2024, 415 "relevance": "Systematic review of LLM-based APR revealing language coverage skew and evaluation gaps." 416 }, 417 { 418 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence", 419 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 420 "year": 2024, 421 "arxiv_id": "2401.14196", 422 "relevance": "Technical report for DeepSeek-Coder, one of the backbone models used in the RAMP evaluation." 423 } 424 ] 425 }