scan.json (24860B)
1 { 2 "paper": { 3 "title": "ARCS: Agentic Retrieval-Augmented Code Synthesis with Iterative Refinement", 4 "authors": [ 5 "Manish Bhattarai", 6 "Miguel Cordova", 7 "Minh Vu", 8 "Javier E. Santos", 9 "Ismael Boureima", 10 "Daniel O'Malley" 11 ], 12 "year": 2025, 13 "venue": "arXiv", 14 "arxiv_id": "2504.20434" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. Section 3.5 describes implementation details but does not release the code." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The benchmarks used (HumanEval, TransCoder) are publicly available. The LANL corpus references four public repositories on github.com/lanl (pyDNMFk, pyDNTNk, AdversarialTensors, EPBD_BERT). All evaluation data is publicly accessible." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "Section 3.5 mentions 'modular Python code' running on the SambaNova Systems platform, and Section 3.2 mentions Docker sandbox details (10s wall-clock, 4GB memory), but no requirements.txt, Dockerfile, conda environment, or library version listing is provided." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "While Section 3.5 describes system-level details (versioned checkpoints, fixed seeds, FAISS indices, complete logging), no step-by-step reproduction instructions, README with commands, or scripts are provided." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results in Tables 1-5 are reported as point estimates (e.g., '83.5%', '87.2%') with no confidence intervals, error bars, or ± notation." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes comparative claims (e.g., 'ARCS surpasses CodeAgent') but provides no statistical significance tests. Comparisons rest solely on raw point estimates." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "Tables 2 and 5 report improvements with baseline context, e.g., '+10.9 pp' from 72.6% to 83.5% in the ablation (Table 2), and '+0.115' CodeBLEU improvement from 0.289 to 0.404 (Table 5). This provides sufficient context for the reader to assess magnitude." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "HumanEval has 164 problems and is used as-is with no discussion of whether this sample size is adequate for the claims made. No power analysis or sample size justification is provided." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "Section 4 states 'a single evaluation run per setting' — no variance, standard deviation, or results across multiple runs are reported. The paper explicitly uses a single fixed seed (42)." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Table 1 compares against GPT-3.5-Turbo baseline, CodeAgent, and RethinkMCTS. Table 5 compares against a one-shot RAG baseline. Table 4 compares against TransCoder." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "CodeAgent (2024) and RethinkMCTS (2023) are recent agentic code generation methods. The baselines represent contemporary approaches in the space." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Table 2 provides a thorough ablation on HumanEval, systematically adding/removing retrieval, CoT, and execution feedback components. Table 3 provides cross-backbone and cross-tier results." 
81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper uses pass@1 for HumanEval, translation accuracy for TransCoder, and CodeBLEU (broken into four sub-metrics: n-gram, weighted n-gram, syntax, dataflow) for the LANL corpus." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "No human evaluation of the generated code is included. All evaluation is automated (test suite pass/fail, CodeBLEU). For a code generation system, human assessment of code quality, readability, or correctness beyond test suites would be relevant." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "HumanEval uses hidden unit tests that the model does not see during generation. TransCoder similarly uses test-suite-based evaluation. Section 4 states 'the retrieval corpus excludes benchmark references' to avoid contamination." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Table 3 breaks down results by backbone and tier. Table 4 breaks down TransCoder results by translation direction (C++→Py, Py→C++, Java→Py, Java→C++). Table 5 breaks CodeBLEU into four sub-metrics." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 4.7 discusses several failure modes: Medium underperforming Small on simple functions due to over-structuring, residual near-duplicates inflating retrieval utility, unit tests penalizing semantically acceptable variants, and underspecified tests overestimating correctness." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Medium tier underperforms Small in several settings (Table 1: 76.8% vs 79.9%; Table 3 shows this pattern across backbones). This is acknowledged and explained in Section 4.4 as over-structuring without verification." 
111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims '87.2% pass@1 with Llama-3.1-405B' (confirmed in Table 3), 'surpassing CodeAgent (82.3%)' (confirmed in Table 1 with ARCS Large at 83.5% on 70B), '≥90% accuracy on most translation pairs' (confirmed in Table 4), and '+0.115 CodeBLEU' (confirmed in Table 5). All quoted figures match the tables, although the CodeAgent comparison is across different backbones (Llama-3.1-70B vs GPT-3.5-Turbo) and is therefore indirect." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper makes causal claims through ablations (Table 2) that systematically add components, showing each contributes measurably. The ablation design is controlled single-variable manipulation, adequate for the causal claims made about component contributions." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "Section 4.7 explicitly bounds scope: 'we evaluated mostly function-level synthesis and TransCoder-style translation; repository-level tasks, richer languages/runtimes, and integration with formal tools are promising avenues.' External baselines are qualified: 'scores are indicative rather than strictly comparable across different backbones.'" 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 4.7 discusses several alternative explanations: residual near-duplicates may inflate retrieval utility, unit test limitations (literal equality vs semantic equivalence), test-suite-relative correctness, and backbone differences making comparisons indicative rather than conclusive." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 3.2 specifies exact model versions: 'Frozen Llama-3.x checkpoints (Meta-Llama-3.1-70B/3.3-70B/3.1-405B Instruct)'. The embedding model is also specified: 'all-MiniLM-L6-v2'. 
These are versioned model identifiers." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper describes prompts conceptually (Section 3.2 describes the planner producing 'a typed I/O contract, a pseudocode sketch, and up to K≤4 named subgoals') but does not provide actual prompt text used in experiments. No appendix with prompt templates or fill values." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Section 3.2 states 'nucleus sampling (temperature 0.7, top-p = 0.95, Lmax = 512, seed 42)'. Section 3.4 gives default parameters: 'k = 10 (retrieval budget), B = 5 (iteration budget), K ≤ 4 (max subgoals)'. Redundancy threshold δ = 0.85 is also specified." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "Section 3 provides a detailed description of the agentic scaffolding: the synthesize-execute-repair loop, planner, retriever, context renderer, executor (Docker sandbox), and repair/refresh mechanisms. Algorithm 1 formalizes the control flow. Figure 1 gives a visual overview." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 3.2 describes the retrieval corpus construction: metadata extraction (signatures, docstrings, comments), embedding with all-MiniLM-L6-v2, FAISS indexing, deduplication via cosine threshold (δ=0.85), and deprecated API filtering via denylist. Section 4 states the retrieval corpus excludes benchmark references and is filtered for near-duplicates." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 4.7 is titled 'Discussion and Limitations' and provides substantive discussion of multiple limitations spanning two paragraphs." 
167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 4.7 identifies specific threats: different backbones across baselines making comparisons indicative, residual near-duplicates inflating retrieval utility despite filtering, unit test limitations (literal equality penalizing valid variants), and test-suite-relative correctness." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 4.7 explicitly states: 'we evaluated mostly function-level synthesis and TransCoder-style translation; repository-level tasks, richer languages/runtimes, and integration with formal tools are promising avenues.' Section 4 also states 'scores are indicative rather than strictly comparable across different backbones.'" 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw experimental data (individual problem results, logs, intermediate outputs) is released. Section 3.5 mentions 'complete logging of (qt, st, q't, ĉt, ft) per round' but these logs are not made available." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 4.1 describes the benchmarks: HumanEval (164 Python problems with hidden unit tests), TransCoder (translation among Python, Java, C++), and LANL corpus (four repositories from github.com/lanl with prompts from READMEs and source code). The protocol is described in Section 4." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants in this study. The data sources are standard benchmarks and public code repositories." 
194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The data pipeline is documented: corpus construction (Section 3.2: metadata extraction, embedding, indexing, deduplication), benchmark setup (Section 4.1), and experimental protocol (Section 4: frozen indices, fixed seeds, deterministic execution). Section 4 also describes corpus filtering to exclude benchmark references." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding statement, acknowledgments section, or grant numbers are provided in the paper. The authors are all from Los Alamos National Laboratory, a government-funded institution, but no specific funding is disclosed." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All six authors list their affiliation as Los Alamos National Laboratory (Theoretical Division or Earth and Environmental Sciences Division) with email addresses." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding disclosure is provided, so independence cannot be assessed. The work uses the SambaNova Systems platform (Section 3.5), and the LANL corpus evaluation is run on LANL's own repositories, creating a potential conflict that is not addressed." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or financial disclosure is present in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper uses frozen Llama-3.x checkpoints but does not state the training data cutoff date for these models. HumanEval (published 2021) could be in the training data." 
228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No discussion of whether HumanEval or TransCoder problems appeared in Llama-3.x training data. Section 4 mentions the retrieval corpus excludes benchmark references, but this addresses index contamination, not model training contamination." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "HumanEval was published in 2021, well before Llama-3.x training. TransCoder benchmarks are also pre-existing. The paper addresses retrieval index contamination (Section 4: 'retrieval corpus excludes benchmark references') but does not address training data contamination for the LLM itself." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "Despite deriving a cost bound formula (Proposition 3, Eq. 
18), the paper does not report actual inference costs, API costs, tokens consumed, or wall-clock time for experiments. The abstract mentions 'comparable wall-clock time' but no actual timing data is provided." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "Section 3.5 mentions 'SambaNova Systems platform' but does not state GPU hours, total API spend, hardware specs, or training/inference time. The theoretical cost bound in Proposition 3 is a formula, not an actual measurement." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "ARCS achieves up to 87.2% pass@1 on HumanEval with Llama-3.1-405B-Instruct", 293 "evidence": "Table 3 shows ARCS Large with Llama-3.1-405B-Instruct achieving 87.2% pass@1 on HumanEval.", 294 "supported": "moderate" 295 }, 296 { 297 "claim": "ARCS surpasses CodeAgent (82.3%) on HumanEval while using simpler control than tree-search methods", 298 "evidence": "Table 1 shows ARCS Large at 83.5% with Llama-3.1-70B vs CodeAgent at 82.3% with GPT-3.5-Turbo. However, these use different backbones, making the comparison indirect.", 299 "supported": "weak" 300 }, 301 { 302 "claim": "On TransCoder, ARCS achieves ≥90% accuracy on most translation pairs", 303 "evidence": "Table 4 shows ≥90% on many but not all translation pairs. Python→C++ consistently falls below 90% across backbones. With Llama-3.1-405B, 3/4 directions are ≥90%.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "ARCS improves CodeBLEU by +0.115 over baseline RAG on the LANL scientific corpus", 308 "evidence": "Table 5 shows overall CodeBLEU of 0.404 for ARCS Large vs 0.289 for Basic RAG, a +0.115 improvement.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "The combination of retrieval, CoT, and execution feedback is super-additive", 313 "evidence": "Table 2 ablation: individual components add 2.2-3.5 pp, pairwise 4.7-6.5 pp, but the full combination adds 10.9 pp (greater than the sum of individual gains). 
However, this is a single run with no variance or significance testing.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "ARCS provides monotonic improvement and bounded termination guarantees", 318 "evidence": "Propositions 1-3 in Section 3.3 formally prove these properties. Monotonic improvement follows trivially from best-so-far tracking (the argmax over a growing set). Bounded termination follows from the fixed iteration budget B.", 319 "supported": "strong" 320 } 321 ], 322 "methodology_tags": [ 323 "benchmark-eval", 324 "theoretical" 325 ], 326 "key_findings": "ARCS achieves up to 87.2% pass@1 on HumanEval using Llama-3.1-405B with a retrieval-before-generation and execution-verified repair loop, though comparisons to baselines like CodeAgent use different backbone models. Ablation analysis shows retrieval, CoT planning, and execution feedback each contribute measurably, with their combination yielding super-additive gains (+10.9 pp over one-shot baseline). The tiered design (Small/Medium/Large) reveals that structured planning without verification can hurt on simple tasks, and ARCS demonstrates +0.115 CodeBLEU improvement on domain-specific scientific code over a basic RAG baseline.", 327 "red_flags": [ 328 { 329 "flag": "Cross-backbone comparisons presented as head-to-head", 330 "detail": "Table 1 compares ARCS (Llama-3.1-70B) against CodeAgent and RethinkMCTS (GPT-3.5-Turbo). The abstract and Section 4.2 describe ARCS as 'surpassing CodeAgent' but these use different backbones. While Section 4.7 acknowledges this, the abstract claim is misleading." 331 }, 332 { 333 "flag": "Single-run results with no variance or significance testing", 334 "detail": "Section 4 explicitly states 'a single evaluation run per setting' with a fixed seed (42). No error bars, confidence intervals, or significance tests are reported. All comparative claims rest on point estimates from a single run." 
335 }, 336 { 337 "flag": "No actual cost data despite cost being a design principle", 338 "detail": "The paper emphasizes 'bounded cost' as a key contribution (Proposition 3) and mentions 'comparable wall-clock time' in the abstract, but reports zero actual timing or cost measurements. The theoretical cost bound (Eq. 18) is vacuous without instantiated values." 339 }, 340 { 341 "flag": "Training data contamination unaddressed for standard benchmarks", 342 "detail": "HumanEval (2021) and TransCoder (2020) are well-known public benchmarks that predate Llama-3.x training. The paper addresses retrieval index contamination but not LLM training data contamination, which could inflate baseline and ARCS scores alike." 343 }, 344 { 345 "flag": "Self-evaluation on own repositories", 346 "detail": "The LANL scientific corpus test uses four LANL repositories. The authors are LANL employees and may have special familiarity with these codebases, potentially biasing the setup (prompt construction, test design) in favor of their system." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "Evaluating large language models trained on code", 352 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 353 "year": 2021, 354 "arxiv_id": "2107.03374", 355 "relevance": "Introduces HumanEval, one of the primary benchmarks used in this paper and widely used for LLM code generation evaluation." 356 }, 357 { 358 "title": "CodeAgent: Enhancing code generation with tool-integrated agent systems for real-world repo-level coding challenges", 359 "authors": ["Kechi Zhang", "Jia Li", "Ge Li"], 360 "year": 2024, 361 "arxiv_id": "2401.07339", 362 "relevance": "Key agentic code generation baseline that ARCS is compared against; represents tool-using agent approach to code synthesis." 
363 }, 364 { 365 "title": "Competition-level code generation with AlphaCode", 366 "authors": ["Yujia Li", "David Choi"], 367 "year": 2022, 368 "doi": "10.1126/science.abq1158", 369 "relevance": "Foundational work on large-scale code generation with execution filtering; contrasts with ARCS's lightweight iterative approach." 370 }, 371 { 372 "title": "RethinkMCTS: A reinforcement learning approach to iterative code refinement", 373 "authors": ["Alice Brown", "Bob Green"], 374 "year": 2023, 375 "relevance": "Tree-search baseline for code refinement that ARCS positions against as a simpler alternative; reports higher HumanEval score but with heavier computation." 376 }, 377 { 378 "title": "Structured chain-of-thought prompting for code generation", 379 "authors": ["Jia Li", "Ge Li", "Yongmin Li", "Zhi Jin"], 380 "year": 2023, 381 "arxiv_id": "2305.06599", 382 "relevance": "Demonstrates structured CoT prompting for code generation, a technique ARCS incorporates as its planning component." 383 }, 384 { 385 "title": "An empirical study of retrieval-augmented code generation: Challenges and opportunities", 386 "authors": ["Zezhou Yang", "Sirong Chen", "Cuiyun Gao"], 387 "year": 2025, 388 "arxiv_id": "2501.13742", 389 "relevance": "Empirical study of RAG for code generation, directly relevant to assessing retrieval-augmented approaches like ARCS." 390 }, 391 { 392 "title": "Enhancing code translation in language models with few-shot learning via retrieval-augmented generation", 393 "authors": ["Manish Bhattarai", "Javier E Santos"], 394 "year": 2024, 395 "arxiv_id": "2407.19619", 396 "relevance": "Prior work by the same authors on RAG for code translation, providing context for ARCS's development." 
397 }, 398 { 399 "title": "ExeCoder: Empowering large language models with executability representation for code translation", 400 "authors": ["Minghua He", "Fangkai Yang"], 401 "year": 2025, 402 "arxiv_id": "2501.18460", 403 "relevance": "Integrates execution feedback during decoding/training for code translation, representing an alternative approach to execution-guided code generation." 404 }, 405 { 406 "title": "DeepSeek-Coder-V2: Breaking the barrier of closed-source models in code intelligence", 407 "authors": ["Qihao Zhu", "Daya Guo"], 408 "year": 2024, 409 "arxiv_id": "2406.11931", 410 "relevance": "Open-source code model that integrates execution-reward training, relevant to the landscape of LLM-based code generation." 411 }, 412 { 413 "title": "TransCoder: Unsupervised translation of programming languages", 414 "authors": ["R. Rozière"], 415 "year": 2020, 416 "arxiv_id": "2006.03511", 417 "relevance": "Introduces the TransCoder benchmark used in the paper's evaluation of cross-language code translation." 418 } 419 ] 420 }