scan.json (19684B)
1 { 2 "paper": { 3 "title": "EVOR: Evolving Retrieval for Code Generation", 4 "authors": ["Hongjin Su", "Shuyang Jiang", "Yuhang Lai", "Haoyuan Wu", "Boao Shi", "Che Liu", "Qian Liu", "Tao Yu"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2402.12317" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The abstract states 'Our model, code, and data are available at https://arks-codegen.github.io.' A URL is provided." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states code and data are available at the project page. They also compile a new benchmark EVOR-BENCH with four datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section listing library versions is provided in the paper." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are included in the paper itself. The project page is referenced but the paper does not contain a README or reproducing results section." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Tables 2-4 and figures are reported as point estimates (e.g., '37.9') with no confidence intervals, error bars, or ± notation." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims EVOR 'significantly' outperforms baselines but provides no statistical significance tests (no p-values, t-tests, etc.)." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports absolute percentage improvements with baseline context, e.g., 'EVOR outperforms DocPrompting by 18.6% on average using CodeLlama' and provides full baseline numbers in Table 2." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Dataset sizes are reported (142, 45, 107, 113 problems) but no justification is given for why these sizes are sufficient for the claims being made." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance across runs, or spread measures are reported. Results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Five baselines are compared: Vanilla, MPSC, ExeDec, Reflexion, and DocPrompting (Section 3.1, Table 2)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent methods: Reflexion (2024), DocPrompting (2023), MPSC (2023), ExeDec (2023). These are contemporary and relevant." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 4.1 (Table 3) ablates query evolution vs. knowledge evolution vs. both. Section 4.2 (Table 4) ablates knowledge source types." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "The paper uses only execution accuracy (pass@1) as the metric: 'By default, we use the execution accuracy (pass@1) as the metric throughout the paper.' Section 4.4 also uses pass@t but this is the same metric at different token budgets." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. All evaluation is automated via execution accuracy. Human evaluation could assess code quality beyond pass/fail." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "EVOR-BENCH is a newly compiled benchmark with manually written ground truth solutions. The benchmark problems are separate from any development data." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 2 provides per-dataset breakdowns across all four datasets (Scipy-M, Tensorflow-M, Ring, Pony) rather than just averages." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No error analysis or qualitative discussion of failure cases is presented. The paper does not show where EVOR fails or discuss specific failure modes." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that web search 'only marginally improves the results' (Section 3.2) and achieves less than 1% improvement when used alone without query evolution (Section 4.2, Table 4)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims '2 to 4 times execution accuracy' which is supported by Table 2 (e.g., EVOR 35.3% vs Reflexion 13.9% with ChatGPT). Claims about flexibility and combination with other methods are supported in Section 4.3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about component contributions are supported by controlled ablation studies in Sections 4.1 and 4.2, where individual components are systematically added/removed." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title 'Evolving Retrieval for Code Generation' is broad, but the evaluation is limited to 4 specific datasets (2 modified Python libraries, 2 long-tail languages) and 2 models. The paper does not explicitly bound its generalization claims." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the results are discussed. For example, whether the gains come primarily from more LLM calls/tokens rather than the retrieval evolution is not adequately addressed beyond Section 4.4." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions are stated: 'gpt-3.5-turbo-1106' (footnote 5), 'CodeLlama-34b-Instruct-hf' (footnote 6), 'GPT-4-1106' and 'Claude-3-opus' for SWE-bench experiments." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes prompting approaches but does not provide full prompt text. Query evolution and code generation prompts are described in natural language without the actual text used." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Maximum iterations set to 30, termination condition (same feedback 3 consecutive iterations), maximum context length of 4096, INSTRUCTOR-xl as retrieval model (Section 3.2). However, temperature/sampling settings are not explicitly stated." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Algorithm 1 provides a detailed formal description of the EVOR pipeline including query evolution, retrieval, generation, execution feedback, and knowledge base evolution steps." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 2.4 describes dataset curation: modified Python libraries to simulate updates, adapted DS-1000 problems, selected Ring and Pony for long-tail languages, manual ground truth annotation. More details in Appendix A." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 'Limitations' discusses iterative process leading to longer latency and increased energy consumption." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations section is generic, mentioning only latency and energy consumption concerns. No specific threats to validity for the experimental results are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show or what settings/populations are excluded from the claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The project page (https://arks-codegen.github.io) claims to make model, code, and data available." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 2.4 describes how datasets were compiled: modifying Scipy/Tensorflow libraries, adapting DS-1000 problems, selecting LeetCode problems for Ring/Pony, manual ground truth writing." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data is from standard benchmarks and programmatically constructed datasets." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Table 1 provides dataset statistics. The curation process is described in Section 2.4 with further details in Appendix A, including how problems were adapted and solutions annotated." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section mentioning grants or sponsors is visible in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: University of Hong Kong, Fudan University, Sea AI Lab." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed, so independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "Training cutoff dates for gpt-3.5-turbo-1106 and CodeLlama are not explicitly stated, though the paper addresses contamination concerns through benchmark design." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 2.4 explicitly addresses this: 'We do not use a real library update version because it is potentially exposed to LLM training data.' The benchmark is designed to avoid overlap by modifying libraries and using long-tail languages excluded from StarCoder training." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The entire benchmark design (EVOR-BENCH) is motivated by avoiding contamination: modified libraries simulate unseen updates, and Ring/Pony are chosen because they 'have little public data and are excluded from the StarCoder training set' (Section 2.4)." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 4.4 analyzes token consumption at different budgets (4k-24k tokens) and compares EVOR's efficiency against DocPrompting. Figure 3 shows pass@t at different token levels." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total API spend, GPU hours, or total computational budget is reported. The token analysis in Section 4.4 shows per-example token budgets but not total experiment cost." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "EVOR achieves 2-4x execution accuracy compared to existing methods like Reflexion and DocPrompting", 286 "evidence": "Table 2: EVOR achieves 35.3% avg with ChatGPT vs Reflexion 13.9% (~2.5x) and DocPrompting 19.2% (~1.8x). With CodeLlama, EVOR 32.2% vs DocPrompting 16.0% (2x).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Synchronous evolution of both queries and knowledge is consistently better than evolving either alone", 291 "evidence": "Table 3 (Section 4.1): EVOR (evolve both) achieves 35.3% vs evolve query only 28.4% vs evolve knowledge only 23.8% with ChatGPT. Similar pattern with CodeLlama.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "EVOR can be combined with existing methods (MPSC, ExeDec, Reflexion) for further improvement", 296 "evidence": "Table 2: EVOR + Reflexion achieves 37.9% vs EVOR alone 35.3% with ChatGPT, up to 2.6% additional gain on average.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "EVOR is a more effective approach to using tokens, achieving superior results at all token budgets", 301 "evidence": "Figure 3 (Section 4.4): EVOR achieves higher pass@t than DocPrompting at all token consumption levels from 4k to 24k for both models.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Diverse knowledge sources enhance RACG performance, with larger improvements under evolution", 306 "evidence": "Table 4 (Section 4.2): CodeLlama with Exec+Code+Doc achieves 32.2% with evolution vs 20.4% without. Adding documentation to Exec+Code improves by 6.9% with evolution but only 4.5% without.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "EVOR, a retrieval-augmented code generation pipeline that synchronously evolves both queries and knowledge bases, achieves 2-4x execution accuracy over existing methods on a new benchmark (EVOR-BENCH) covering updated libraries and long-tail programming languages. Ablation studies show that evolving both queries and knowledge is consistently superior to evolving either alone. The approach is composable with existing methods like Reflexion and SWE-agent for additional gains. Diverse knowledge sources (documentation, code snippets, execution feedback) provide complementary benefits that are amplified by the evolution mechanism.", 312 "red_flags": [ 313 { 314 "flag": "No variance or uncertainty quantification", 315 "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals. Given that LLM outputs are stochastic, results could vary across runs." 316 }, 317 { 318 "flag": "No significance tests despite repeated claims of 'significant' improvement", 319 "detail": "The paper uses the word 'significantly' multiple times to describe improvements but provides no statistical significance tests." 320 }, 321 { 322 "flag": "Small dataset sizes for some benchmarks", 323 "detail": "Tensorflow-M has only 45 problems. Performance differences of a few percentage points on such small datasets may not be meaningful." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 329 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 330 "year": 2023, 331 "arxiv_id": "2310.06770", 332 "relevance": "Key benchmark for evaluating LLM agents on real-world software engineering tasks, used in EVOR's repo-level evaluation." 333 }, 334 { 335 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 336 "authors": ["Noah Shinn"], 337 "year": 2024, 338 "relevance": "Iterative LLM self-improvement framework used as baseline; represents agent-based code generation approaches." 339 }, 340 { 341 "title": "Evaluating Large Language Models Trained on Code", 342 "authors": ["Mark Chen"], 343 "year": 2021, 344 "arxiv_id": "2107.03374", 345 "relevance": "Introduced HumanEval benchmark for code generation, foundational for LLM code evaluation." 346 }, 347 { 348 "title": "DocPrompting: Generating Code by Retrieving the Docs", 349 "authors": ["Shuyan Zhou"], 350 "year": 2022, 351 "relevance": "Retrieval-augmented code generation using documentation; key baseline and predecessor to EVOR." 352 }, 353 { 354 "title": "Teaching Large Language Models to Self-Debug", 355 "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"], 356 "year": 2023, 357 "arxiv_id": "2304.05128", 358 "relevance": "Uses execution feedback to refine LLM code generation, related approach to EVOR's execution feedback component." 359 }, 360 { 361 "title": "StarCoder: May the Source Be with You!", 362 "authors": ["Raymond Li"], 363 "year": 2023, 364 "arxiv_id": "2305.06161", 365 "relevance": "Open-source code LLM; its training set exclusions informed EVOR-BENCH's choice of long-tail languages." 366 }, 367 { 368 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 369 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 370 "year": 2023, 371 "arxiv_id": "2305.01210", 372 "relevance": "Rigorous evaluation methodology for LLM code generation quality." 373 }, 374 { 375 "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation", 376 "authors": ["Yuhang Lai"], 377 "year": 2023, 378 "relevance": "Source benchmark from which EVOR-BENCH's Scipy and Tensorflow problems were adapted." 379 }, 380 { 381 "title": "Active Retrieval Augmented Generation", 382 "authors": ["Zhengbao Jiang"], 383 "year": 2023, 384 "relevance": "Active RAG approach that iteratively decides when to retrieve; related to EVOR's evolving retrieval paradigm." 385 } 386 ] 387 }