scan-v5.json (21040B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Evaluation of LLMs on Syntax-Aware Code Fill-in-the-Middle Tasks", 6 "authors": [ 7 "Linyuan Gong", 8 "Sida Wang", 9 "Mostafa Elhoushi", 10 "Alvin Cheung" 11 ], 12 "year": 2024, 13 "venue": "International Conference on Machine Learning", 14 "arxiv_id": "2403.04814", 15 "doi": "10.48550/arXiv.2403.04814" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All abstract claims are substantiated: 17,720 examples confirmed in Table 5, 15 LLMs evaluated in Table 4, FIM pretraining benefits discussed in Section 6.1, and pretraining-vs-size finding supported in Section 6.3.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper makes causal claims ('FIM pretraining enhances L2R performance') but explicitly acknowledges in Section 1 that 'these comparisons across different model families are not controlled experiments and could be influenced by differences in pretraining environments,' making causal inference inadequate.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper consistently bounds claims to code FIM tasks in the tested languages and explicitly states that cross-model-family comparisons should be 'interpreted with caution' due to uncontrolled training environments.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper acknowledges that differences in pretraining data and methods confound comparisons, but does not systematically explore alternative explanations for observed performance gaps (e.g., training data volume, instruction tuning, tokenizer differences).", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "Pass@1 on unit tests is the primary metric and is directly tied to functional correctness of code completions; the paper does not conflate this metric with broader claims like productivity or developer efficiency.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": false, 54 "justification": "There is no dedicated limitations section; a single sentence in the conclusion acknowledges the non-controlled experiment limitation, which does not meet the threshold for a dedicated section.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Appendix A.9 provides a specific empirical contamination analysis with a new held-out test set (April 2023–January 2024), and the conclusion explicitly names the non-controlled cross-family comparison as a specific threat.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly states the key limitation that conclusions cannot be extended to causal claims about pretraining paradigms without controlled experiments, and scopes findings to the FIM task in the four tested languages.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "The acknowledgments explicitly disclose funding: 'gift from Meta, the U.S. National Science Foundation through grants IIS-1955488, IIS-2027575, ARO W911NF2110339, ONR N00014-21-1-2724, and DOE awards DE-SC0016260, DE-SC0021982.'", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are clearly stated on the first page: Linyuan Gong and Alvin Cheung at UC Berkeley, Sida Wang and Mostafa Elhoushi at AI at Meta.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "Two authors (Wang, Elhoushi) are employed at Meta and the work received a Meta gift; Meta's model InCoder is evaluated in the benchmark, creating a potential non-independence between funder and evaluated product.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is included beyond the funding disclosure.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Fill-in-the-Middle (FIM), syntax-aware completion, and the three task splits (algorithmic block, control-flow, API function call) are all defined precisely with reference to AST structure.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper clearly states it contributes a new benchmark (SAFIM), a syntax-aware truncation algorithm, a five-prompt evaluation framework, and an evaluation toolkit with leaderboard.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 provides substantive engagement with prior benchmarks (HumanEval, HumanEval-Infilling, APPS, SWE-Bench) and prior FIM training work, explicitly identifying gaps that SAFIM addresses.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "benchmark-creation": { 119 "construct_design": { 120 "construct_validity_argued": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper argues explicitly that AST-based syntax-aware masking measures more realistic code completion capability than random line masking (HumanEval-Infilling), and ties each of the three splits to distinct competencies (algorithm design, control-flow understanding, API knowledge).", 124 "source": "haiku" 125 }, 126 "difficulty_distribution_characterized": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper provides input length distributions (Figure 4, Table 5/6) but does not characterize or report difficulty tiers; no easy/medium/hard categorization or difficulty measurement is provided.", 130 "source": "haiku" 131 }, 132 "ceiling_floor_effects_checked": { 133 "applies": true, 134 "answer": false, 135 "justification": "The benchmark does discriminate well in practice (scores range from ~22% to ~69%), but the paper never explicitly checks for or discusses ceiling/floor effects as a design consideration.", 136 "source": "haiku" 137 }, 138 "human_baseline_included": { 139 "applies": true, 140 "answer": false, 141 "justification": "No human performance baseline is reported; the API function call split only notes examples are 'solvable by humans' without providing human pass rates.", 142 "source": "haiku" 143 }, 144 "scoring_rubric_justified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Pass@1 is justified by the large dataset size (17,720 examples) enabling robust evaluation without multiple samples, and execution-based evaluation is preferred over match-based metrics with syntactic matching used only where execution is infeasible.", 148 "source": "haiku" 149 } 150 }, 151 "robustness": { 152 "contamination_resistance_designed": { 153 "applies": true, 154 "answer": true, 155 "justification": "Code sourced from April 1, 2022–January 1, 2023 to avoid overlap with The Stack (cutoff March 31, 2022) and GPT-3.5/4 training data (cutoff September 2021), with explicit contamination analysis in Appendix A.9.", 156 "source": "haiku" 157 }, 158 "temporal_robustness_discussed": { 159 "applies": true, 160 "answer": false, 161 "justification": "The paper does not discuss plans for updating the benchmark as models advance or how it will remain useful as newer models' training data encompasses the April 2022–January 2023 source window.", 162 "source": "haiku" 163 }, 164 "failure_modes_discussed": { 165 "applies": true, 166 "answer": false, 167 "justification": "The paper briefly notes that execution is infeasible for API calls with external dependencies (leading to syntactic matching), but does not systematically enumerate what the benchmark cannot measure or how it could be gamed.", 168 "source": "haiku" 169 }, 170 "baseline_implementations_provided": { 171 "applies": true, 172 "answer": true, 173 "justification": "Evaluation toolkit and dataset available at GitHub, exact model identifiers for all 23 evaluated models provided in Table 7 (Appendix A.3), enabling full reproduction of reported numbers.", 174 "source": "haiku" 175 } 176 }, 177 "documentation": { 178 "dataset_documentation_complete": { 179 "applies": true, 180 "answer": true, 181 "justification": "Dataset statistics are provided in Tables 5 and 6 (per-split and per-language breakdowns), API libraries listed in Appendix A.1, and collection methodology (filtering, deduplication, unit test validation) described in Section 3.1.", 182 "source": "haiku" 183 }, 184 "licensing_and_access_clear": { 185 "applies": true, 186 "answer": false, 187 "justification": "The benchmark is publicly available on GitHub and a leaderboard is hosted, but no license terms for the dataset or evaluation toolkit are stated in the paper.", 188 "source": "haiku" 189 }, 190 "intended_use_specified": { 191 "applies": true, 192 "answer": false, 193 "justification": "The paper describes what SAFIM measures but does not explicitly state what should NOT be concluded from benchmark results (e.g., it cannot distinguish model size effects from training data effects in cross-family comparisons).", 194 "source": "haiku" 195 } 196 } 197 } 198 }, 199 "claims": [ 200 { 201 "claim": "FIM pretraining enhances Left-to-Right (L2R) generation performance, not just FIM task performance.", 202 "evidence": "StarCoder (FIM-pretrained) outperforms CodeGen-16B (L2R-only, similar size) in L2R mode; CodeLLaMa-13B (FIM+L2R) outperforms larger CodeLLaMa-34B (L2R-only) in FIM evaluation (Table 2).", 203 "supported": "moderate" 204 }, 205 { 206 "claim": "Pretraining method and data quality are more important than model size for code FIM tasks.", 207 "evidence": "StarCoder (15.5B) achieves 55.5% avg Pass@1 comparable to GPT-4's 53.3%; DeepSeekCoder-1.3B (52.6%) matches CodeLLaMa-34B (49.7%) (Table 4).", 208 "supported": "moderate" 209 }, 210 { 211 "claim": "Syntax-aware truncation substantially improves Pass@1 and reduces compilation errors, especially for non-FIM models.", 212 "evidence": "CodeLLaMa-13B jumps from 16.4% to 41.4% Pass@1 with truncation; CodeGen-16B from 0.0% to 25.9% (Table 3).", 213 "supported": "strong" 214 }, 215 { 216 "claim": "Narrow prompt selection leads to skewed model comparisons, and comprehensive prompt coverage is necessary for fair evaluation.", 217 "evidence": "CodeGen-16B achieves 25.9% with SPM vs 15.2% with IPF, reversing the apparent ranking with InCoder-6B (25.2% PSM) reported by Fried et al. (Table 2, Section 6.1).", 218 "supported": "strong" 219 }, 220 { 221 "claim": "SAFIM data contamination with CodeLLaMa and DeepSeekCoder training data has negligible impact on evaluation results.", 222 "evidence": "Held-out test set from April 2023–January 2024 shows no significant performance decrease for either model compared to the original dataset (Table 17, Appendix A.9).", 223 "supported": "moderate" 224 }, 225 { 226 "claim": "Repository-level pretraining context improves API function call completion performance.", 227 "evidence": "StarCoder and DeepSeekCoder, which incorporate repository-level context in pretraining, excel specifically on the API function call split (Table 4, Section 6.3).", 228 "supported": "weak" 229 } 230 ], 231 "methodology_tags": [ 232 "benchmark-eval", 233 "benchmark-creation" 234 ], 235 "key_findings": "SAFIM is a 17,720-example multilingual code fill-in-the-middle benchmark using AST-based syntax-aware task construction across three splits (algorithmic block, control-flow, API function call), with a temporal cutoff to minimize contamination. Evaluation of 23 LLMs shows that FIM pretraining enhances both FIM and L2R generation performance, and that pretraining methodology and data quality predict performance better than raw model size (StarCoder 15.5B ≈ GPT-4). Syntax-aware truncation post-processing substantially improves Pass@1 for non-FIM models and is necessary for fair cross-model comparison. DeepSeekCoder-33B achieves the highest performance (69.0% avg Pass@1).", 236 "red_flags": [ 237 { 238 "flag": "Non-controlled causal claims", 239 "detail": "Claims that FIM pretraining 'enhances' L2R performance are drawn from observational comparisons across model families with different training data, architectures, and compute budgets — not controlled experiments. The paper acknowledges this but still presents the finding as a primary result." 240 }, 241 { 242 "flag": "Meta funder/evaluator conflict", 243 "detail": "Two authors are employed at Meta (AI at Meta) and the work received a Meta gift grant. Meta's InCoder model is one of the evaluated systems, creating a potential conflict between funder independence and evaluation objectivity." 244 }, 245 { 246 "flag": "No human baseline", 247 "detail": "The benchmark includes no human performance numbers. It is impossible to assess difficulty calibration or whether any model approaches human-level performance without this reference point." 248 }, 249 { 250 "flag": "API split too small", 251 "detail": "The API function call completion split contains only 310 examples sourced from GitHub, compared to 8,781 and 8,629 for the other splits. This small size and reliance on syntactic rather than execution-based evaluation weakens the reliability of conclusions about that split." 252 }, 253 { 254 "flag": "No difficulty characterization", 255 "detail": "The benchmark provides no difficulty tiers or difficulty measurement; whether models are solving easy or hard examples at each performance level cannot be determined from reported metrics." 256 } 257 ], 258 "cited_papers": [ 259 { 260 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 261 "relevance": "Primary baseline benchmark SAFIM is designed to supersede; establishes the standalone function generation paradigm and is used as a comparison point throughout." 262 }, 263 { 264 "title": "Efficient Training of Language Models to Fill in the Middle", 265 "relevance": "Introduces HumanEval-Infilling, the FIM benchmark SAFIM most directly extends, and establishes the PSM/SPM prompt paradigm evaluated in this paper." 266 }, 267 { 268 "title": "Code Llama: Open Foundation Models for Code", 269 "relevance": "One of the primary evaluated models and a key comparison point for FIM pretraining benefits; CodeLLaMa's evaluation on HumanEval-Infilling is used as a motivating example for SAFIM's improvements." 270 }, 271 { 272 "title": "StarCoder: May the Source Be with You", 273 "relevance": "High-performing evaluated model demonstrating that repository-level pretraining context improves API completion; used as evidence that pretraining data quality > model size." 274 }, 275 { 276 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 277 "relevance": "Top-performing model on SAFIM; evidence for pretraining methodology claims." 278 }, 279 { 280 "title": "InCoder: A Generative Model for Code Infilling and Synthesis", 281 "relevance": "Establishes HumanEval-Infilling benchmark and original FIM evaluation methodology that SAFIM argues is flawed due to prompt selection bias." 282 }, 283 { 284 "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples", 285 "relevance": "Motivates SAFIM's contamination resistance design via temporal cutoff." 286 }, 287 { 288 "title": "The Stack: 3 TB of Permissively Licensed Source Code", 289 "relevance": "Major pretraining dataset with March 2022 cutoff; SAFIM's post-April 2022 source selection is explicitly designed to avoid this corpus." 290 } 291 ], 292 "engagement_factors": { 293 "practical_relevance": { 294 "score": 3, 295 "justification": "Immediately usable benchmark with GitHub toolkit, live leaderboard, and exact reproduction instructions for 23 models — directly useful for anyone evaluating code LLMs on FIM tasks." 296 }, 297 "surprise_contrarian": { 298 "score": 2, 299 "justification": "Challenges the common belief that larger models automatically outperform smaller ones on coding tasks, with concrete examples like StarCoder 15.5B ≈ GPT-4." 300 }, 301 "fear_safety": { 302 "score": 1, 303 "justification": "The impact statement briefly raises concerns about improved code generation enabling malicious software development, but this is boilerplate and not a primary finding." 304 }, 305 "drama_conflict": { 306 "score": 1, 307 "justification": "Directly challenges prior evaluation methodology by Fried et al. (InCoder paper) with a concrete example showing their prompt comparison was biased, but framed constructively rather than contentiously." 308 }, 309 "demo_ability": { 310 "score": 3, 311 "justification": "Live leaderboard at safimbenchmark.com, evaluation toolkit on GitHub, and all model identifiers provided — anyone can reproduce or submit new models immediately." 312 }, 313 "brand_recognition": { 314 "score": 2, 315 "justification": "UC Berkeley and Meta authors, GPT-4 and GPT-3.5 evaluation included, and ICML venue — recognizable names but not a flagship lab paper." 316 } 317 }, 318 "hn_data": { 319 "threads": [ 320 { 321 "hn_id": "40881654", 322 "title": "LLM Agents can Autonomously Exploit One-day Vulnerabili-ties [pdf]", 323 "points": 4, 324 "comments": 1, 325 "url": "https://news.ycombinator.com/item?id=40881654" 326 }, 327 { 328 "hn_id": "40138889", 329 "title": "LLM Agents Can Autonomously Exploit One-Day Vulnerabilities", 330 "points": 4, 331 "comments": 1, 332 "url": "https://news.ycombinator.com/item?id=40138889" 333 }, 334 { 335 "hn_id": "40633364", 336 "title": "LLM Agents Can Autonomously Exploit One-Day Vulnerabilities", 337 "points": 3, 338 "comments": 1, 339 "url": "https://news.ycombinator.com/item?id=40633364" 340 }, 341 { 342 "hn_id": "41128425", 343 "title": "Things Come from Having Many Good Models", 344 "points": 2, 345 "comments": 0, 346 "url": "https://news.ycombinator.com/item?id=41128425" 347 }, 348 { 349 "hn_id": "40756286", 350 "title": "Solving Maxwell's Equations with Non-Trainable Graph Neural Network", 351 "points": 2, 352 "comments": 0, 353 "url": "https://news.ycombinator.com/item?id=40756286" 354 }, 355 { 356 "hn_id": "40679472", 357 "title": "Discovering Optimization Algorithms With And For Large Language Models", 358 "points": 2, 359 "comments": 0, 360 "url": "https://news.ycombinator.com/item?id=40679472" 361 }, 362 { 363 "hn_id": "40666270", 364 "title": "Discovering Preference Optimization Algorithms with Large Language Models", 365 "points": 2, 366 "comments": 0, 367 "url": "https://news.ycombinator.com/item?id=40666270" 368 }, 369 { 370 "hn_id": "40085930", 371 "title": "LLM Agents Can Autonomously Exploit One-Day Vulnerabilities with 87% Success", 372 "points": 2, 373 "comments": 0, 374 "url": "https://news.ycombinator.com/item?id=40085930" 375 }, 376 { 377 "hn_id": "39765229", 378 "title": "Quantifying Contamination in Code Generation Capabilities of Language Models", 379 "points": 1, 380 "comments": 0, 381 "url": "https://news.ycombinator.com/item?id=39765229" 382 }, 383 { 384 "hn_id": "39737870", 385 "title": "LSTM-Based Machine Learning for Enhancing Storm Surge Forecasting Accuracy", 386 "points": 1, 387 "comments": 0, 388 "url": "https://news.ycombinator.com/item?id=39737870" 389 } 390 ], 391 "top_points": 4, 392 "total_points": 23, 393 "total_comments": 3 394 } 395 }