scan.json (34735B)
1 { 2 "paper": { 3 "title": "Evaluation of LLMs on Syntax-Aware Code Fill-in-the-Middle Tasks", 4 "authors": [ 5 "Linyuan Gong", 6 "Sida Wang", 7 "Mostafa Elhoushi", 8 "Alvin Cheung" 9 ], 10 "year": 2024, 11 "venue": "International Conference on Machine Learning (ICML)", 12 "arxiv_id": "2403.04814", 13 "doi": "10.48550/arXiv.2403.04814" 14 }, 15 "scan_version": 3, 16 "active_modules": [ 17 "experimental_rigor", 18 "data_leakage" 19 ], 20 "methodology_tags": [ 21 "benchmark-eval" 22 ], 23 "key_findings": "SAFIM, a 17,720-example syntax-aware fill-in-the-middle benchmark across four languages, reveals that FIM pretraining enhances both FIM and left-to-right inference, and that pretraining methods and data quality matter more than model size. Syntax-aware truncation in post-processing makes comparisons across model types substantially fairer, boosting Pass@1 for the FIM-trained CodeLLaMa-13B from 16.4% to 41.4% and for the L2R-only CodeGen-16B from 0.0% to 25.9%. DeepSeekCoder-33B leads at 69.0% average Pass@1, while 1.3B DeepSeekCoder (52.6%) outperforms GPT-3.5 (40.9%). Data contamination analysis on a temporally separated test set shows negligible impact on results.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The abstract states 'The evaluation toolkit and dataset are available at https://github.com/gonglinyuan/safim, and the leaderboard is available at https://safimbenchmark.com.' A concrete repository URL is provided." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The dataset is released alongside the evaluation toolkit at the GitHub URL. The benchmark contains 17,720 examples from Codeforces and GitHub, publicly available." 
35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions using 'the Huggingface transformers library' and 'OpenAI API' but does not provide library versions, requirements.txt, Dockerfile, or a detailed environment setup section. Specific model IDs are listed in Table 7 but framework/dependency versions are absent from the paper." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "While the paper provides detailed methodology descriptions and releases code/data, there are no step-by-step reproduction instructions in the paper itself (no 'Reproducing Results' section or commands to run)." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All tables (Tables 2-4, 8-17) report only point estimates of Pass@1 percentages with no confidence intervals or error bars. The paper justifies this by claiming the dataset's large size (17,720 examples) makes uncertainty quantification unnecessary." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are reported anywhere. Claims like 'StarCoder outperforms CodeGen-16B' and 'FIM pretraining enhances L2R performance' are based on comparing raw Pass@1 numbers without any statistical tests." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper consistently reports absolute scores with baseline context, e.g., 'the weakest CodeLLaMa model surpasses the strongest CodeGen model by 14 points' and 'StarCoder, with 15.5B parameters, achieves an average Pass@1 of 55.5%, comparable to GPT-4's 53.3%.' Per-model scores are provided for all comparisons." 
62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": true, 66 "justification": "Section 3.3 explicitly justifies the dataset size: 'Our large dataset size of 17,720 examples enables robust evaluations without the need for multiple generations and averaging, as seen in smaller datasets like HumanEval (164 programs).'" 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "The paper explicitly generates only one completion per model per example ('we only generate one completion for each LLM on each example') and reports no variance across runs, seeds, or sampling stochasticity." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper compares 15+ models spanning multiple families (CodeGen, InCoder, CodeLLaMa, StarCoder, DeepSeekCoder, GPT-3.5, GPT-4) as baselines against each other. Tables 4 and 15 present comprehensive comparisons." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines include contemporary models for the submission period: DeepSeekCoder (2024), GPT-4 (2023), CodeLLaMa (2023), StarCoder (2023), plus additional models like Mixtral, WizardCoder, and Magicoder in the appendix." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 3 (and Tables 11-13) provides an ablation study on the syntax-aware truncation algorithm, comparing performance with and without truncation. Tables 2/8-10 compare five prompt designs per model. Within-family size comparisons serve as implicit ablations." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper reports Pass@1 as the primary metric and CErr% (compilation/syntax error percentage) as a secondary metric in Tables 3, 11-13. Results are also broken down by task category and language." 
94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "Evaluation is entirely automated: execution-based testing (98.25% of examples) and syntactical match evaluation for API function calls. No human evaluation of outputs is performed." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The benchmark is constructed from code created between April 2022 and January 2023, deliberately chosen to be after most models' training data cutoffs. Appendix A.9 further creates a separate test set from April 2023 to January 2024 to test contamination effects." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 4 breaks down results by three task categories (algorithmic block, control-flow, API function call). Table 16 (Appendix A.7) provides per-programming-language breakdowns across C++, Java, Python, and C#." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix A.8 provides a detailed case study comparing InCoder-6B's successful completion and CodeGen-16B's failure on the same task. Compilation error rates are analyzed in Tables 3/11-13. The paper discusses models producing 'extraneous content' and 'infinite output generation.'" 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Several negative findings are reported: finetuning on distilled data hurts FIM performance compared to base models (Appendix A.6), CodeLLaMa-34B underperforms the smaller 13B variant due to lack of FIM (Section 6.1), IPF prompt causes placeholder token output errors (Section 4.1), Phi models underperform relative to their HumanEval scores (Appendix A.6)." 
119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims are supported: FIM pretraining enhancing FIM and L2R inference is shown in Table 2 (Section 6.1), pretraining methods mattering more than model size is demonstrated in Table 4/Figure 3 (Section 6.3), SAFIM providing fair comparisons is evidenced by the truncation ablation (Table 3) and prompt comparison (Table 2)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": false, 130 "justification": "The abstract claims 'FIM pretraining not only enhances FIM proficiency but also improves Left-to-Right (L2R) inference' — a causal claim based on observational comparison across model families with different architectures, data, and training procedures. While the paper acknowledges in Section 7 that 'our conclusions are drawn from comparisons across various model families trained with different paradigms, rather than from controlled experiments,' the abstract presents these causal claims without caveats." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "The title scopes to 'Syntax-Aware Code Fill-in-the-Middle Tasks.' Claims use hedging language ('our findings suggest'). The paper acknowledges cross-family comparisons are not controlled experiments (Section 7) and that results may vary by programming language (Appendix A.7). Scope is mostly bounded to SAFIM's three task types and four languages." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper discusses multiple alternative explanations: data contamination (Appendix A.9 with a dedicated experiment), differences in pretraining environments confounding cross-family comparisons (Section 1, Section 7), programming language coding style affecting results (Appendix A.7), and training data cutoff date overlap (Table 1)." 
141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures Pass@1 on execution-based tests and syntactical matching, and frames results in terms of 'code completion proficiency' and 'FIM performance.' The measurements match the granularity of claims — no broader framing of 'developer productivity' or similar proxy gaps exists." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table 7 (Appendix A.3) provides exact model identifiers for every model: 'gpt-3.5-turbo-0301', 'gpt-4-1106-preview', 'Salesforce/codegen-350M-multi', 'codellama/CodeLlama-7b-hf', 'deepseek-ai/deepseek-coder-33b-base', etc." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Figure 2 provides detailed prompt structures with actual examples for all five prompt types (L2R, PSM, SPM, IPF, 1S), including sentinel token mappings for each model family. The prompt designs are described precisely enough to reconstruct." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 5 states: 'we use top-p random sampling with p = 0.95 and a temperature of 0.2.' The decoding strategy follows 'established practices in Fried et al. (2023).'" 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The paper evaluates raw model inference (single-turn generation) on code completion tasks." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3 provides detailed preprocessing: Codeforces solutions are filtered by unit test execution within 50% time limit, length (under 2x shortest solution), deduplication (CodeBLEU > 0.9 threshold). GitHub files are filtered by API library usage, star count (>10), and presence of comments. 
Section 3.2 describes AST-based task generation and validation filtering for each task category." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 7 contains substantive limitations discussion: 'We acknowledge a key limitation in our study: our conclusions are drawn from comparisons across various model families trained with different paradigms, rather than from controlled experiments altering pretraining paradigms within the same model.' An Impact Statement section also discusses broader concerns." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper discusses specific threats: data contamination from training data overlap (Table 1, Appendix A.9 with a dedicated experiment), non-controlled cross-family comparisons (Section 7), programming language coding style affecting results (Appendix A.7), and prompt sensitivity affecting model rankings (Section 6.1)." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 7 explicitly states what the results do NOT show: 'our conclusions are drawn from comparisons across various model families trained with different paradigms, rather than from controlled experiments' and proposes 'future work in pretraining such models under the same environment to validate these observations further.' The paper also scopes to decoder-only models, excluding encoder-decoder models (Section 2)." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The full dataset and evaluation toolkit are publicly available at https://github.com/gonglinyuan/safim. Individual examples, unit tests, and evaluation results can be independently verified." 
197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3.1 describes collection from Codeforces (problems, unit tests, solutions) and GitHub (git commits from GH Archive), with time period (April 1, 2022 to January 1, 2023), inclusion criteria (GitHub repos with 10+ stars, files with natural language comments), and four programming languages." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. Data is sourced from public code repositories (Codeforces and GitHub) via standard scraping." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The pipeline is documented in detail: Sections 3.1-3.2 describe source corpus collection → unit test re-evaluation → length and duplicate filtering → AST parsing → task generation → validation filtering. Numbers are provided at key stages: 490 coding questions, 8,590 unique code solutions (Codeforces), 11,936 code files (GitHub), 17,720 total examples. Per-category statistics in Table 5." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "The Acknowledgements section lists: 'gift from Meta, the U.S. National Science Foundation through grants IIS-1955488, IIS-2027575, ARO W911NF2110339, ONR N00014-21-1-2724, and DOE awards DE-SC0016260, DE-SC0021982.'" 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are listed: Linyuan Gong and Alvin Cheung at University of California at Berkeley, Sida Wang and Mostafa Elhoushi at 'AI at Meta.' Two authors are at Meta, whose CodeLLaMa model is evaluated." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "Meta provided a gift fund and two authors are Meta employees. 
Meta develops CodeLLaMa, one of the evaluated models. While CodeLLaMa does not consistently win (DeepSeekCoder outperforms it), the funder has a financial interest in LLM evaluation outcomes." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper. Two Meta-affiliated authors evaluate Meta's CodeLLaMa, but no explicit conflict-of-interest disclosure is provided." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": true, 240 "justification": "Table 1 provides training data cutoff dates for all evaluated models: GPT-3.5/GPT-4 (Sept 2021), CodeGen (Oct 2021), InCoder (≤Mar 2022), CodeLLaMa (Jul 2022), StarCoder (Mar 2022), DeepSeekCoder (Feb 2023)." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Extensively discussed. Table 1 highlights models with date-range overlap in red. Section 3.1 explains the temporal cutoff strategy. Appendix A.9 provides a detailed contamination analysis with a new test set from April 2023 - January 2024 that has no overlap with any model." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "Appendix A.9 creates an entirely new dataset from a later time period (April 2023 - January 2024) with no date overlap with any model's training data. Table 17 and Figure 5 show that performance remains stable, demonstrating 'negligible impact' of potential contamination. The benchmark itself is sourced from code after April 2022 to avoid overlap with major pretraining datasets." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. It is a benchmark evaluation of LLMs on code completion tasks." 
258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The study evaluates LLMs on code completion using automated execution-based testing." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. The study uses publicly available code from Codeforces and GitHub." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. Inclusion/exclusion criteria are for code examples, not people." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants or experimental conditions involving randomized assignment." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants. Evaluation is fully automated." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants. No attrition concept applies." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No inference costs, latency, tokens consumed, or API costs are reported despite evaluating 15+ models across 17,720 examples each using both local GPU inference and OpenAI API calls." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No GPU hours, total API spend, or hardware specifications are reported. The acknowledgements thank individuals for 'providing computational resources' but do not quantify the compute budget." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Only one completion is generated per model per example using stochastic sampling (top-p 0.95, temperature 0.2). 
No multi-seed analysis is conducted. The paper argues dataset size compensates but does not test seed sensitivity." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 3.3 explicitly states: 'we only generate one completion for each LLM on each example and report the percentage of first-attempt passes, i.e., Pass@1, as our evaluation metric.'" 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "Temperature (0.2) and top-p (0.95) are fixed following Fried et al. (2023) without exploring alternatives. While all five prompt configurations are reported, the total compute budget for the prompt comparison is not stated." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "Tables 8-10 (Appendix A.4) report all prompt configurations for every model. The paper selects the best-performing prompt per model with a clear criterion (highest Pass@1 with truncation), with all alternatives transparently shown." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "Many cross-model comparisons are drawn across 15+ models on multiple task categories and prompts, but no statistical tests are performed at all, so no multiple comparison correction is applied." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors created the SAFIM benchmark and evaluate all models on it. They do not discuss the potential bias of evaluating their own benchmark construction, post-processing, and prompt design choices." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Figure 3 plots model size vs. performance, but actual compute budgets (inference cost, GPU hours) are not reported. 
Models of vastly different sizes and architectures are compared without controlling for compute." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 2 extensively discusses what existing benchmarks measure vs. real development needs and how SAFIM addresses the gap. Each task category's construct is explained (Section 3.2). Appendix A.6 shows Phi models' SAFIM results differ from HumanEval, demonstrating SAFIM captures different capabilities." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No agentic scaffolding is involved. All models are evaluated via direct inference (single-turn generation) with standardized prompts." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The benchmark uses code from April 2022 - January 2023, deliberately after most models' training cutoffs. Table 1 shows training cutoffs for each model. Appendix A.9 creates a temporally separated test set (April 2023 - January 2024) to further validate that temporal leakage is not affecting results." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "The paper does not explicitly discuss whether the FIM prompts (prefix + suffix context) could leak answer information beyond what would be available in realistic code completion scenarios. The suffix context is part of the task design but its potential to provide unintended hints is not analyzed." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "While within-dataset deduplication is performed (CodeBLEU > 0.9 threshold), the paper does not discuss whether test examples share structural similarities with training data beyond temporal separation (e.g., similar Codeforces problems, same algorithm patterns)." 
364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": true, 368 "justification": "Appendix A.9 uses a concrete temporal split method: creates a new test set from a later time period (April 2023 - January 2024) with zero date overlap with any model's training data. Table 17 compares original vs. new dataset performance, and Figure 5 shows performance stability across months, constituting a concrete leakage detection approach." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "FIM pretraining not only enhances FIM proficiency but also improves Left-to-Right (L2R) inference performance.", 375 "evidence": "Table 2 shows FIM-pretrained StarCoder (29.3% L2R) outperforms purely L2R-pretrained CodeGen-16B (24.6% L2R) on algorithmic block completion. CodeLLaMa-34B (no FIM) underperforms smaller FIM-trained CodeLLaMa-13B (38.5% vs 41.4%).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Pretraining methods and data quality have more impact than model size on code FIM performance.", 380 "evidence": "Table 4/Figure 3: StarCoder (15.5B, 55.5% avg) matches GPT-4 (>220B, 53.3% avg). DeepSeekCoder-1.3B (52.6%) outperforms GPT-3.5 (175B, 40.9%). Within CodeLLaMa family, 7.8-point size spread vs. 14-point cross-family gap (CodeLLaMa vs CodeGen).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Syntax-aware truncation significantly enhances FIM output quality and enables fair model comparison.", 385 "evidence": "Table 3: CodeLLaMa-13B Pass@1 jumps from 16.4% to 41.4% with truncation; CErr% drops from 64.6% to 10.9%. CodeGen-16B goes from 0.0% to 25.9% Pass@1. All models show improvement or stability.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Prompt selection is crucial for fair evaluation, as different models respond differently to different prompt types.", 390 "evidence": "Table 2: CodeGen-16B achieves 25.9% with SPM but only 15.2% with IPF. CodeLLaMa-13B achieves 41.4% with SPM but only 10.2% with PSM. 
DeepSeekCoder-33B achieves 60.8% with PSM but 33.8% with IPF.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Data contamination has negligible impact on SAFIM evaluation results despite date-range overlaps for some models.", 395 "evidence": "Appendix A.9, Table 17: On a new test set (April 2023 - January 2024) with no date overlap for any model, all models maintain or improve performance (e.g., DeepSeekCoder-33B: 60.8% → 61.7%). Figure 5 shows stable performance across months.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Repository-level pretraining data enhances API function call completion, while execution-based feedback improves control-flow completion.", 400 "evidence": "Table 4: StarCoder and DeepSeekCoder (with repo-level context) lead in API completion (68.1%, 75.2%). CodeLLaMa (with execution feedback) shows relatively strong control-flow performance (57.2% for 13B) compared to its API performance (59.7%).", 401 "supported": "weak" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No error bars or statistical tests", 407 "detail": "All comparisons across 15+ models are made on single-run point estimates without any uncertainty quantification, confidence intervals, or statistical significance tests. With stochastic sampling (temperature 0.2, top-p 0.95), output varies across runs, but only one completion per example is generated. The paper argues dataset size (17,720) compensates, but this does not address model-level variance." 408 }, 409 { 410 "flag": "Causal claims from observational data", 411 "detail": "The paper's headline claim that 'FIM pretraining enhances L2R inference' is drawn from comparing models that differ in architecture, training data volume, training data quality, and pretraining objectives simultaneously. The paper acknowledges this in Section 7 but presents the causal claims prominently in the abstract without caveats." 
412 }, 413 { 414 "flag": "Potential conflict of interest", 415 "detail": "Two of four authors are from Meta (AI at Meta), the paper is partially funded by a Meta gift, and Meta's CodeLLaMa is among the evaluated models. While CodeLLaMa does not consistently outperform competitors, no competing interests statement is provided." 416 }, 417 { 418 "flag": "API function call split is very small", 419 "detail": "The API function call completion split contains only 310 examples (Table 5), compared to 8,781 and 8,629 for the other two splits. This small sample size for one-third of the benchmark's task categories may not provide robust estimates, especially for per-language breakdowns." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Evaluating large language models trained on code", 425 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 426 "year": 2021, 427 "arxiv_id": "2107.03374", 428 "doi": "10.48550/arXiv.2107.03374", 429 "relevance": "Introduces HumanEval and Codex, foundational benchmark and model for code generation evaluation." 430 }, 431 { 432 "title": "InCoder: A generative model for code infilling and synthesis", 433 "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"], 434 "year": 2023, 435 "arxiv_id": "2204.05999", 436 "doi": "10.48550/arXiv.2204.05999", 437 "relevance": "Pioneered FIM as a pretraining objective for decoder-only code LLMs and established HumanEval-Infilling benchmark." 438 }, 439 { 440 "title": "Code Llama: Open foundation models for code", 441 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 442 "year": 2023, 443 "arxiv_id": "2308.12950", 444 "doi": "10.48550/arXiv.2308.12950", 445 "relevance": "Major open-source code LLM with mixed FIM+L2R pretraining, key evaluated model in this study." 
446 }, 447 { 448 "title": "StarCoder: may the source be with you!", 449 "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"], 450 "year": 2023, 451 "arxiv_id": "2305.06161", 452 "doi": "10.48550/arXiv.2305.06161", 453 "relevance": "Large open-source code LLM with FIM pretraining and repository-level context, strong performer in benchmark evaluations." 454 }, 455 { 456 "title": "DeepSeek-Coder: When the large language model meets programming – the rise of code intelligence", 457 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 458 "year": 2024, 459 "arxiv_id": "2401.14196", 460 "doi": "10.48550/arXiv.2401.14196", 461 "relevance": "Top-performing code LLM in the SAFIM benchmark, demonstrates impact of repository-level pretraining and FIM objectives." 462 }, 463 { 464 "title": "SWE-Bench: Can language models resolve real-world Github issues?", 465 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 466 "year": 2023, 467 "arxiv_id": "2310.06770", 468 "doi": "10.48550/arXiv.2310.06770", 469 "relevance": "Real-world software engineering benchmark for LLMs, cited as a contextually rich benchmark in the related work." 470 }, 471 { 472 "title": "Efficient training of language models to fill in the middle", 473 "authors": ["Mohammad Bavarian", "Heewoo Jun", "Nikolas Tezak"], 474 "year": 2022, 475 "arxiv_id": "2207.14255", 476 "doi": "10.48550/arXiv.2207.14255", 477 "relevance": "Foundational work on FIM training methodology showing 90% FIM ratio doesn't harm L2R performance, directly relevant to this paper's findings." 478 }, 479 { 480 "title": "Program synthesis with large language models", 481 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 482 "year": 2021, 483 "arxiv_id": "2108.07732", 484 "doi": "10.48550/arXiv.2108.07732", 485 "relevance": "Introduces MBPP benchmark for code generation, one of the key existing benchmarks discussed in related work." 
486 }, 487 { 488 "title": "Codegen: An open large language model for code with multi-turn program synthesis", 489 "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"], 490 "year": 2023, 491 "arxiv_id": "2203.13474", 492 "doi": "10.48550/arXiv.2203.13474", 493 "relevance": "Open L2R-pretrained code LLM used as a key baseline demonstrating limitations of pure L2R pretraining for FIM tasks." 494 }, 495 { 496 "title": "RepoBench: Benchmarking repository-level code auto-completion systems", 497 "authors": ["Tianyang Liu", "Canwen Xu", "Julian McAuley"], 498 "year": 2023, 499 "arxiv_id": "2306.03091", 500 "doi": "10.48550/arXiv.2306.03091", 501 "relevance": "Repository-level code completion benchmark, contextually related to evaluating LLMs' ability to use cross-file context." 502 }, 503 { 504 "title": "Rethinking benchmark and contamination for language models with rephrased samples", 505 "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng"], 506 "year": 2023, 507 "arxiv_id": "2311.04850", 508 "doi": "10.48550/arXiv.2311.04850", 509 "relevance": "Directly addresses benchmark contamination in LLM evaluation, a key concern that motivates SAFIM's temporal cutoff design." 510 }, 511 { 512 "title": "WizardCoder: Empowering code large language models with Evol-Instruct", 513 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 514 "year": 2023, 515 "arxiv_id": "2306.08568", 516 "doi": "10.48550/arXiv.2306.08568", 517 "relevance": "Instruction-tuned code LLM evaluated in extended experiments, showing finetuning on distilled data can hurt FIM performance." 518 } 519 ], 520 "engagement_factors": { 521 "practical_relevance": { 522 "score": 2, 523 "justification": "Provides a publicly available benchmark toolkit and dataset for evaluating code LLMs on FIM tasks, useful for model developers but not directly usable by most practitioners." 
524 }, 525 "surprise_contrarian": { 526 "score": 1, 527 "justification": "The finding that smaller models can match larger ones and FIM helps L2R is mildly surprising but aligns with a growing body of evidence on data quality over model size." 528 }, 529 "fear_safety": { 530 "score": 0, 531 "justification": "No safety or security concerns raised; the Impact Statement mentions risks of automated code generation but this is generic." 532 }, 533 "drama_conflict": { 534 "score": 0, 535 "justification": "No controversy — a straightforward benchmark paper with balanced evaluation across multiple model families." 536 }, 537 "demo_ability": { 538 "score": 2, 539 "justification": "Code, dataset, and leaderboard (safimbenchmark.com) are publicly available; researchers can evaluate their own models on SAFIM." 540 }, 541 "brand_recognition": { 542 "score": 2, 543 "justification": "Published at ICML 2024 (top venue), evaluates well-known models (GPT-4, CodeLLaMa, DeepSeekCoder), two authors from Meta AI." 544 } 545 } 546 }