scan.json (33728B)
1 { 2 "paper": { 3 "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence", 4 "authors": [ 5 "Mayank Mishra", 6 "Matt Stallone", 7 "Gaoyuan Zhang", 8 "Yikang Shen", 9 "Aditya Prasad", 10 "Adriana Meza Soria", 11 "Michele Merler", 12 "Parameswaran Selvam", 13 "Saptha Surendran", 14 "Shivdeep Singh", 15 "Manish Sethi", 16 "Xuan-Hong Dang", 17 "Pengyuan Li", 18 "Kun-Lung Wu", 19 "Syed Zawad", 20 "Andrew Coleman", 21 "Matthew White", 22 "Mark Lewis", 23 "Raju Pavuluri", 24 "Yan Koyfman", 25 "Boris Lublinsky", 26 "Maximilien de Bayser", 27 "Ibrahim Abdelaziz", 28 "Kinjal Basu", 29 "Mayank Agarwal", 30 "Yi Zhou", 31 "Chris Johnson", 32 "Aanchal Goyal", 33 "Hima Patel", 34 "Yousaf Shah", 35 "Petros Zerfos", 36 "Heiko Ludwig", 37 "Asim Munawar", 38 "Maxwell Crouse", 39 "Pavan Kapanipathi", 40 "Shweta Salaria", 41 "Bob Calio", 42 "Sophia Wen", 43 "Seetharami Seelam", 44 "Brian Belgodere", 45 "Carlos Fonseca", 46 "Amith Singhee", 47 "Nirmit Desai", 48 "David D. Cox", 49 "Ruchir Puri", 50 "Rameswar Panda" 51 ], 52 "year": 2024, 53 "venue": "arXiv", 54 "arxiv_id": "2405.04324", 55 "doi": "10.48550/arXiv.2405.04324" 56 }, 57 "scan_version": 2, 58 "active_modules": ["experimental_rigor", "data_leakage"], 59 "methodology_tags": ["benchmark-eval"], 60 "key_findings": "Granite Code models (3B-34B parameters) trained on 3.5-4.5T tokens across 116 programming languages achieve competitive or state-of-the-art performance among open-source code LLMs across diverse tasks including code generation, fixing, explanation, editing, and translation. Granite-8B-Code-Base outperforms CodeGemma-8B by ~12 points on HumanEvalPack average despite training on fewer tokens (4.5T vs 7.5T). The models show particular strength in code explanation and fixing tasks where specialized code models like StarCoder2 and CodeGemma fall behind. 
All models are released under Apache 2.0 license.", 61 "checklist": { 62 "artifacts": { 63 "code_released": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper provides a GitHub link (https://github.com/ibm-granite/granite-code-models) in the abstract and states 'We release all our Granite Code models under an Apache 2.0 license for both research and commercial use.'" 67 }, 68 "data_released": { 69 "applies": true, 70 "answer": true, 71 "justification": "All evaluation benchmarks used are publicly available (HumanEval, MBPP, MultiPL-E, DS1000, RepoBench, etc.). Training data is sourced from publicly available datasets (Github Code Clean, StarCoderdata) though the specific curated training set is not released as a standalone artifact." 72 }, 73 "environment_specified": { 74 "applies": true, 75 "answer": false, 76 "justification": "The paper describes hardware (A100/H100 clusters) and frameworks (Megatron-LM, FlashAttention 2, NVIDIA Apex) in Sections 4.3-4.4, but does not provide versioned dependency lists, requirements.txt, Dockerfile, or sufficient detail to exactly recreate the software environment." 77 }, 78 "reproduction_instructions": { 79 "applies": true, 80 "answer": false, 81 "justification": "No step-by-step reproduction instructions are provided in the paper. While a GitHub repository is linked, the paper itself does not include commands or scripts to replicate the main experiments." 82 } 83 }, 84 "statistical_methodology": { 85 "confidence_intervals_or_error_bars": { 86 "applies": true, 87 "answer": false, 88 "justification": "All benchmark results in Tables 3-16 are point estimates (e.g., '43.9% pass@1') with no confidence intervals or error bars, despite sampling-based evaluations (50 samples for MultiPL-E, 40 for DS1000, etc.)." 
89 }, 90 "significance_tests": { 91 "applies": true, 92 "answer": false, 93 "justification": "The paper makes numerous 'outperforms' claims (e.g., 'outperforms the most competitive CodeGemma-8B model by almost 12 points') based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, etc.) are reported anywhere." 94 }, 95 "effect_sizes_reported": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper consistently reports absolute differences with baseline context: 'outperforms CodeGemma-8B by almost 12 points on HumanEvalPack (33.2% vs 21.3%)', 'outperforms Llama-3-8B-Base by ~12 points on GSM8K and ~6 points on MATH', providing both values for context." 99 }, 100 "sample_size_justified": { 101 "applies": true, 102 "answer": false, 103 "justification": "No justification is given for why 50 samples per prompt (MultiPL-E), 40 samples (DS1000), 10 samples (CRUXEval), or 20 completions (CanItEdit) were chosen. These numbers follow prior work conventions but are not explicitly justified." 104 }, 105 "variance_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "Despite using multiple samples per evaluation (50 for MultiPL-E, 40 for DS1000, 10 for CRUXEval), only mean pass@1 values are reported. No standard deviations, interquartile ranges, or other spread measures are provided across runs or samples." 109 } 110 }, 111 "evaluation_design": { 112 "baselines_included": { 113 "applies": true, 114 "answer": true, 115 "justification": "Extensive baseline comparisons with StarCoder, StarCoder2, CodeLlama, CodeGemma, Mistral, Llama-3, Gemma, Mixtral, OctoCoder, and StableCode across all benchmarks (Tables 3-16)." 116 }, 117 "baselines_contemporary": { 118 "applies": true, 119 "answer": true, 120 "justification": "Baselines include models released in 2024 (Llama-3, CodeGemma, StarCoder2) which were state-of-the-art at time of writing. The paper evaluates against the most recent open-source code LLMs." 
121 }, 122 "ablation_study": { 123 "applies": true, 124 "answer": false, 125 "justification": "No ablation study is conducted. The paper evaluates models at different sizes (3B, 8B, 20B, 34B) but does not ablate training components (e.g., phase 2 training impact, FIM objective contribution, data filtering choices). The multi-size evaluation shows scaling but not component contributions." 126 }, 127 "multiple_metrics": { 128 "applies": true, 129 "answer": true, 130 "justification": "Multiple metrics are used: pass@1, exact match, edit similarity, ExcessCode, RP@1 (robustness), Code ES, ID F1, across diverse benchmarks covering generation, fixing, explanation, editing, translation, execution, math reasoning, and function calling." 131 }, 132 "human_evaluation": { 133 "applies": true, 134 "answer": false, 135 "justification": "All evaluations are fully automated using pass@1 on test suites, exact match, edit similarity, and other automated metrics. No human evaluation of code quality, usefulness, or readability is included." 136 }, 137 "held_out_test_set": { 138 "applies": true, 139 "answer": true, 140 "justification": "All evaluations use established public benchmarks (HumanEval, MBPP, MultiPL-E, DS1000, RepoBench, CrossCodeEval, etc.) which serve as held-out test sets separate from any development decisions." 141 }, 142 "per_category_breakdown": { 143 "applies": true, 144 "answer": true, 145 "justification": "Extensive per-language breakdowns across tables: 6 languages for HumanEvalPack (Table 3), 18 languages for MultiPL-E (Table 4), 7 libraries for DS1000 (Table 7), per-language results for RepoBench (Table 6), CrossCodeEval (Table 8), FIM (Table 9), CRUXEval (Table 14), and per-task results for BFCL (Figures 4-5)." 
146 }, 147 "failure_cases_discussed": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper notes where Granite falls behind competitors (e.g., 'lags behind CodeGemma-7B on all categories' for ReCode, Sec 6.7) but provides no systematic error analysis or qualitative examination of failure modes. There is no discussion of what types of problems Granite systematically fails on or why." 151 }, 152 "negative_results_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Several negative results are reported: 'no performance improvement in scaling the model sizes from 8B to 34B' for FIM (Sec 6.1.6), Granite-8B 'lags behind CodeGemma-7B on all categories' for ReCode (Sec 6.7), Granite-3B 'falls short of StarCoder2-3B' on MBPP (Sec 6.1.3), and mixed results on CrossCodeEval and RepoBench where no single model dominates." 156 } 157 }, 158 "claims_and_evidence": { 159 "abstract_claims_supported": { 160 "applies": true, 161 "answer": false, 162 "justification": "The abstract claims Granite Code models 'consistently reaches state-of-the-art performance among available open-source code LLMs.' However, results show mixed performance: on MBPP, StarCoder2-3B and CodeGemma-7B beat their Granite counterparts; on MultiPL-E, 'there is no single model that works best at every language'; on ReCode, CodeGemma-7B beats Granite-8B. 'Consistently state-of-the-art' overstates the actual results." 163 }, 164 "causal_claims_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The paper makes causal claims without adequate justification: 'We attribute this performance to our data mixture and base model training decisions' (Sec 6.2) is a causal claim about what caused good performance, but no ablation study isolates these factors. No controlled experiments demonstrate that the data mixture or training decisions specifically caused the improvements." 
168 }, 169 "generalization_bounded": { 170 "applies": true, 171 "answer": false, 172 "justification": "The title claims 'Code Intelligence' broadly and the abstract claims the models are 'optimized for enterprise software development workflows,' but no enterprise environment testing is performed. The paper tests on academic benchmarks only. Claims about enterprise suitability and general code intelligence extend well beyond the tested settings." 173 }, 174 "alternative_explanations_discussed": { 175 "applies": true, 176 "answer": false, 177 "justification": "No alternative explanations are discussed for the observed results. For instance, performance differences could be due to training data overlap with benchmarks, differences in tokenization, or data scale effects, but none of these alternatives are considered." 178 }, 179 "proxy_outcome_distinction": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper measures pass@1 on synthetic benchmarks and frames results as evidence of 'enterprise software development' capability and 'code intelligence.' The gap between benchmark performance and real-world enterprise development utility is never acknowledged or discussed." 183 } 184 }, 185 "setup_transparency": { 186 "model_versions_specified": { 187 "applies": true, 188 "answer": true, 189 "justification": "Exact model names with versions are specified for all baselines: 'Mistral-7B-v0.2', 'Mixtral-8x7B-v0.1', 'Mixtral-8x22B-v0.1', 'Llama-3-8B', etc. Own models specify exact sizes and architecture configurations in Table 1." 190 }, 191 "prompts_provided": { 192 "applies": true, 193 "answer": false, 194 "justification": "The paper states 'we adhere to the formats provided in their official examples' and use 'completion format for the base models, and instruction template for the instruction-tuned models' (Sec 6.1.1) but never provides the actual prompt text used in any experiment. The specific prompts are described only in natural language." 
195 }, 196 "hyperparameters_reported": { 197 "applies": true, 198 "answer": true, 199 "justification": "Extensive hyperparameters reported: optimizer (AdamW, β1=0.9, β2=0.95, weight decay=0.1), learning rates (3×10⁻⁴ to 3×10⁻⁵, cosine and exponential schedules), FIM α=0.5, evaluation settings (temperature 0.2, top-p 0.95 for MultiPL-E/DS1000; temperature 0.8 for pass@5), batch sizes (Table 1), context lengths." 200 }, 201 "scaffolding_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No agentic scaffolding is used. Models are evaluated directly on benchmarks with standard prompting." 205 }, 206 "data_preprocessing_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Section 2 documents the data pipeline in detail: language filtering (116 of 300+ languages), four quality filtering rules (alphabetic threshold, XML filter, HTML visible text filter, JSON/YAML character count), exact dedup (SHA256), fuzzy dedup (MinHash + LSH, Jaccard threshold 0.7), HAP/PII filtering (StarPII model), and malware scanning (ClamAV)." 210 } 211 }, 212 "limitations_and_scope": { 213 "limitations_section_present": { 214 "applies": true, 215 "answer": false, 216 "justification": "There is no dedicated limitations section. The conclusion (Section 7) briefly mentions future plans (long-context variants, specialized models) but does not discuss any limitations of the current work." 217 }, 218 "threats_to_validity_specific": { 219 "applies": true, 220 "answer": false, 221 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential issues with the evaluation methodology, data quality, or generalizability of results." 222 }, 223 "scope_boundaries_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "No explicit scope boundaries are stated. 
The paper does not discuss what the results do NOT show, what settings were not tested, or what claims the authors are NOT making." 227 } 228 }, 229 "data_integrity": { 230 "raw_data_available": { 231 "applies": true, 232 "answer": false, 233 "justification": "The curated training dataset is not released. While source datasets (Github Code Clean, StarCoderdata) are publicly available, the specific filtered, deduplicated, and processed training set used is not available for verification. Model weights are released but not the training data." 234 }, 235 "data_collection_described": { 236 "applies": true, 237 "answer": true, 238 "justification": "Section 2 describes the data collection in detail: sources (publicly available GitHub datasets), filtering criteria (language filtering, quality rules), deduplication methods (SHA256 + MinHash/LSH), and content filtering (HAP/PII/malware). Natural language data sources are also described in Section 2.4." 239 }, 240 "recruitment_methods_described": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants. Data sources are standard public code repositories and established benchmarks." 244 }, 245 "data_pipeline_documented": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 2 documents the full pipeline: crawling → language filtering (116 languages from 300+) → quality filtering (4 rules) → exact dedup (SHA256) → fuzzy dedup (MinHash, Jaccard 0.7) → HAP filtering → PII redaction → malware scanning. Section 4.1 documents the two-phase training data splits." 249 } 250 }, 251 "conflicts_of_interest": { 252 "funding_disclosed": { 253 "applies": true, 254 "answer": true, 255 "justification": "While there is no explicit 'Funding' section, all authors are identified as IBM Research employees, the acknowledgments thank IBM leadership by name, and the computing infrastructure (IBM's Vela and Blue Vela clusters) is described. The corporate funding source is effectively disclosed." 
256 }, 257 "affiliations_disclosed": { 258 "applies": true, 259 "answer": true, 260 "justification": "All authors are listed under 'IBM Research' affiliation at the top of the paper. The paper is explicitly about IBM's own Granite Code models product." 261 }, 262 "funder_independent_of_outcome": { 263 "applies": true, 264 "answer": false, 265 "justification": "IBM funded this research and has a direct commercial interest in Granite Code models performing well. The models are part of IBM's WatsonX product line (acknowledged in the paper). The funder is not independent of the outcome." 266 }, 267 "financial_interests_declared": { 268 "applies": true, 269 "answer": false, 270 "justification": "No competing interests or financial interests statement is included. Given that this is a corporate product paper from IBM, the absence of a formal declaration of financial interests (patents, equity, commercial product tie-ins) is a gap." 271 } 272 }, 273 "contamination": { 274 "training_cutoff_stated": { 275 "applies": true, 276 "answer": false, 277 "justification": "No training data cutoff date is stated. The paper describes data sources but never specifies when the training data was collected or its temporal boundaries." 278 }, 279 "train_test_overlap_discussed": { 280 "applies": true, 281 "answer": false, 282 "justification": "No discussion of potential overlap between the GitHub-sourced training data and the evaluation benchmarks (HumanEval, MBPP, etc.), which are also derived from publicly available code. This is a significant omission given that the training data includes public GitHub repositories." 283 }, 284 "benchmark_contamination_addressed": { 285 "applies": true, 286 "answer": false, 287 "justification": "HumanEval (2021), MBPP (2021), and other benchmarks were published years before this work. The training data includes public GitHub code where solutions to these benchmarks are widely available. No decontamination analysis or discussion of this risk is provided." 
288 } 289 }, 290 "human_studies": { 291 "pre_registered": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants in this study. All evaluations are automated benchmark tests." 295 }, 296 "irb_or_ethics_approval": { 297 "applies": false, 298 "answer": false, 299 "justification": "No human participants in this study." 300 }, 301 "demographics_reported": { 302 "applies": false, 303 "answer": false, 304 "justification": "No human participants in this study." 305 }, 306 "inclusion_exclusion_criteria": { 307 "applies": false, 308 "answer": false, 309 "justification": "No human participants in this study." 310 }, 311 "randomization_described": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants in this study." 315 }, 316 "blinding_described": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants in this study." 320 }, 321 "attrition_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants in this study." 325 } 326 }, 327 "cost_and_practicality": { 328 "inference_cost_reported": { 329 "applies": true, 330 "answer": false, 331 "justification": "No inference cost, latency, or per-example cost is reported for any of the four model sizes. For an enterprise-focused model, this is a notable omission." 332 }, 333 "compute_budget_stated": { 334 "applies": true, 335 "answer": true, 336 "justification": "The paper states carbon emissions of '~455 tCO2eq' for pretraining (Sec 4.4), describes hardware (IBM Vela A100 and Blue Vela H100 clusters), and notes instruction tuning takes 'a few hours on 8×A100 GPUs' for the 20B model. Total token counts are also provided (3.5T-4.5T)." 
337 } 338 }, 339 "experimental_rigor": { 340 "seed_sensitivity_reported": { 341 "applies": true, 342 "answer": false, 343 "justification": "While multiple samples are drawn per evaluation (50 for MultiPL-E, 40 for DS1000), no variance or sensitivity across random seeds is reported. Results are single-point estimates." 344 }, 345 "number_of_runs_stated": { 346 "applies": true, 347 "answer": true, 348 "justification": "Number of samples/runs is stated for most benchmarks: '50 completions per prompt' (MultiPL-E), '40 samples per each library' (DS1000), '10 samples' (CRUXEval), '20 completions per problem' (CanItEdit), '5 seeds' (ReCode). Greedy decoding benchmarks implicitly use 1 run." 349 }, 350 "hyperparameter_search_budget": { 351 "applies": true, 352 "answer": false, 353 "justification": "No mention of hyperparameter search for any aspect of training or evaluation. The paper presents a single set of training hyperparameters and evaluation settings without discussing how they were selected or how many configurations were tried." 354 }, 355 "best_config_selection_justified": { 356 "applies": true, 357 "answer": false, 358 "justification": "The paper presents specific architecture choices (MHA vs GQA vs MQA, RMSNorm vs LayerNorm, swiglu vs gelu) and training configurations without justifying why these were selected over alternatives. The FIM weight α=0.5 is described as 'emperically set' but no search process is described." 359 }, 360 "multiple_comparison_correction": { 361 "applies": true, 362 "answer": false, 363 "justification": "The paper makes numerous comparative claims across dozens of model-benchmark combinations without any correction for multiple comparisons. No statistical tests are performed at all, let alone with multiplicity corrections." 364 }, 365 "self_comparison_bias_addressed": { 366 "applies": true, 367 "answer": false, 368 "justification": "IBM evaluates its own Granite models against baselines re-run by the same team. 
While they note 'we evaluate the baseline models (including ours) using the same script and environment for fair comparison' (Sec 6), they do not acknowledge the inherent bias of authors evaluating their own system (Lucic et al., 2018 showed systematic underperformance of author re-implementations)." 369 }, 370 "compute_budget_vs_performance": { 371 "applies": true, 372 "answer": true, 373 "justification": "The paper explicitly compares performance relative to training tokens: 'Granite-8B-Code-Base outperforms the most competitive CodeGemma-8B model by almost 12 points on HumanEvalPack, despite being trained on significantly less number of tokens (4.5T vs 7.5T tokens).' Performance at different model sizes (3B-34B) also provides compute-performance perspective." 374 }, 375 "benchmark_construct_validity": { 376 "applies": true, 377 "answer": false, 378 "justification": "The paper uses many benchmarks but never discusses whether they actually measure the claimed capabilities. For example, HumanEval pass@1 is used as evidence of 'code intelligence' and 'enterprise software development' capability without questioning whether these synthetic benchmarks measure real-world coding ability." 379 }, 380 "scaffold_confound_addressed": { 381 "applies": false, 382 "answer": false, 383 "justification": "No agentic scaffolding is used. Models are evaluated directly on benchmarks via standard prompting." 384 } 385 }, 386 "data_leakage": { 387 "temporal_leakage_addressed": { 388 "applies": true, 389 "answer": false, 390 "justification": "HumanEval was published in 2021, MBPP in 2021, and many other benchmarks predate the training data collection. The training data includes GitHub code where solutions to these benchmarks are freely available. No temporal leakage analysis is discussed." 
391 }, 392 "feature_leakage_addressed": { 393 "applies": true, 394 "answer": false, 395 "justification": "No discussion of whether evaluation setups leak information, or whether the model's training on GitHub issues and code comments provides unfair context for benchmark problems derived from similar sources." 396 }, 397 "non_independence_addressed": { 398 "applies": true, 399 "answer": false, 400 "justification": "Training data comes from GitHub repositories, and several benchmarks (HumanEval, MBPP) are derived from or available on GitHub. The potential for train-test overlap from shared data sources is never discussed." 401 }, 402 "leakage_detection_method": { 403 "applies": true, 404 "answer": false, 405 "justification": "No decontamination pipeline, canary strings, membership inference tests, or any other leakage detection method is described or applied." 406 } 407 } 408 }, 409 "claims": [ 410 { 411 "claim": "Granite-8B-Code-Base outperforms CodeGemma-8B by almost 12 points on HumanEvalPack average (33.2% vs 21.3%), despite being trained on significantly fewer tokens (4.5T vs 7.5T).", 412 "evidence": "Table 3 and Figure 1 show the HumanEvalSynthesize results. The average across 6 languages confirms the 12-point gap. 
Token counts are stated in Section 4.1.", 413 "supported": "strong" 414 }, 415 { 416 "claim": "Granite Code models consistently reach state-of-the-art performance among available open-source code LLMs.", 417 "evidence": "Tables 3-16 show competitive performance across many benchmarks, but Granite is not always best: StarCoder2-3B beats Granite-3B on MBPP (Table 5), CodeGemma-7B beats Granite-8B on ReCode (Table 16) and CRUXEval (Table 14), and 'there is no single model that works best at every language' on MultiPL-E.", 418 "supported": "moderate" 419 }, 420 { 421 "claim": "Granite-8B-Code-Base outperforms Llama-3-8B-Base by ~12 points on GSM8K and ~6 points on MATH.", 422 "evidence": "Table 15 shows GSM8K: 61.9 vs 49.8 (~12 point gap) and MATH: 21.4 vs 15.6 (~6 point gap).", 423 "supported": "strong" 424 }, 425 { 426 "claim": "Granite Code base models significantly outperform other SOTA base code LLMs on code explanation and fixing tasks.", 427 "evidence": "Tables 10-11 show large margins: Granite-8B-Code-Base achieves 26.4% avg on HumanEvalExplain vs 13.5% for StarCoder2-7B and 12.4% for CodeGemma-7B; 29.6% avg on HumanEvalFix vs 8.9% for StarCoder2-7B and 10.1% for CodeGemma-7B.", 428 "supported": "strong" 429 }, 430 { 431 "claim": "Domain-specific code models are more suitable for cost- and performance-sensitive enterprise environments than larger general-purpose models.", 432 "evidence": "Performance results show Granite outperforming some larger models on coding tasks (Section 6.1.1), but no cost analysis, latency measurement, or enterprise deployment evaluation is provided to support the enterprise suitability claim.", 433 "supported": "weak" 434 }, 435 { 436 "claim": "Depth upscaling from the 20B model is effective for training the 34B model, with small initial performance drop that is quickly recovered.", 437 "evidence": "Section 3 (34B description) and Figure 2 describe the depth upscaling approach. 
The claim that 'drop in performance compared to 20B model is pretty small' is stated but no quantitative evidence of the drop or recovery curve is provided in the paper.", 438 "supported": "weak" 439 }, 440 { 441 "claim": "Instruction tuning consistently improves function calling performance, with +17.88% overall accuracy from Granite-8B-Code-Base to Granite-8B-Code-Instruct.", 442 "evidence": "Figures 4 and 5 show BFCL results with clear improvement from base to instruct variants.", 443 "supported": "strong" 444 } 445 ], 446 "red_flags": [ 447 { 448 "flag": "Company evaluating own product", 449 "detail": "All 46 authors are IBM Research employees evaluating IBM's Granite Code models, which are part of IBM's commercial WatsonX product line. The paper acknowledges WatsonX Code Assistant in the references. This is a textbook conflict of interest where the funder has a direct commercial stake in positive results." 450 }, 451 { 452 "flag": "No contamination analysis", 453 "detail": "Training data includes GitHub code and publicly available datasets. Evaluation benchmarks (HumanEval 2021, MBPP 2021, etc.) have solutions widely available on GitHub. No decontamination, overlap analysis, or temporal leakage discussion is provided despite this being a known and serious issue for code LLM evaluation." 454 }, 455 { 456 "flag": "No error bars or statistical tests", 457 "detail": "All results are point estimates despite sampling-based evaluations. Numerous 'outperforms' claims are made by comparing raw numbers without any statistical significance testing. Small differences (e.g., 0.1% on HumanEvalSynthesize between Granite-20B and StarCoder2-15B) are treated as meaningful." 458 }, 459 { 460 "flag": "No limitations section", 461 "detail": "The paper has no limitations section, no threats to validity, and no scope boundaries. For a paper with 46 authors and strong enterprise positioning claims, the complete absence of self-critical analysis is concerning." 
462 }, 463 { 464 "flag": "Enterprise claims without enterprise evaluation", 465 "detail": "The paper repeatedly claims models are 'optimized for enterprise software development workflows' and suitable for 'enterprise environments,' but all evaluation is on academic benchmarks. No real-world enterprise tasks, developer studies, or deployment metrics are provided." 466 }, 467 { 468 "flag": "Selective baseline presentation", 469 "detail": "Some tables include different model sets, making cross-table comparison difficult. CodeGemma-2B is not included in all evaluations where Granite-3B appears. The Llama-3-70B Python generation issue (footnote in Table 3) is flagged but no investigation is described." 470 } 471 ], 472 "cited_papers": [ 473 { 474 "title": "StarCoder: may the source be with you!", 475 "authors": ["Raymond Li", "Loubna Ben Allal"], 476 "year": 2023, 477 "arxiv_id": "2305.06161", 478 "relevance": "Open-source code LLM that established data filtering and training methodology standards for code models." 479 }, 480 { 481 "title": "Code Llama: Open Foundation Models for Code", 482 "authors": ["Baptiste Rozière"], 483 "year": 2023, 484 "arxiv_id": "2308.12950", 485 "relevance": "Key baseline code LLM family from Meta, demonstrating code specialization through continued pretraining." 486 }, 487 { 488 "title": "OctoPack: Instruction Tuning Code Large Language Models", 489 "authors": ["Niklas Muennighoff"], 490 "year": 2023, 491 "relevance": "Introduced HumanEvalPack benchmark for multilingual code generation/fixing/explanation and CommitPackFT used in Granite instruction tuning." 492 }, 493 { 494 "title": "StarCoder 2 and The Stack v2: The Next Generation", 495 "authors": ["Anton Lozhkov", "Raymond Li"], 496 "year": 2024, 497 "arxiv_id": "2402.19173", 498 "relevance": "State-of-the-art open code LLM and training dataset, key baseline for Granite Code evaluation." 
499 }, 500 { 501 "title": "Evaluating Large Language Models Trained on Code", 502 "authors": ["Mark Chen"], 503 "year": 2021, 504 "relevance": "Introduced HumanEval benchmark, the most widely used code generation evaluation, used extensively in this paper." 505 }, 506 { 507 "title": "Program Synthesis with Large Language Models", 508 "authors": ["Jacob Austin"], 509 "year": 2021, 510 "relevance": "Introduced MBPP benchmark for Python code generation evaluation, used as key benchmark in this paper." 511 }, 512 { 513 "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation", 514 "authors": ["Federico Cassano"], 515 "year": 2023, 516 "relevance": "Multilingual code generation benchmark covering 18 languages, used for comprehensive language evaluation." 517 }, 518 { 519 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 520 "authors": ["Alex Gu"], 521 "year": 2024, 522 "arxiv_id": "2401.03065", 523 "relevance": "Code reasoning and execution benchmark testing input/output prediction, evaluating deeper code understanding." 524 }, 525 { 526 "title": "RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems", 527 "authors": ["Tianyang Liu"], 528 "year": 2023, 529 "arxiv_id": "2306.03091", 530 "relevance": "Repository-level code completion benchmark testing cross-file understanding, used for evaluating practical code completion." 531 }, 532 { 533 "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code", 534 "authors": ["Rangeet Pan"], 535 "year": 2024, 536 "relevance": "CodeLingua benchmark for code translation quality, evaluating LLM capability in cross-language code tasks." 
537 }, 538 { 539 "title": "ReCode: Robustness Evaluation of Code Generation Models", 540 "authors": ["Shiqi Wang"], 541 "year": 2022, 542 "doi": "10.48550/arXiv.2212.10264", 543 "relevance": "Robustness evaluation benchmark using perturbations on code generation, assessing model reliability under input variations." 544 }, 545 { 546 "title": "Berkeley Function Calling Leaderboard", 547 "authors": ["Fanjia Yan"], 548 "year": 2024, 549 "relevance": "Function calling benchmark evaluating LLM ability to generate correct function calls, relevant to tool-use and agentic capabilities." 550 }, 551 { 552 "title": "Efficient Training of Language Models to Fill in the Middle", 553 "authors": ["Mohammad Bavarian"], 554 "year": 2022, 555 "relevance": "Introduced FIM (Fill-in-the-Middle) training objective used in Granite Code models for code completion." 556 } 557 ] 558 }