scan-v5.json (26036B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence", 6 "authors": [ 7 "Mayank Mishra", 8 "Matt Stallone", 9 "Gaoyuan Zhang", 10 "Yikang Shen", 11 "Aditya Prasad" 12 ], 13 "year": 2024, 14 "venue": "arXiv.org", 15 "arxiv_id": "2405.04324", 16 "doi": "10.48550/arXiv.2405.04324" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": false, 23 "justification": "The abstract claims models 'consistently reach state-of-the-art performance,' but tables show numerous counter-examples: CodeGemma-7B beats Granite-8B on MBPP by 10+ points (53.0% vs 42.2%) and on CRUXEval; StarCoder2-15B leads at mid-size on MultiPL-E.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper attributes superior explanation/fixing performance to 'data mixture and base model training decisions' but provides no ablation studies isolating the contribution of phase-2 training, FIM objective, or specific data sources.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Claims about suitability for 'enterprise software development workflows' are not bounded — all evidence comes from academic benchmarks with no real-world development task validation.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "When Granite outperforms on explanation/fixing tasks, the paper attributes this to training data without considering alternatives such as evaluation format alignment with training distribution.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "Pass@1 on benchmarks is equated with 'code intelligence' and enterprise productivity throughout without discussion of how academic benchmark performance relates to real developer outcomes.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "No dedicated limitations or threats-to-validity section exists; the paper moves directly from evaluation results to conclusion.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No threats to validity are discussed despite obvious concerns: IBM employees evaluating IBM models, and training data sourced from GitHub that likely overlaps with evaluation benchmarks.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The conclusion only mentions future work directions; no explicit statements about what the benchmark results do not demonstrate (e.g., real-world productivity, safety, alignment beyond instruction following).", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No explicit funding disclosure section is present; IBM authorship is evident but institutional funding is not formally declared.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors are listed with 'IBM Research' affiliation clearly disclosed in the author block.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "IBM employees evaluate IBM-developed models; the organization directly benefits commercially from positive evaluation results via watsonx Code Assistant.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests, patent, equity, or financial interest declaration is present anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": false, 101 "justification": "Terms like 'enterprise-grade,' 'all-around code model,' and 'code intelligence' are used throughout without precise definitions; what distinguishes 'enterprise' use from general use is never clarified.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The contribution is explicitly stated: a family of open-source code LLMs (3B–34B params) for enterprise code tasks, released under Apache 2.0, trained on 116 programming languages.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper actively compares against StarCoder, StarCoder2, CodeLlama, CodeGemma, and Llama-3, discussing gaps in prior work (task diversity beyond generation, enterprise license issues).", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Models released at https://github.com/ibm-granite/granite-code-models under Apache 2.0 license, explicitly stated in abstract and paper.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All evaluation benchmarks (HumanEvalPack, MBPP, RepoBench, CrossCodeEval, etc.) are standard publicly available datasets; the filtered training corpus is not released but evaluation data is accessible.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Training infrastructure (FlashAttention 2, NVIDIA Apex, Megatron-LM, BF16 precision) is mentioned but no requirements.txt, Dockerfile, or reproducible environment specification is provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step training or evaluation reproduction instructions are provided; evaluation scripts are mentioned ('same script and environment') but not shared in the paper.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars appear in any results table despite sampling multiple completions per problem (e.g., 50 for MultiPL-E).", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are conducted; small differences like 0.1% (Granite-20B vs StarCoder2-15B on HumanEvalSynthesize) are presented as meaningful.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Absolute percentage improvements are consistently reported (e.g., '12 points improvement on HumanEvalPack,' '4% improvement on HumanEvalSynthesize'), providing effect size context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Fixed benchmark sizes (e.g., HumanEval: 164 problems) are used without justification that they are sufficient for the numerous comparative claims made across 30+ model comparisons.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Despite sampling multiple completions per problem (40–50 samples for several benchmarks), no variance or standard deviation across sampling runs is reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Extensive baselines: StarCoder, StarCoder2, CodeLlama (7B/13B/34B/70B), CodeGemma, StableCode, Mistral, Llama-3, Gemma, Mixtral across all benchmarks.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include recently released models contemporary to the May 2024 submission: StarCoder2, CodeGemma, and Llama-3 all from early 2024.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "No ablation studies isolate the contribution of phase-2 training, FIM objective (α=0.5), depth upscaling, NeFTune noise, or specific data source contributions.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Multiple metrics used: Pass@1, exact match, edit similarity, ExcessCode, identifier F1, AST evaluation, executable evaluation, RP@1 (robustness), across 19 benchmarks.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is not applicable; automated test execution for functional correctness is the standard and appropriate method for code generation evaluation.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Standard benchmarks (HumanEval, MBPP, RepoBench, etc.) provide held-out test sets not used during training.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Per-language breakdowns provided for MultiPL-E (18 languages), DS-1000 (7 libraries), HumanEvalPack (6 languages × 3 tasks), and per-category for BFCL.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "No failure cases are discussed; the only failure mention is a footnote about Llama-3-8B generating invalid Python programs, not about Granite models.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper explicitly acknowledges losses: 'Granite-8B-Code-Base lags behind CodeGemma-7B on all [ReCode] categories,' 'no single model which performs consistently best at 3B parameters,' and Granite-3B falls short of StarCoder2-3B on MBPP.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "Baseline models referenced by paper name without pinned checkpoint hashes or release dates; some linked to GitHub but without specific version commits.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "Evaluation format (completion vs. instruction template) is mentioned but actual prompts are not shown; the paper states it follows official formats without providing them.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Training hyperparameters (AdamW β1=0.9, β2=0.95, learning rates, batch sizes, warmup steps) and evaluation parameters (temperature 0.2/0.8, top-p 0.95, max tokens) are thoroughly documented.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding used; this is base/instruct model evaluation on standard benchmarks.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 2 provides detailed documentation of crawling, language filtering with explicit rules, SHA256 + MinHash/LSH deduplication (Jaccard threshold 0.7), HAP keyword filtering, and StarPII-based redaction.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "The filtered training corpus is not publicly released; source datasets (GitHub Code Clean, StarCoderdata) are available but not the processed version actually used for training.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 2 describes data sources, filtering criteria, deduplication methodology, PII redaction, malware scanning, and natural language data curation in sufficient detail.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; standard public benchmarks used for evaluation.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Full pipeline documented: source data → language filtering → quality filtering → exact deduplication (SHA256) → fuzzy deduplication (MinHash/LSH) → HAP filtering → PII redaction → malware scan → tokenization.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff date is stated despite training on GitHub data that could substantially overlap with widely-used evaluation benchmarks.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of potential overlap between GitHub-sourced training data and evaluation benchmarks (HumanEval, MBPP) which are publicly available on GitHub and existed before data collection.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "HumanEval (2021) and MBPP (2021) were publicly available long before training data collection; no decontamination steps or overlap analysis are described.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference latency, throughput, or cost estimates are provided despite the paper positioning models for enterprise deployment where cost-per-token matters.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Training infrastructure is described and carbon emissions estimated (~455 tCO2eq), but total GPU-hours or FLOPs budget is not explicitly reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Granite-8B-Code-Base outperforms CodeGemma-8B by ~12 points on the full HumanEvalPack (synthesis+explanation+fixing)", 375 "evidence": "Figure 1 and Tables 3/10/11 show Granite-8B averages substantially higher than CodeGemma-7B on the combined HumanEvalPack tasks, particularly on explanation (26.4% vs 12.4%) and fixing (29.6% vs 10.1%)", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Granite Code models consistently reach state-of-the-art performance among open-source code LLMs", 380 "evidence": "Tables show multiple counter-examples: CodeGemma-7B beats Granite-8B on MBPP (53.0% vs 42.2%) and CRUXEval; StarCoder2-15B leads Granite-20B on several MultiPL-E languages; no single claim of universal SOTA is defensible", 381 "supported": "weak" 382 }, 383 { 384 "claim": "Granite-3B-Code-Instruct surpasses CodeLlama-34B-Instruct on HumanEvalSynthesize", 385 "evidence": "Table 3 directly contradicts this: Granite-3B-Code-Instruct averages 39.6% vs CodeLlama-34B-Instruct at 41.3%; this textual claim is false per the paper's own data", 386 "supported": "unsupported" 387 }, 388 { 389 "claim": "Granite-8B-Code-Base outperforms Llama-3-8B-Base by ~12 points on GSM8K and ~6 points on MATH", 390 "evidence": "Table 15 confirms: Granite-8B at 61.9% vs Llama-3-8B at 49.8% on GSM8K; 21.4% vs 15.6% on MATH", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Two-phase training with code then code+language data improves reasoning capabilities", 395 "evidence": "Asserted in paper but no ablation comparing phase-1-only vs phase-2 trained models is provided; claim is plausible but undemonstrated", 396 "supported": "weak" 397 }, 398 { 399 "claim": "Depth upscaling from 20B to 34B results in minimal performance drop that quickly recovers with continued pretraining", 400 "evidence": "Section 3 describes this qualitatively ('drop in performance is pretty small') but provides no pre/post quantitative comparison tables", 401 "supported": "weak" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval" 406 ], 407 "key_findings": "IBM's Granite Code Models (3B–34B parameters) achieve competitive performance on code benchmarks across generation, explanation, fixing, and translation tasks in 116 languages, with the 8B model approaching 70B-class performance on explanation/fixing tasks. The models demonstrate good performance-to-size ratios versus CodeLlama variants. However, performance is highly benchmark-dependent: CodeGemma-7B beats Granite-8B on MBPP by 10+ points and on robustness benchmarks, and no single Granite model leads consistently across all evaluations. The paper contains at least one factual internal inconsistency: the text claims Granite-3B-Instruct surpasses CodeLlama-34B-Instruct, but their own Table 3 shows the opposite (39.6% vs 41.3%).", 408 "red_flags": [ 409 { 410 "flag": "Internal claim contradicts own table", 411 "detail": "Section 6.1.1 states 'Granite-3B-Code-Instruct surpasses the performance of CodeLlama-34B-Instruct' but Table 3 shows 39.6% vs 41.3% average — the claim is directly contradicted by the paper's own data." 412 }, 413 { 414 "flag": "Self-evaluation conflict of interest", 415 "detail": "IBM employees evaluate IBM-developed models with no independent verification; the organization directly benefits commercially from positive results via watsonx Code Assistant, yet no competing interests are declared." 416 }, 417 { 418 "flag": "Benchmark contamination unaddressed", 419 "detail": "Training data is sourced from GitHub where HumanEval, MBPP, and other evaluation benchmarks have been publicly available since 2021; no decontamination analysis or training cutoff date is provided." 420 }, 421 { 422 "flag": "No ablation studies despite causal claims", 423 "detail": "The paper makes causal claims attributing performance gains to 'data mixture and training decisions' but provides zero ablations for phase-2 training, FIM objective weighting, depth upscaling, or NeFTune noise." 424 }, 425 { 426 "flag": "No statistical rigor for comparative claims", 427 "detail": "Differences as small as 0.1% are treated as meaningful (e.g., Section 6.1.1 on StarCoder2-15B comparison) with no confidence intervals, error bars, or significance tests across 30+ model comparisons." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "StarCoder: May the Source Be With You!", 433 "relevance": "Direct predecessor; Granite reuses StarCoder's tokenizer, FIM training format, and builds on StarCoderData as a training source" 434 }, 435 { 436 "title": "OctoPack: Instruction Tuning Code Large Language Models (HumanEvalPack benchmark)", 437 "relevance": "Provides the primary multi-task evaluation benchmark (synthesis, explanation, fixing) across 6 languages that is central to Granite's evaluation narrative" 438 }, 439 { 440 "title": "Code Llama: Open Foundation Models for Code", 441 "relevance": "Key baseline across all benchmarks; Granite explicitly positions itself relative to CodeLlama variants at each model size" 442 }, 443 { 444 "title": "StarCoder 2 and The Stack v2: The Next Generation", 445 "relevance": "Most direct contemporary competitor; competitive comparison throughout, especially on MultiPL-E and FIM tasks" 446 }, 447 { 448 "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion", 449 "relevance": "Repository-level code completion benchmark used to evaluate practical coding capability beyond isolated function generation" 450 }, 451 { 452 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 453 "relevance": "Code reasoning and execution benchmark used to evaluate deeper code understanding beyond surface generation" 454 }, 455 { 456 "title": "Program Synthesis with Large Language Models (MBPP)", 457 "relevance": "Standard Python code generation benchmark used across model comparisons" 458 }, 459 { 460 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 461 "relevance": "Foundation benchmark for code generation evaluation; forms basis of HumanEvalPack" 462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 3, 467 "justification": "Models are immediately downloadable from HuggingFace under Apache 2.0 and cover real enterprise code tasks across 116 languages." 468 }, 469 "surprise_contrarian": { 470 "score": 1, 471 "justification": "Results confirm expected pattern that domain-specific code models beat general models; the 8B explanation/fixing performance is notable but not paradigm-shifting." 472 }, 473 "fear_safety": { 474 "score": 0, 475 "justification": "No AI safety, misuse, or risk concerns are raised; HAP filtering is framed as a feature." 476 }, 477 "drama_conflict": { 478 "score": 1, 479 "justification": "Implicitly competitive with Google (CodeGemma), Meta (CodeLlama/Llama-3), and BigCode (StarCoder2) in the open-source code model race." 480 }, 481 "demo_ability": { 482 "score": 3, 483 "justification": "Models are publicly available on HuggingFace with Apache 2.0 license; anyone can download and test immediately via standard inference libraries." 484 }, 485 "brand_recognition": { 486 "score": 2, 487 "justification": "IBM is a globally recognized enterprise technology brand with established credibility, linked to the commercial watsonx Code Assistant product." 488 } 489 }, 490 "hn_data": { 491 "threads": [ 492 { 493 "hn_id": "39385811", 494 "title": "Personality trait recognition using ECG spectrograms and deep learning", 495 "points": 48, 496 "comments": 40, 497 "url": "https://news.ycombinator.com/item?id=39385811", 498 "created_at": "2024-02-15T17:49:03Z" 499 }, 500 { 501 "hn_id": "31324857", 502 "title": "Panoptic Neural Fields: A Semantic Object-Aware Neural Scene Representation", 503 "points": 2, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=31324857", 506 "created_at": "2022-05-10T08:38:46Z" 507 }, 508 { 509 "hn_id": "42912008", 510 "title": "HarmBench: A Standardized Evaluation Framework for Robust Refusal", 511 "points": 1, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=42912008", 514 "created_at": "2025-02-02T21:26:17Z" 515 }, 516 { 517 "hn_id": "35991015", 518 "title": "Penguin Huddling: A Continuum Model", 519 "points": 1, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=35991015", 522 "created_at": "2023-05-18T17:08:38Z" 523 } 524 ], 525 "top_points": 48, 526 "total_points": 52, 527 "total_comments": 40 528 } 529 }