scan.json (26570B)
1 { 2 "paper": { 3 "title": "Genetic Instruct: Scaling up Synthetic Generation of Coding Instructions for Large Language Models", 4 "authors": [ 5 "Somshubra Majumdar", 6 "Vahid Noroozi", 7 "Mehrzad Samadi", 8 "Sean Narenthiran", 9 "Aleksander Ficek", 10 "Wasi Uddin Ahmad", 11 "Jocelyn Huang", 12 "Jagadeesh Balam", 13 "Boris Ginsburg" 14 ], 15 "year": 2024, 16 "venue": "Annual Meeting of the Association for Computational Linguistics", 17 "arxiv_id": "2407.21077", 18 "doi": "10.48550/arXiv.2407.21077" 19 }, 20 "scan_version": 2, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "Genetic-Instruct generates synthetic coding instructions using evolutionary crossover and mutation operations starting from 512 seed instructions, producing 7.5M instruction-code pairs. Models fine-tuned on this data achieve 69.7% average accuracy across HumanEval/MBPP benchmarks, outperforming alternative synthetic generation methods (best baseline 66.8%) and publicly available datasets. Combining mutation and crossover yields better results than either alone (68.0% vs 66.8% and 66.6%). Performance shows diminishing returns beyond ~6M samples, and even smaller generator models (Qwen-7B) can produce competitive synthetic data.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No source code repository is mentioned. The paper releases the dataset on HuggingFace but provides no code for the Genetic-Instruct pipeline itself." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The 7.5M synthetic dataset is publicly released at https://huggingface.co/datasets/nvidia/OpenCodeGeneticInstruct, mentioned in Section 1 and the conclusion." 
35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions NeMo framework, NeMo Aligner, vLLM, BF16 precision, and tensor parallelism (Section 4.1) but provides no requirements.txt, Dockerfile, or specific library versions sufficient to recreate the environment." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. The algorithm is described in Section 3 and Algorithm 1, but there are no runnable scripts or README-style instructions." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '69.7%') with no confidence intervals, error bars, or ± notation." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper claims 'significant improvement' and that models 'consistently outperform' baselines, but no statistical significance tests (p-values, t-tests, etc.) are reported." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Tables 1-3 report absolute accuracy numbers for all methods, providing baseline context. E.g., Genetic-Instruct achieves 69.7% vs best public dataset at 62.9% (Table 1), allowing readers to compute effect sizes." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification for why 512 seed instructions, why 4M or 7.5M samples, or why these particular benchmark sizes. Values are stated without justification." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No standard deviation, variance across runs, or any spread measure is reported. All results appear to be single-run numbers." 
72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 compares against WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT, and several public datasets (Code Parrot Apps, TACO, OpenCoder, Code Alpaca), plus Llama 3.1 8B Instruct." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines include WizardCoder (2024), OSS-Instruct/Magicoder (2024), INVERSE-INSTRUCT (2024), and OpenCoder (2024) — all contemporary with the paper." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 2 ablates crossover-only vs mutation-only vs combined. Table 3 ablates the effect of different generator models (Mixtral-8x22B, Mixtral-8x7B, Qwen-32B, Qwen-7B)." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are reported on four benchmarks: HumanEval, MBPP, HumanEval+, and MBPP+ (Tables 1-3)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation of generated instructions or code quality. All evaluation is automated via benchmark pass rates. Human evaluation of the quality/diversity of generated synthetic instructions would be relevant." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "Evaluation uses standard held-out benchmarks (HumanEval, MBPP, HumanEval+, MBPP+) that are separate from the synthetic training data, with decontamination applied (Section 3.6)." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Tables 1-3 provide per-benchmark breakdowns (HumanEval, MBPP, HumanEval+, MBPP+) rather than just aggregate averages." 
109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": false, 113 "justification": "No discussion of failure cases — what types of instructions Genetic-Instruct struggles to generate, what kinds of code solutions fail the Judge-LLM, or where fine-tuned models break down." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Figure 2 reports diminishing returns beyond ~6M samples, showing the limits of scaling. The paper also reports that INVERSE-INSTRUCT performs poorly (41.1% average), and that mutation-only slightly outperforms the combined approach on HumanEval." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims of 'significant improvement,' 'highly parallelizable,' and 'effective even with small seed data and weaker generator models' are supported by Table 1 (improvement over baselines), Section 3.5 (parallelization), and Table 3 (Qwen-7B competitive with Qwen-32B)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The ablation study (Table 2) uses controlled single-variable manipulation — same base model, same data size, same generator, varying only the algorithm. This is adequate for causal claims about the contribution of mutation and crossover operations." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims 'Coding Instructions for Large Language Models' broadly, but evaluation is exclusively on Python benchmarks (HumanEval/MBPP). Section 4 states 'our evaluation focuses exclusively on Python coding benchmarks' but the abstract and title do not bound the claims to Python." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "No discussion of alternative explanations. 
Could the gains be from data volume alone? Are the baseline re-implementations optimal? Could different seed data change the ranking? None of these are addressed." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures pass rates on coding benchmarks and frames results as 'code generation capability' — the benchmarks directly test code generation, so the proxy closely matches the claim. No broader unsupported framing." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Specific model identifiers are provided: Llama3.1-8B-Base, Mixtral-8x22B, Mixtral-8x7B, Qwen2.5-7B-Base, Qwen-32B, Qwen-7B, Meta-Llama-3-70B-Instruct. These are sufficiently specific for open-source models with distinct releases." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Full prompt templates are provided in Appendices A-F: mutation prompts (Figure 3), crossover prompt (Figure 4), code generation prompt (Figure 5), fitness/judge prompt (Figure 6), decontamination prompt (Figure 7), and evaluation prompts (Figures 8-9)." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.1 reports: temperature 1.2 for Instructor-LLM, 1.0 for Coder/Judge-LLM, learning rate 5e-6 decaying to 5e-7, cosine annealing, 3 epochs, nucleus sampling, max sequence length 1024, batch sizes Bm=100, Bc=10, mutation probability Mp=0.5, 20 parallel colonies." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The system is a data generation pipeline with sequential LLM calls (Instructor → Coder → Judge), not an agent with tools, retry logic, or memory." 
168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3 documents the full pipeline: seed selection from Tiger-Leetcode (512 samples), crossover/mutation operations, AST validation of generated code (Section 3.3), Judge-LLM filtering (Section 3.4), and decontamination (Section 3.6)." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "No dedicated limitations section exists. The paper proceeds from experiments (Section 4) directly to conclusion (Section 5) with no discussion of limitations or threats to validity." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No specific threats to validity are discussed anywhere in the paper." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "Section 4 briefly states 'our evaluation focuses exclusively on Python coding benchmarks' but does not systematically state scope boundaries — what was not tested, what populations are excluded, or what claims are not being made." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The full 7.5M synthetic dataset is released on HuggingFace (nvidia/OpenCodeGeneticInstruct), enabling independent verification of the training data." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3 describes the full data generation procedure: starting from 512 Tiger-Leetcode seeds, applying crossover/mutation via Instructor-LLM, generating code via Coder-LLM, filtering via AST checks and Judge-LLM, and decontamination." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. 
Data sources are standard public datasets (Tiger-Leetcode seeds, Stack v2 for baselines)." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "Algorithm 1 and Section 3 detail each pipeline stage: seed selection → crossover/mutation → instruction generation → code generation → AST validation → Judge-LLM filtering → aggregation → decontamination." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding disclosure or acknowledgments section. All authors are NVIDIA employees but no explicit funding statement is provided." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All nine authors are clearly identified as NVIDIA employees with @nvidia.com email addresses listed in the header." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "NVIDIA has commercial interest in compute-intensive methods succeeding — synthetic data generation at scale requires significant GPU resources. The company benefits from demonstrating the value of large-scale GPU-based data generation." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests statement or financial interest disclosures appear in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The training data cutoff dates for the models used (fine-tuning bases Llama 3.1 and Qwen2.5; generators Mixtral and Qwen) are not stated, making it impossible to assess whether the models' pre-training data included benchmark solutions." 
241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Section 3.6 describes a decontamination process for their synthetic data: embedding-based similarity search + LLM paraphrase detection against all benchmark datasets, with positional bias control." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section 3.6 applies a concrete decontamination methodology (Yang et al., 2023) using Sentence Transformer similarity search and Meta-Llama-3-70B-Instruct paraphrase detection, with dual-direction matching to control positional bias." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No inference cost, API costs, or wall-clock time is reported for the generation pipeline despite producing 7.5M samples using multiple LLMs." 
295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No total GPU hours, training time, or hardware specification is provided. The paper mentions tensor parallelism and vLLM but does not quantify the total computational budget." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No results across multiple random seeds are reported. All results appear to be single-run numbers." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "Hyperparameter values are reported (Section 4.1) but no search budget is stated. Batch sizes are justified as 'based on our observation' without specifying how many configurations were tried." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "No explanation of how the final configuration was selected. Hyperparameters appear chosen but the selection process is not described." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical tests are performed at all, so multiple comparison correction is inapplicable." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors re-implement all baselines (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) themselves. While they justify this for fairness ('same generator model, seed population, base model'), they do not acknowledge the inherent bias of implementing competing methods (Lucic et al., 2018)." 
332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "All methods generate the same number of samples (4M), but the compute cost per method likely differs (mutation requires per-sample LLM calls, crossover batches instructions). This is not analyzed or compared." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "No discussion of whether HumanEval/MBPP actually measure 'code generation capability' broadly, or their known limitations (e.g., simple function-level problems, limited language coverage)." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is involved — the evaluation is direct model fine-tuning and benchmark pass rates." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "HumanEval (2021) and MBPP (2021) were published years before Llama 3.1 and Mixtral were trained. The base models may have seen solutions during pre-training, but this temporal leakage is not discussed." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup provides hints not available in real usage." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No analysis of whether the synthetically generated instructions share structural similarities with benchmark problems, despite using coding questions as seeds." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": true, 368 "justification": "Section 3.6 applies a concrete leakage detection method: embedding-based similarity search using Sentence Transformer followed by LLM paraphrase detection (Meta-Llama-3-70B-Instruct) with positional bias control via dual-direction matching." 
369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Genetic-Instruct achieves 69.7% average accuracy across coding benchmarks, outperforming alternative synthetic generation methods and public datasets", 375 "evidence": "Table 1: Genetic-Instruct 7.5M achieves 69.7% average vs best baseline Self-Instruct at 66.8% and best public dataset OpenCoder Stage 1 at 62.9%. Four benchmarks reported individually.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Scaling synthetic data improves model performance with diminishing returns beyond ~6M samples", 380 "evidence": "Figure 2 shows coding accuracy rising from ~45% baseline to ~69% at 7.5M samples, with plateau visible after ~6M. Six generations of ~1.5M samples each.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Combining mutation and crossover operations yields better performance than either alone", 385 "evidence": "Table 2: Combined approach achieves 68.0% average vs crossover-only 66.8% and mutation-only 66.6%, all at 4M samples with same base model.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Smaller generator models can produce competitive synthetic data", 390 "evidence": "Table 3: Qwen-7B as generator yields 66.5% (Llama base) and 76.7% (Qwen base) vs Qwen-32B at 66.9% and 77.3% respectively.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Models trained on Genetic-Instruct data outperform Llama3.1-8B-Instruct", 395 "evidence": "Table 1: Genetic-Instruct 7.5M achieves 69.7% average vs Llama 3.1 8B Instruct at 65.9%.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No error bars or uncertainty quantification", 402 "detail": "All results are reported as point estimates without confidence intervals, standard deviations, or multiple runs. The margin between Genetic-Instruct (68.0%) and Self-Instruct (66.8%) at 4M samples is 1.2 percentage points — this could easily be within noise for single-run results." 
403 }, 404 { 405 "flag": "Company evaluating own compute-intensive method", 406 "detail": "All authors are NVIDIA employees. The proposed method requires massive GPU-scale parallelism (20 colonies, 7.5M LLM-generated samples). NVIDIA has commercial interest in demonstrating the value of compute-intensive approaches but does not acknowledge this conflict." 407 }, 408 { 409 "flag": "No limitations section", 410 "detail": "The paper has no limitations section, no threats-to-validity discussion, and no systematic statement of scope boundaries. This is a significant omission for an empirical paper." 411 }, 412 { 413 "flag": "Baselines re-implemented by the paper's own authors", 414 "detail": "The authors re-implement all baselines (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) without acknowledging the bias of implementing competing methods. INVERSE-INSTRUCT achieves only 41.1% in their re-implementation — an unusually poor result that warrants scrutiny." 415 }, 416 { 417 "flag": "Python-only evaluation with broad claims", 418 "detail": "The title and abstract claim 'Coding Instructions for Large Language Models' generally, while all evaluation is on Python-only benchmarks (HumanEval/MBPP). The abstract does not bound claims to Python." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Self-instruct: Aligning language models with self-generated instructions", 424 "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra"], 425 "year": 2023, 426 "relevance": "Foundational method for synthetic instruction generation using LLMs, serves as a baseline." 427 }, 428 { 429 "title": "WizardCoder: Empowering code large language models with evol-instruct", 430 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 431 "year": 2024, 432 "relevance": "Adapts Evol-Instruct for code generation via instruction mutation, key baseline in the synthetic coding data space." 
433 }, 434 { 435 "title": "Magicoder: Empowering code generation with oss-instruct", 436 "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"], 437 "year": 2024, 438 "relevance": "Generates coding instructions from open-source code snippets (OSS-Instruct), key baseline for code-inspired instruction generation." 439 }, 440 { 441 "title": "InverseCoder: Unleashing the power of instruction-tuned code LLMs with inverse-instruct", 442 "authors": ["Yutong Wu", "Di Huang", "Wenxuan Shi"], 443 "year": 2024, 444 "arxiv_id": "2407.05700", 445 "relevance": "Generates instructions from existing code, representing a code-to-instruction paradigm for synthetic data generation." 446 }, 447 { 448 "title": "Evaluating large language models trained on code", 449 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 450 "year": 2021, 451 "arxiv_id": "2107.03374", 452 "relevance": "Introduces HumanEval benchmark for code generation, widely used for evaluating code LLMs." 453 }, 454 { 455 "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation", 456 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang"], 457 "year": 2023, 458 "relevance": "Introduces EvalPlus (HumanEval+, MBPP+) with additional test cases for more rigorous code evaluation." 459 }, 460 { 461 "title": "OpenCoder: The open cookbook for top-tier code large language models", 462 "authors": ["Siming Huang", "Tianhao Cheng", "Jason Klein Liu"], 463 "year": 2024, 464 "relevance": "Open-source code LLM with public training data, serves as a baseline dataset comparison." 465 }, 466 { 467 "title": "Rethinking benchmark and contamination for language models with rephrased samples", 468 "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng"], 469 "year": 2023, 470 "arxiv_id": "2311.04850", 471 "relevance": "Proposes decontamination methodology adopted by this paper to prevent benchmark leakage in training data." 
472 }, 473 { 474 "title": "The llama 3 herd of models", 475 "authors": ["Aaron Grattafiori"], 476 "year": 2024, 477 "arxiv_id": "2407.21783", 478 "relevance": "Base model (Llama 3.1 8B) used for fine-tuning experiments in the primary evaluation." 479 }, 480 { 481 "title": "WizardLM: Empowering large pre-trained language models to follow complex instructions", 482 "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"], 483 "year": 2024, 484 "relevance": "Introduces Evol-Instruct with meta-instructions for increasing instruction complexity, foundational to the mutation operation." 485 } 486 ] 487 }