scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27481B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Genetic Instruct: Scaling up Synthetic Generation of Coding Instructions for Large Language Models",
      6     "authors": [
      7       "Somshubra Majumdar",
      8       "Vahid Noroozi",
      9       "Mehrzad Samadi",
     10       "Sean Narenthiran",
     11       "Aleksander Ficek",
     12       "Wasi Uddin Ahmad",
     13       "Jocelyn Huang",
     14       "Jagadeesh Balam",
     15       "Boris Ginsburg"
     16     ],
     17     "year": 2024,
     18     "venue": "Annual Meeting of the Association for Computational Linguistics",
     19     "arxiv_id": "2407.21077",
     20     "doi": "10.48550/arXiv.2407.21077"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "All abstract claims are supported: algorithm presented (Algorithm 1, Sections 3.1–3.6), evolutionary principles explained, 7.5M samples generated (confirmed in introduction and Table 1), improvements demonstrated in Table 1 (69.7% avg vs baselines).",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Causal claims about mutation/crossover improving performance are supported by ablation study (Table 2: Genetic-Instruct 68.0% > Crossover-Only 66.8% > Mutation-Only 66.6%), which is appropriate methodology for causal inference.",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Paper explicitly bounds generalization to Python code generation: 'we constrain the generated solutions to Python' (Section 4.1). Evaluation limited to four Python benchmarks (HumanEval, MBPP, HE+, MBPP+). No claims beyond this scope.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper presents alternative methods (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) with results but does not discuss why alternatives underperform, e.g., INVERSE-INSTRUCT 41.1% vs Genetic-Instruct 69.7% without root cause analysis.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Paper measures 'code accuracy' on standardized benchmarks (pass@1 on HumanEval, MBPP) and claims improved 'coding capability.' Measurement granularity matches claim granularity—both refer to benchmark performance.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Paper has no dedicated limitations or threats-to-validity section. Conclusion jumps directly from results to references without discussing scope constraints, generalization limitations, or methodological caveats.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Specific threats not systematically discussed. Section 3.3 mentions 'code may not be parseable or compilable' but doesn't discuss sample representativeness, seed bias, or benchmark-specific applicability as threats.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Paper does not explicitly state what findings do NOT show. Evaluation limited to Python benchmarks, but paper doesn't discuss whether results transfer to non-benchmark tasks, multi-language code, or real-world programming.",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding statement provided. Work is conducted at NVIDIA but no funder is explicitly named or acknowledged.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All author affiliations with NVIDIA are clearly listed: 'NVIDIA' as institution and '@nvidia.com' email addresses for all authors.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "NVIDIA employees (all authors) are developing and evaluating NVIDIA-adjacent synthetic data generation techniques. The funder (implied: NVIDIA) benefits directly from positive results. Not independent.",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests statement, patent disclosures, equity stakes, or financial interest declarations provided.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms defined contextually: 'synthetic instructions' used as instruction-code pairs throughout, 'alignment' referenced in standard ML sense (Ouyang et al. 2022), 'code generation capability' operationalized as benchmark accuracy.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Contribution explicitly stated in abstract and Section 1: 'We introduce Genetic-Instruct, a scalable algorithm to generate synthetic coding instructions' plus the released 7.5M dataset on Hugging Face.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section 2 engages with Self-Instruct (general tasks, not coding), Evol-Instruct (mutation-based evolution), WizardCoder (code-specific mutation), and code-from-snippet methods (OSS-Instruct, INVERSE-INSTRUCT). Empirical comparisons in Table 1 show differentiation.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Dataset is released on Hugging Face (nvidia/OpenCodeGeneticInstruct), but generation pipeline code is not mentioned as released. Prompts are in appendices, but orchestration/training code is not stated as available.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Paper explicitly states 'We released the dataset publicly' with link to Hugging Face: nvidia/OpenCodeGeneticInstruct. 7.5M instruction-code pairs are publicly accessible.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Section 4.1 provides hyperparameters (learning rate 5e-6, temperature 1.2/1.0, max sequence length 1024, batch sizes 100/10), frameworks (AdamW, NeMo, vLLM, BF16 precision), and optimizer details.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Algorithm 1 pseudo-code provided, Sections 3.1–3.6 describe pipeline steps, Section 4.1 lists all hyperparameters, and Appendices A–F contain all prompts. Sufficient detail for practitioners to implement.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Tables 1–3 report only point estimates (accuracy percentages). No confidence intervals, standard deviations, error bars, or variance measures provided across benchmarks or runs.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "No statistical significance tests reported. Improvements claimed (e.g., 69.7% vs 65.9%) are not tested for significance; no p-values or t-tests provided.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Effect sizes reported as percentage point improvements: Genetic-Instruct 69.7% vs Llama 3.1 Instruct baseline 65.9% (+3.8pp), vs WizardCoder 65.7% (+4.0pp).",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No power analysis or justification for 7.5M sample size chosen. Evaluation uses fixed benchmark sizes (HumanEval 164 tests, MBPP 427) without sample size justification.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Figure 2 shows single line with no error bars. Tables 1–3 show point estimates only. No variance, standard error, or spread reported for repeated runs or across metrics.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Table 1 includes four baseline methods (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) and four public datasets (Code Parrot, TACO, OpenCoder, Code Alpaca) plus Llama 3.1 Instruct.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Baselines are from 2023–2024 (WizardCoder 2024, OpenCoder 2024, Self-Instruct 2023), contemporary with this paper. All baselines are competitive and recent.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Table 2 compares Crossover-Only (66.8%), Mutation-Only (66.6%), and Full Genetic-Instruct (68.0%). Table 3 ablates generator model choice (Mixtral vs Qwen variants).",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Four benchmarks evaluated: MBPP, MBPP+, HumanEval, HumanEval+. Results reported per-metric and as averages.",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "No human evaluation. Code generation can be automatically verified by compilers/test suites, making human eval less critical.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Evaluation on standard held-out benchmarks: HumanEval, MBPP, HumanEval+, MBPP+ are all public test sets not used in generation.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "Results reported across four benchmarks but no breakdown by problem difficulty, code pattern, algorithm type, or language feature. No error analysis by category.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "No discussion of failure cases. Paper does not show examples of instructions that failed filtering, code that didn't parse, or predictions that were incorrect.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Diminishing returns reported: 'beyond approximately 6 million samples, the accuracy gains begin to plateau' (Figure 2 caption). This is a negative result.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Generator models named (Mixtral-8x22B, Qwen 32B, Llama3.1-8B-Base) but no snapshot dates, commit hashes, or exact checkpoint versions provided. Model papers cited (Jiang et al. 2024) but specific checkpoints unclear.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "All prompts provided in appendices: Mutation (A), Crossover (B), Code Generation (C), Fitness/Judge (D), Decontamination (E), Evaluation (F). Complete transparency.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Section 4.1 lists: learning rate (5e-6), temperature (1.2, 1.0), max sequence length (1024), batch sizes (Bm=100, Bc=10), mutation probability (0.5), colonies (20), few-shot examples (3-shot).",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Algorithm 1 and Sections 3.1–3.6 describe the Genetic-Instruct pipeline: mutation, crossover, code generation, fitness evaluation, decontamination. All steps detailed.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Section 4.1 mentions Python AST validation for syntactic correctness, Section 3.6 describes two-stage decontamination (embedding + paraphrase). Seed dataset specified (Tiger-Leetcode, 512 samples).",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Final synthetic dataset (7.5M samples) is released on Hugging Face and publicly available for verification and reuse.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Data collection is the Genetic-Instruct pipeline described in Algorithm 1 and Sections 3.1–3.6. Each generation step is detailed (mutation, crossover, code generation, fitness evaluation).",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": false,
    285           "answer": false,
    286           "justification": "No human participants. Seed dataset (Tiger-Leetcode, 512 samples) is described as standard benchmark data, not recruited.",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Full pipeline from seed to final dataset is documented: Algorithm 1 (overview), Sections 3.1–3.6 (detailed steps), Section 3.6 (decontamination).",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Not applicable; the paper evaluates on public benchmarks rather than against a model training cutoff. However, the generator models' training cutoffs are not stated.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "Section 3.6 'LLM Decontamination' explicitly addresses preventing test set leakage into synthetic training data. Two-stage process using embedding similarity and LLM paraphrase detection.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": true,
    311           "answer": true,
    312           "justification": "Section 3.6 describes decontamination against HumanEval, MBPP, HE+, MBPP+ benchmarks using Yang et al. (2023) methodology: embedding search + paraphrase detection with positional bias control.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No human participants; all evaluation is computational.",
    320         "source": "haiku"
    321       },
    322       "cost_and_practicality": {
    323         "inference_cost_reported": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "Paper mentions using vLLM for 'high-throughput inference' and 20 parallel colonies but does not report wall-clock time, GPU hours, cost, or latency metrics.",
    327           "source": "haiku"
    328         },
    329         "compute_budget_stated": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "No computation budget reported. Total GPU-hours, inference cost, or computational requirements for generating 7.5M samples are not disclosed.",
    333           "source": "haiku"
    334         }
    335       }
    336     }
    337   },
    338   "claims": [
    339     {
    340       "claim": "Genetic-Instruct generates 7.5M diverse and high-quality coding instruction-code pairs",
    341       "evidence": "Algorithm 1 describes the multi-generation evolutionary process; released dataset on Hugging Face (nvidia/OpenCodeGeneticInstruct) contains 7.5M samples",
    342       "supported": "strong"
    343     },
    344     {
    345       "claim": "Models fine-tuned on Genetic-Instruct data outperform other synthetic data generation methods and public datasets",
    346       "evidence": "Table 1: Genetic-Instruct 69.7% average vs WizardCoder 65.7%, Self-Instruct 66.8%, OpenCoder (best public) 62.9%. Caveat: no significance tests.",
    347       "supported": "strong"
    348     },
    349     {
    350       "claim": "Combining mutation and crossover operations yields better results than either operation alone",
    351       "evidence": "Table 2: Full Genetic-Instruct 68.0% > Crossover-Only 66.8% > Mutation-Only 66.6%. Improvement over mutation-only is ~1.4pp.",
    352       "supported": "moderate"
    353     },
    354     {
    355       "claim": "Smaller generator models (Qwen-7B) can produce competitive quality synthetic data compared to larger models (Qwen-32B)",
    356       "evidence": "Table 3: Qwen-7B (66.5% avg) vs Qwen-32B (66.9%)—only 0.4pp difference, but Qwen-32B still better. Finding is that smaller models are 'competitive' but not equal.",
    357       "supported": "moderate"
    358     },
    359     {
    360       "claim": "Decontamination prevents benchmark leakage into synthetic training data",
    361       "evidence": "Section 3.6 describes two-stage decontamination (embedding similarity + paraphrase detection). Process is detailed but impact (how many removed) not quantified.",
    362       "supported": "moderate"
    363     },
    364     {
    365       "claim": "The approach is highly parallelizable and achieves good scaling properties",
    366       "evidence": "Algorithm 1 shows parallel colony execution; Figure 2 demonstrates scaling from 0 to 7.5M samples; Section 3.5 describes 20-colony parallelization.",
    367       "supported": "strong"
    368     },
    369     {
    370       "claim": "Results transfer to standard Python code generation benchmarks (HumanEval, MBPP)",
    371       "evidence": "Fine-tuned models on synthetic data evaluated on four standard benchmarks with consistent improvements; Table 1 shows transfer.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Accuracy gains plateau beyond ~6 million samples, indicating diminishing returns",
    376       "evidence": "Figure 2 caption: 'beyond approximately 6 million samples, the accuracy gains begin to plateau.'",
    377       "supported": "strong"
    378     }
    379   ],
    380   "methodology_tags": [
    381     "empirical",
    382     "benchmark-eval"
    383   ],
    384   "key_findings": "Genetic-Instruct synthesizes 7.5M coding instruction-code pairs using an evolutionary algorithm combining mutation and crossover operations guided by LLM-based fitness evaluation. Fine-tuned language models achieve 69.7% average accuracy across four Python code benchmarks (HumanEval, MBPP, HE+, MBPP+), outperforming comparable synthetic generation baselines (WizardCoder, Self-Instruct, OSS-Instruct) and public coding datasets. The method scales effectively from small seed sets (512 Tiger-Leetcode questions) and parallelizes across colonies, though diminishing returns emerge beyond 6M samples.",
    385   "red_flags": [
    386     {
    387       "flag": "No statistical significance testing",
    388       "detail": "All improvements reported as point estimates without confidence intervals, p-values, or variance measures. Reported gains (e.g., 69.7% vs 65.9% = 3.8pp) lack statistical rigor; unclear if differences exceed noise."
    389     },
    390     {
    391       "flag": "Missing limitations section",
    392       "detail": "No dedicated discussion of scope boundaries, threats to validity, or generalization limits. Paper assumes findings universally applicable to 'code generation' without caveats."
    393     },
    394     {
    395       "flag": "Incomplete conflicts-of-interest disclosure",
    396       "detail": "All authors are NVIDIA employees evaluating NVIDIA-adjacent techniques. Affiliations listed but no CoI statement. NVIDIA has direct incentive for positive results."
    397     },
    398     {
    399       "flag": "No computational cost analysis",
    400       "detail": "Wall-clock time, GPU-hours, or computational budget not reported. Makes practical reproducibility and adoption difficult."
    401     },
    402     {
    403       "flag": "Decontamination impact not quantified",
    404       "detail": "Two-stage decontamination process described but no metrics on how many samples removed, percent of final dataset affected, or coverage of benchmark test sets."
    405     },
    406     {
    407       "flag": "Weak ablation gains",
    408       "detail": "Improvement from Mutation-Only (66.6%) to Full Genetic-Instruct (68.0%) is only ~1.4pp. Statistical significance unknown; could be noise."
    409     },
    410     {
    411       "flag": "No per-category performance breakdown",
    412       "detail": "No analysis of which problem types, difficulty levels, algorithm types, or code patterns the method excels at or struggles with."
    413     },
    414     {
    415       "flag": "Seed dataset bias not discussed",
    416       "detail": "All experiments use Tiger-Leetcode (interview-style coding) as seed. Generalization to other seed distributions or coding domains not explored."
    417     },
    418     {
    419       "flag": "No failure case analysis",
    420       "detail": "No examples of instructions that failed filtering, code that didn't parse, or model predictions that were incorrect."
    421     },
    422     {
    423       "flag": "Evaluation limited to Python benchmarks",
    424       "detail": "No evaluation on multi-language code, real-world programming tasks, or non-benchmark domains. Generalization beyond standard benchmarks unclear."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    430       "authors": "Wang et al.",
    431       "year": 2023,
    432       "relevance": "Direct predecessor; uses LLMs to generate instructions from seed set via few-shot examples (corresponds to the crossover operation in Genetic-Instruct)"
    433     },
    434     {
    435       "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
    436       "authors": "Xu et al.",
    437       "year": 2024,
    438       "relevance": "Introduces instruction mutation operations to increase complexity; adapted by Genetic-Instruct"
    439     },
    440     {
    441       "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct",
    442       "authors": "Luo et al.",
    443       "year": 2024,
    444       "relevance": "Adapts Evol-Instruct to code domain; Genetic-Instruct is a direct competitor and comparison baseline"
    445     },
    446     {
    447       "title": "Magicoder: Empowering Code Generation with OSS-Instruct",
    448       "authors": "Wei et al.",
    449       "year": 2024,
    450       "relevance": "Alternative synthetic generation approach using code snippets as seed instead of instructions; included in comparative evaluation"
    451     },
    452     {
    453       "title": "InverseCoder: Unleashing the Power of Instruction-Tuned Code LLMs with Inverse-Instruct",
    454       "authors": "Wu et al.",
    455       "year": 2024,
    456       "relevance": "Code-to-instruction inversion approach; baseline comparison in Table 1"
    457     },
    458     {
    459       "title": "Evaluating Large Language Models Trained on Code",
    460       "authors": "Chen et al.",
    461       "year": 2021,
    462       "relevance": "HumanEval benchmark used for evaluation in this paper"
    463     },
    464     {
    465       "title": "Program Synthesis with Large Language Models",
    466       "authors": "Austin et al.",
    467       "year": 2021,
    468       "relevance": "MBPP benchmark paper; used for evaluation"
    469     },
    470     {
    471       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    472       "authors": "Liu et al.",
    473       "year": 2023,
    474       "relevance": "HumanEval+ and MBPP+ extended benchmarks with additional test cases; used for rigorous evaluation"
    475     },
    476     {
    477       "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples",
    478       "authors": "Yang et al.",
    479       "year": 2023,
    480       "relevance": "Decontamination methodology (embedding similarity + paraphrase detection) adopted in Section 3.6"
    481     },
    482     {
    483       "title": "The Llama 3 Herd of Models",
    484       "authors": "Grattafiori et al.",
    485       "year": 2024,
    486       "relevance": "Base model (Llama3.1-8B) used for fine-tuning and evaluation"
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 2,
    492       "justification": "Public dataset release enables practitioner adoption; method doesn't require proprietary models. However, computational cost to reproduce generation pipeline is not disclosed."
    493     },
    494     "surprise_contrarian": {
    495       "score": 1,
    496       "justification": "Core finding (mutation + crossover > either alone) is expected from evolutionary algorithm theory. Weaker models working is only marginally surprising. Prior work (Self-Instruct, WizardCoder) already showed synthetic data improves coding."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "Pure data generation for code models; no AI safety concerns, alignment risks, or societal impact raised."
    501     },
    502     "drama_conflict": {
    503       "score": 0,
    504       "justification": "Straightforward benchmarking paper with no controversy, conflict angle, or contested claims."
    505     },
    506     "demo_ability": {
    507       "score": 2,
    508       "justification": "Dataset publicly downloadable on Hugging Face, enabling practitioners to fine-tune. Generation pipeline code not released (only prompts), limiting full reproducibility."
    509     },
    510     "brand_recognition": {
    511       "score": 2,
    512       "justification": "NVIDIA is a major AI lab; Mixtral, Qwen, Llama are well-known models. Adds credibility but NVIDIA's COI may detract for some readers."
    513     }
    514   },
    515   "hn_data": {
    516     "threads": [
    517       {
    518         "hn_id": "41204287",
    519         "title": "Apple Intelligence Foundation Language Models",
    520         "points": 56,
    521         "comments": 23,
    522         "url": "https://news.ycombinator.com/item?id=41204287",
    523         "created_at": "2024-08-09T18:38:35Z"
    524       },
    525       {
    526         "hn_id": "40570738",
    527         "title": "Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-Modal LLMs",
    528         "points": 2,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=40570738",
    531         "created_at": "2024-06-04T04:38:36Z"
    532       },
    533       {
    534         "hn_id": "40200892",
    535         "title": "Fine Tuning LLM for Enterprise: Practical Guidelines and Recommendations",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=40200892",
    539         "created_at": "2024-04-29T16:53:53Z"
    540       }
    541     ],
    542     "top_points": 56,
    543     "total_points": 60,
    544     "total_comments": 23
    545   }
    546 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs