scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26570B)
      1 {
      2   "paper": {
      3     "title": "Genetic Instruct: Scaling up Synthetic Generation of Coding Instructions for Large Language Models",
      4     "authors": [
      5       "Somshubra Majumdar",
      6       "Vahid Noroozi",
      7       "Mehrzad Samadi",
      8       "Sean Narenthiran",
      9       "Aleksander Ficek",
     10       "Wasi Uddin Ahmad",
     11       "Jocelyn Huang",
     12       "Jagadeesh Balam",
     13       "Boris Ginsburg"
     14     ],
     15     "year": 2024,
     16     "venue": "Annual Meeting of the Association for Computational Linguistics",
     17     "arxiv_id": "2407.21077",
     18     "doi": "10.48550/arXiv.2407.21077"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "Genetic-Instruct generates synthetic coding instructions using evolutionary crossover and mutation operations starting from 512 seed instructions, producing 7.5M instruction-code pairs. Models fine-tuned on this data achieve 69.7% average accuracy across HumanEval/MBPP benchmarks, outperforming alternative synthetic generation methods (best baseline 66.8%) and publicly available datasets. Combining mutation and crossover yields better results than either alone (68.0% vs 66.8% and 66.6%). Performance shows diminishing returns beyond ~6M samples, and even smaller generator models (Qwen-7B) can produce competitive synthetic data.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No source code repository is mentioned. The paper releases the dataset on HuggingFace but provides no code for the Genetic-Instruct pipeline itself."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The 7.5M synthetic dataset is publicly released at https://huggingface.co/datasets/nvidia/OpenCodeGeneticInstruct, mentioned in Section 1 and the conclusion."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions NeMo framework, NeMo Aligner, vLLM, BF16 precision, and tensor parallelism (Section 4.1) but provides no requirements.txt, Dockerfile, or specific library versions sufficient to recreate the environment."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The algorithm is described in Section 3 and Algorithm 1, but there are no runnable scripts or README-style instructions."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '69.7%') with no confidence intervals, error bars, or ± notation."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims 'significant improvement' and that models 'consistently outperform' baselines, but no statistical significance tests (p-values, t-tests, etc.) are reported."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1-3 report absolute accuracy numbers for all methods, providing baseline context. E.g., Genetic-Instruct achieves 69.7% vs best public dataset at 62.9% (Table 1), allowing readers to compute effect sizes."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper gives no justification for the choice of 512 seed instructions, for the 4M and 7.5M sample counts, or for the benchmark sizes used; these values are stated without rationale."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No standard deviation, variance across runs, or any spread measure is reported. All results appear to be single-run numbers."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 1 compares against WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT, and several public datasets (Code Parrot Apps, TACO, OpenCoder, Code Alpaca), plus Llama 3.1 8B Instruct."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include WizardCoder (2024), OSS-Instruct/Magicoder (2024), INVERSE-INSTRUCT (2024), and OpenCoder (2024) — all contemporary with the paper."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 2 ablates crossover-only vs mutation-only vs combined. Table 3 ablates the effect of different generator models (Mixtral-8x22B, Mixtral-8x7B, Qwen-32B, Qwen-7B)."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are reported on four benchmarks: HumanEval, MBPP, HumanEval+, and MBPP+ (Tables 1-3)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation of generated instructions or code quality. All evaluation is automated via benchmark pass rates. Human evaluation of the quality/diversity of generated synthetic instructions would be relevant."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Evaluation uses standard held-out benchmarks (HumanEval, MBPP, HumanEval+, MBPP+) that are separate from the synthetic training data, with decontamination applied (Section 3.6)."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 1-3 provide per-benchmark breakdowns (HumanEval, MBPP, HumanEval+, MBPP+) rather than just aggregate averages."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No discussion of failure cases — what types of instructions Genetic-Instruct struggles to generate, what kinds of code solutions fail the Judge-LLM, or where fine-tuned models break down."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Figure 2 reports diminishing returns beyond ~6M samples, showing the limits of scaling. The paper also reports that INVERSE-INSTRUCT performs poorly (41.1% average), and that mutation-only slightly outperforms the combined approach on HumanEval."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims of 'significant improvement,' 'highly parallelizable,' and 'effective even with small seed data and weaker generator models' are supported by Table 1 (improvement over baselines), Section 3.5 (parallelization), and Table 3 (Qwen-7B competitive with Qwen-32B)."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The ablation study (Table 2) uses controlled single-variable manipulation — same base model, same data size, same generator, varying only the algorithm. This is adequate for causal claims about the contribution of mutation and crossover operations."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims 'Coding Instructions for Large Language Models' broadly, but evaluation is exclusively on Python benchmarks (HumanEval/MBPP). Section 4 states 'our evaluation focuses exclusively on Python coding benchmarks' but the abstract and title do not bound the claims to Python."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No discussion of alternative explanations. Could the gains be from data volume alone? Are the baseline re-implementations optimal? Could different seed data change the ranking? None of these are addressed."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures pass rates on coding benchmarks and frames results as 'code generation capability' — the benchmarks directly test code generation, so the proxy closely matches the claim. No broader unsupported framing."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Specific model identifiers are provided: Llama3.1-8B-Base, Mixtral-8x22B, Mixtral-8x7B, Qwen2.5-7B-Base, Qwen-32B, Qwen-7B, Meta-Llama-3-70B-Instruct. These are sufficiently specific for open-source models with distinct releases."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt templates are provided in Appendices A-F: mutation prompts (Figure 3), crossover prompt (Figure 4), code generation prompt (Figure 5), fitness/judge prompt (Figure 6), decontamination prompt (Figure 7), and evaluation prompts (Figures 8-9)."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 reports: temperature 1.2 for Instructor-LLM, 1.0 for Coder/Judge-LLM, learning rate 5e-6 decaying to 5e-7, cosine annealing, 3 epochs, nucleus sampling, max sequence length 1024, batch sizes Bm=100, Bc=10, mutation probability Mp=0.5, 20 parallel colonies."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The system is a data generation pipeline with sequential LLM calls (Instructor → Coder → Judge), not an agent with tools, retry logic, or memory."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 3 documents the full pipeline: seed selection from Tiger-Leetcode (512 samples), crossover/mutation operations, AST validation of generated code (Section 3.3), Judge-LLM filtering (Section 3.4), and decontamination (Section 3.6)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No dedicated limitations section exists. The paper proceeds from experiments (Section 4) directly to conclusion (Section 5) with no discussion of limitations or threats to validity."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No specific threats to validity are discussed anywhere in the paper."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "Section 4 briefly states 'our evaluation focuses exclusively on Python coding benchmarks' but does not systematically state scope boundaries — what was not tested, what populations are excluded, or what claims are not being made."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The full 7.5M synthetic dataset is released on HuggingFace (nvidia/OpenCodeGeneticInstruct), enabling independent verification of the training data."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 3 describes the full data generation procedure: starting from 512 Tiger-Leetcode seeds, applying crossover/mutation via Instructor-LLM, generating code via Coder-LLM, filtering via AST checks and Judge-LLM, and decontamination."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public datasets (Tiger-Leetcode seeds, Stack v2 for baselines)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Algorithm 1 and Section 3 detail each pipeline stage: seed selection → crossover/mutation → instruction generation → code generation → AST validation → Judge-LLM filtering → aggregation → decontamination."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding disclosure or acknowledgments section. All authors are NVIDIA employees but no explicit funding statement is provided."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "All nine authors are clearly identified as NVIDIA employees with @nvidia.com email addresses listed in the header."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "NVIDIA has commercial interest in compute-intensive methods succeeding — synthetic data generation at scale requires significant GPU resources. The company benefits from demonstrating the value of large-scale GPU-based data generation."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interest disclosures appear in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The training data cutoff dates for the base models (Llama 3.1, Mixtral, Qwen) are not stated, making it impossible to assess whether the models' pre-training data included benchmark solutions."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 3.6 describes a decontamination process for their synthetic data: embedding-based similarity search + LLM paraphrase detection against all benchmark datasets, with positional bias control."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section 3.6 applies a concrete decontamination methodology (Yang et al., 2023) using Sentence Transformer similarity search and Meta-Llama-3-70B-Instruct paraphrase detection, with dual-direction matching to control positional bias."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost, API costs, or wall-clock time is reported for the generation pipeline despite producing 7.5M samples using multiple LLMs."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No total GPU hours, training time, or hardware specification is provided. The paper mentions tensor parallelism and vLLM but does not quantify the total computational budget."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No results across multiple random seeds are reported. All results appear to be single-run numbers."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameter values are reported (Section 4.1) but no search budget is stated. Batch sizes are justified as 'based on our observation' without specifying how many configurations were tried."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No explanation of how the final configuration was selected. Hyperparameters appear chosen but the selection process is not described."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed at all, so multiple comparison correction is inapplicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors re-implement all baselines (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) themselves. While they justify this for fairness ('same generator model, seed population, base model'), they do not acknowledge the inherent bias of implementing competing methods (Lucic et al., 2018)."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "All methods generate the same number of samples (4M), but the compute cost per method likely differs (mutation requires per-sample LLM calls, crossover batches instructions). This is not analyzed or compared."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether HumanEval/MBPP actually measure 'code generation capability' broadly, or their known limitations (e.g., simple function-level problems, limited language coverage)."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved — the evaluation is direct model fine-tuning and benchmark pass rates."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "HumanEval (2021) and MBPP (2021) were published years before Llama 3.1 and Mixtral were trained. The base models may have seen solutions during pre-training, but this temporal leakage is not discussed."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup provides hints not available in real usage."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No analysis of whether the synthetically generated instructions share structural similarities with benchmark problems, despite using coding questions as seeds."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Section 3.6 applies a concrete leakage detection method: embedding-based similarity search using Sentence Transformer followed by LLM paraphrase detection (Meta-Llama-3-70B-Instruct) with positional bias control via dual-direction matching."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Genetic-Instruct achieves 69.7% average accuracy across coding benchmarks, outperforming alternative synthetic generation methods and public datasets",
    375       "evidence": "Table 1: Genetic-Instruct 7.5M achieves 69.7% average vs best baseline Self-Instruct at 66.8% and best public dataset OpenCoder Stage 1 at 62.9%. Four benchmarks reported individually.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Scaling synthetic data improves model performance with diminishing returns beyond ~6M samples",
    380       "evidence": "Figure 2 shows coding accuracy rising from ~45% baseline to ~69% at 7.5M samples, with plateau visible after ~6M. Six generations of ~1.5M samples each.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Combining mutation and crossover operations yields better performance than either alone",
    385       "evidence": "Table 2: Combined approach achieves 68.0% average vs crossover-only 66.8% and mutation-only 66.6%, all at 4M samples with same base model.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Smaller generator models can produce competitive synthetic data",
    390       "evidence": "Table 3: Qwen-7B as generator yields 66.5% (Llama base) and 76.7% (Qwen base) vs Qwen-32B at 66.9% and 77.3% respectively.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Models trained on Genetic-Instruct data outperform Llama3.1-8B-Instruct",
    395       "evidence": "Table 1: Genetic-Instruct 7.5M achieves 69.7% average vs Llama 3.1 8B Instruct at 65.9%.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or uncertainty quantification",
    402       "detail": "All results are reported as point estimates without confidence intervals, standard deviations, or multiple runs. The margin between Genetic-Instruct (68.0%) and Self-Instruct (66.8%) at 4M samples is 1.2 percentage points — this could easily be within noise for single-run results."
    403     },
    404     {
    405       "flag": "Company evaluating own compute-intensive method",
    406       "detail": "All authors are NVIDIA employees. The proposed method requires massive GPU-scale parallelism (20 colonies, 7.5M LLM-generated samples). NVIDIA has commercial interest in demonstrating the value of compute-intensive approaches but does not acknowledge this conflict."
    407     },
    408     {
    409       "flag": "No limitations section",
    410       "detail": "The paper contains no limitations section, no discussion of threats to validity, and no systematic statement of scope boundaries. This is a significant omission for an empirical paper."
    411     },
    412     {
    413       "flag": "Baselines re-implemented by the authors of the competing method",
    414       "detail": "The authors re-implement all baselines (WizardCoder, Self-Instruct, OSS-Instruct, INVERSE-INSTRUCT) without acknowledging the bias of implementing competing methods. INVERSE-INSTRUCT achieves only 41.1% in their re-implementation — an unusually poor result that warrants scrutiny."
    415     },
    416     {
    417       "flag": "Python-only evaluation with broad claims",
    418       "detail": "The title and abstract claim 'Coding Instructions for Large Language Models' generally, while all evaluation is on Python-only benchmarks (HumanEval/MBPP). The abstract does not bound claims to Python."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Self-instruct: Aligning language models with self-generated instructions",
    424       "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra"],
    425       "year": 2023,
    426       "relevance": "Foundational method for synthetic instruction generation using LLMs, serves as a baseline."
    427     },
    428     {
    429       "title": "WizardCoder: Empowering code large language models with evol-instruct",
    430       "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"],
    431       "year": 2024,
    432       "relevance": "Adapts Evol-Instruct for code generation via instruction mutation, key baseline in the synthetic coding data space."
    433     },
    434     {
    435       "title": "Magicoder: Empowering code generation with oss-instruct",
    436       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    437       "year": 2024,
    438       "relevance": "Generates coding instructions from open-source code snippets (OSS-Instruct), key baseline for code-inspired instruction generation."
    439     },
    440     {
    441       "title": "InverseCoder: Unleashing the power of instruction-tuned code LLMs with inverse-instruct",
    442       "authors": ["Yutong Wu", "Di Huang", "Wenxuan Shi"],
    443       "year": 2024,
    444       "arxiv_id": "2407.05700",
    445       "relevance": "Generates instructions from existing code, representing a code-to-instruction paradigm for synthetic data generation."
    446     },
    447     {
    448       "title": "Evaluating large language models trained on code",
    449       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    450       "year": 2021,
    451       "arxiv_id": "2107.03374",
    452       "relevance": "Introduces HumanEval benchmark for code generation, widely used for evaluating code LLMs."
    453     },
    454     {
    455       "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation",
    456       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang"],
    457       "year": 2023,
    458       "relevance": "Introduces EvalPlus (HumanEval+, MBPP+) with additional test cases for more rigorous code evaluation."
    459     },
    460     {
    461       "title": "OpenCoder: The open cookbook for top-tier code large language models",
    462       "authors": ["Siming Huang", "Tianhao Cheng", "Jason Klein Liu"],
    463       "year": 2024,
    464       "relevance": "Open-source code LLM with public training data, serves as a baseline dataset comparison."
    465     },
    466     {
    467       "title": "Rethinking benchmark and contamination for language models with rephrased samples",
    468       "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng"],
    469       "year": 2023,
    470       "arxiv_id": "2311.04850",
    471       "relevance": "Proposes decontamination methodology adopted by this paper to prevent benchmark leakage in training data."
    472     },
    473     {
    474       "title": "The llama 3 herd of models",
    475       "authors": ["Aaron Grattafiori"],
    476       "year": 2024,
    477       "arxiv_id": "2407.21783",
    478       "relevance": "Base model (Llama 3.1 8B) used for fine-tuning experiments in the primary evaluation."
    479     },
    480     {
    481       "title": "WizardLM: Empowering large pre-trained language models to follow complex instructions",
    482       "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"],
    483       "year": 2024,
    484       "relevance": "Introduces Evol-Instruct with meta-instructions for increasing instruction complexity, foundational to the mutation operation."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs