scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29951B)
      1 {
      2   "paper": {
      3     "title": "Code Llama: Open Foundation Models for Code",
      4     "authors": [
      5       "Baptiste Rozière",
      6       "Jonas Gehring",
      7       "Fabian Gloeckle",
      8       "Sten Sootla",
      9       "Itai Gat",
     10       "Xiaoqing Ellen Tan",
     11       "Yossi Adi",
     12       "Jingyu Liu",
     13       "Romain Sauvestre",
     14       "Tal Remez",
     15       "Jérémy Rapin",
     16       "Artyom Kozhevnikov",
     17       "Ivan Evtimov",
     18       "Joanna Bitton",
     19       "Manish Bhatt",
     20       "Cristian Canton Ferrer",
     21       "Aaron Grattafiori",
     22       "Wenhan Xiong",
     23       "Alexandre Défossez",
     24       "Jade Copet",
     25       "Faisal Azhar",
     26       "Hugo Touvron",
     27       "Louis Martin",
     28       "Nicolas Usunier",
     29       "Thomas Scialom",
     30       "Gabriel Synnaeve"
     31     ],
     32     "year": 2023,
     33     "venue": "arXiv",
     34     "arxiv_id": "2308.12950"
     35   },
     36   "checklist": {
     37     "artifacts": {
     38       "code_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper provides a GitHub repository URL: https://github.com/facebookresearch/codellama (footnote 1, Section 1). They also state: 'We provide inference code for both completion and infilling models in the accompanying repository.'"
     42       },
     43       "data_released": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The training dataset is described (Table 1) as publicly available code with sampling proportions, but the actual curated dataset is not released. The paper states the dataset consists of 'publicly available code' but does not provide a download link or archive for their specific data mix."
     47       },
     48       "environment_specified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The paper describes optimizer settings and training details but not the software environment needed to reproduce the work."
     52       },
     53       "reproduction_instructions": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper provides inference code via the GitHub repository, but no step-by-step instructions for reproducing the training pipeline (data preparation, training scripts, fine-tuning stages) are provided. The training details in Section 2.6 are at a high level."
     57       }
     58     },
     59     "statistical_methodology": {
     60       "confidence_intervals_or_error_bars": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "All results are reported as point estimates (e.g., '67% on HumanEval'). No confidence intervals, error bars, or uncertainty quantification is provided for any of the benchmark results in Tables 2-9."
     64       },
     65       "significance_tests": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper makes numerous comparative claims (e.g., 'Code Llama - Python 7B outperforms Llama 2 70B') based solely on comparing point estimates without any statistical significance tests."
     69       },
     70       "effect_sizes_reported": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper reports percentage point improvements with baseline context. For example, Section 3.1.1: 'training on 100B extra tokens of a Python-heavy data mix leads to significant gains... between 4.3% points and 8.3% points in HumanEval pass@1 and between 1.2% points and 6.4% points in MBPP pass@1.' Tables provide full baseline and model scores enabling effect size computation."
     74       },
     75       "sample_size_justified": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No justification is provided for the sizes of the evaluation benchmarks used (HumanEval has 164 problems, MBPP has ~500 problems). There is no power analysis or discussion of whether these sample sizes are sufficient for the claims being made."
     79       },
     80       "variance_reported": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No standard deviation, variance, or spread measures are reported across runs. Pass@1 scores use greedy decoding (single deterministic run). Pass@10 and pass@100 use sampling but only report unbiased estimators without variance across independent evaluation runs."
     84       }
     85     },
     86     "evaluation_design": {
     87       "baselines_included": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Extensive baselines are included: Llama 2 (all sizes), StarCoder, CodeGen-Multi, CodeGeeX, code-cushman-001, GPT-3.5, GPT-4, PaLM, PaLM-Coder, PaLM 2-S, Falcon, MPT, and more. See Tables 2, 4, 9."
     91       },
     92       "baselines_contemporary": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Baselines include contemporary models as of 2023: GPT-4 (2023), PaLM 2-S (2023), StarCoder (2023), GPT-3.5/ChatGPT. These represent the state of the art at the time of publication."
     96       },
     97       "ablation_study": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Extensive ablation studies are presented: (i) fine-tuning Llama 2 vs. training from scratch (Section 3.4.1, Figure 5b), (ii) impact of infilling training on downstream tasks (Table 5), (iii) effect of long context fine-tuning (Tables 7, 10), (iv) self-instruct data value (Table 8), (v) temperature effects on pass@k (Figure 6), (vi) RoPE frequency ablations (Table 18)."
    101       },
    102       "multiple_metrics": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Multiple metrics are used: pass@1, pass@10, pass@100 across HumanEval, MBPP, APPS, MultiPL-E. Long context evaluations use perplexity, exact match, and BLEU (Table 7). Safety evaluations use TruthfulQA, ToxiGen, and BOLD (Table 9)."
    106       },
    107       "human_evaluation": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Red teaming with 25 Meta employees including domain experts in responsible AI, malware development, and offensive security engineering (Section 4, 'Red teaming'). Participants provided qualitative assessments and generated adversarial prompts. This constitutes human evaluation of the model's safety outputs."
    111       },
    112       "held_out_test_set": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "HumanEval, MBPP, APPS, and MultiPL-E are standard held-out test benchmarks. The paper explicitly notes that the LCC data 'are included in our code training data' (footnote 2, Section 3.3), showing awareness of the distinction, and resamples a separate 'LCC-balanced' set."
    116       },
    117       "per_category_breakdown": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Breakdowns are provided: per-language results in MultiPL-E (Table 4), per-difficulty in APPS (Table 3: introductory/interview/competition), per-demographic in safety benchmarks (Tables 20-25), per-file-length in LCC (Figure 9b), per-model-size throughout."
    121       },
    122       "failure_cases_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Several failure modes are discussed: (1) degradation from LCFT on short sequences (Section 3.3), (2) false refusals in safety (Section 4, 'False refusals'), (3) SPM format failure in random span infilling (Appendix E), (4) 7B model's poor retrieval for keys at the beginning of prompts (Section 3.3, Table 17)."
    126       },
    127       "negative_results_reported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Multiple negative results: (1) LCFT 'slightly hurts performance on standard code synthesis benchmarks' with quantified decreases (Section 3.3), (2) infilling training incurs 'a small cost on HumanEval and MBPP pass@k metrics' (Table 5), (3) instruction fine-tuning, through which the Code Llama - Instruct models 'somewhat distillate Llama 2-Chat', comes at the cost of code generation performance (Section 6), (4) Code Llama - Python performs worse than Code Llama on introductory APPS problems (Section 3.1.1)."
    131       }
    132     },
    133     "claims_and_evidence": {
    134       "abstract_claims_supported": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Abstract claims are supported: 'state-of-the-art performance among open models' is supported by Tables 2 and 4; 'scores of up to 67% and 65% on HumanEval and MBPP' matches Table 2 (Code Llama - Instruct 70B: 67.8% HumanEval, Code Llama - Python 70B: 65.6% MBPP); 'Code Llama - Python 7B outperforms Llama 2 70B' is supported in Table 2."
    138       },
    139       "causal_claims_justified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Causal claims about the effect of training stages are supported by controlled ablations: FIM vs. no FIM (Table 5), LCFT vs. no LCFT (Tables 7, 10), self-instruct vs. no self-instruct (Table 8), Llama 2 init vs. training from scratch (Figure 5b). Each ablation manipulates a single variable while holding others constant."
    143       },
    144       "generalization_bounded": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The title 'Open Foundation Models for Code' and abstract claims of 'state-of-the-art performance' are broad. While the paper tests on multiple benchmarks and languages, it does not bound claims to the tested settings. The abstract says 'all our models outperform every other publicly available model on MultiPL-E' without qualifying which languages/tasks were tested. The paper does not discuss limitations of benchmark-based evaluation for real-world code generation."
    148       },
    149       "alternative_explanations_discussed": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether improvements from Llama 2 initialization could be due to data overlap between Llama 2's training data and the code benchmarks, or whether the self-instruct gains could be partially explained by the filtering mechanism rather than the data quality."
    153       }
    154     },
    155     "setup_transparency": {
    156       "model_versions_specified": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The paper specifies exact model sizes (7B, 13B, 34B, 70B), the base model (Llama 2), and for external comparisons includes specific versions: 'gpt-3.5-turbo-16k-0613' (Section 3.3), 'code-cushman-001' (Table 2). The Code Llama models themselves are the subject of the paper with clear specifications."
    160       },
    161       "prompts_provided": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Full prompt templates are provided in Appendix H: self-training question generation (Figure 10), unit test generation (Figure 11), solution generation (Figure 12), MBPP zero-shot evaluation (Figure 13), and APPS evaluation prompts (Figure 14). While these contain placeholders, the fill values are described (e.g., questions from the generation step)."
    165       },
    166       "hyperparameters_reported": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Detailed hyperparameters in Section 2.6: optimizer (AdamW, beta1=0.9, beta2=0.95), cosine schedule with 1000 warmup steps, batch size 4M tokens, learning rates (3e-4, 1.5e-4), LCFT learning rate 2e-5, RoPE base period theta=10^6. Evaluation: temperature, nucleus sampling p=0.95 specified per benchmark."
    170       },
    171       "scaffolding_described": {
    172         "applies": false,
    173         "answer": false,
    174         "justification": "No agentic scaffolding is used. Code Llama models are standard language models for code generation and infilling without tool use, feedback loops, or agent architectures."
    175       },
    176       "data_preprocessing_documented": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 2.2 documents the dataset: near-deduplicated publicly available code (85%), natural language related to code (8%), natural language (7%), with BPE tokenization. Table 1 provides sampling proportions, epochs, and disk sizes. The self-instruct pipeline (Section 2.5) documents each step: 62,000 questions generated, deduped to ~52,000, filtered by unit test execution to ~14,000 triplets."
    180       }
    181     },
    182     "limitations_and_scope": {
    183       "limitations_section_present": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The Discussion (Section 6) mentions some tradeoffs (LCFT and infilling cost on benchmarks, safety vs. coding performance) but these are framed as design decisions rather than limitations."
    187       },
    188       "threats_to_validity_specific": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as benchmark overfitting, data contamination, the representativeness of HumanEval/MBPP for real-world coding tasks, or limitations of automated safety benchmarks."
    192       },
    193       "scope_boundaries_stated": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of what populations, tasks, or settings are excluded from the claims. The Discussion (Section 6) mentions 'Further work is needed for LLMs to understand context and nuance in their instructions' but does not systematically bound the scope of results."
    197       }
    198     },
    199     "data_integrity": {
    200       "raw_data_available": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "Neither the training data nor the raw evaluation outputs are made available. Only aggregated benchmark scores are reported."
    204       },
    205       "data_collection_described": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 2.2 describes the training data collection: publicly available code (near-deduplicated), natural language datasets related to code, sampling proportions, and disk sizes (Table 1). The self-instruct pipeline (Section 2.5) describes generation of 62,000 questions using Llama 2 70B, deduplication, unit test generation and filtering."
    209       },
    210       "recruitment_methods_described": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "For the red teaming exercise (25 Meta employees), the paper does not describe how participants were recruited or selected beyond stating they include 'domain experts in responsible AI, malware development, and offensive security engineering.' Selection criteria and potential selection bias are not discussed."
    214       },
    215       "data_pipeline_documented": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The training pipeline is documented across Sections 2.1-2.6 and Figures 2, 8: Llama 2 base → code training (500B/1T tokens) → infilling training → long context fine-tuning → Python specialization and/or instruction fine-tuning. The self-instruct pipeline documents each stage with counts (62,000 → ~52,000 → ~14,000)."
    219       }
    220     },
    221     "conflicts_of_interest": {
    222       "funding_disclosed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No explicit funding disclosure or acknowledgment of funding sources is present. The paper is from Meta AI but does not include a standard funding statement."
    226       },
    227       "affiliations_disclosed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "All authors are listed with 'Meta AI' affiliation on the first page. One author has a dual affiliation with 'CERMICS École des Ponts ParisTech' and another with 'Hebrew University of Jerusalem.'"
    231       },
    232       "funder_independent_of_outcome": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "Meta AI funded this research and has a direct commercial interest in Code Llama performing well, as it is released under Meta's Llama 2 license for both research and commercial use. The funder is not independent of the outcome."
    236       },
    237       "financial_interests_declared": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No competing interests or financial interests statement is present in the paper. As Meta employees releasing a Meta product, the authors clearly have financial interests related to the findings, but these are not declared."
    241       }
    242     },
    243     "contamination": {
    244       "training_cutoff_stated": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No explicit training data cutoff date is stated for either Llama 2 or Code Llama. The paper does not specify when the training data was collected."
    248       },
    249       "train_test_overlap_discussed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "The paper explicitly notes that 'LCC data points are included in our code training data' (footnote 2, Section 3.3). For the CodeXGLUE docstring-generation infilling benchmark, it mentions potential overlap: 'both our models and the models from Allal et al. (2023) and Li et al. (2023) have been trained on datasets that may have an overlap with this evaluation dataset' (Appendix E). However, no analysis of overlap with HumanEval or MBPP benchmarks is provided."
    253       },
    254       "benchmark_contamination_addressed": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "While the paper notes the LCC overlap, it does not address contamination risk for HumanEval (published 2021), MBPP (published 2021), or APPS (published 2021) — all publicly available before the model's likely training cutoff. The paper uses these as primary benchmarks without analyzing whether solutions appeared in the training data."
    258       }
    259     },
    260     "human_studies": {
    261       "pre_registered": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "The red teaming exercise is a qualitative security evaluation, not a formal human subjects study. Pre-registration does not apply to adversarial red teaming."
    265       },
    266       "irb_or_ethics_approval": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "The red teaming was conducted by Meta employees as part of responsible AI evaluation, not as a formal human subjects study requiring IRB approval."
    270       },
    271       "demographics_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "The red teaming exercise involved Meta employees, not external human participants in a research study. Demographics are not applicable."
    275       },
    276       "inclusion_exclusion_criteria": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The red teaming exercise was an internal security evaluation with domain experts, not a human subjects study requiring formal inclusion/exclusion criteria."
    280       },
    281       "randomization_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Not applicable. The red teaming is not an experimental study with conditions requiring randomization."
    285       },
    286       "blinding_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Not applicable. The red teaming is not an experimental study requiring blinding."
    290       },
    291       "attrition_reported": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "Not applicable. The red teaming is not a longitudinal human subjects study."
    295       }
    296     },
    297     "cost_and_practicality": {
    298       "inference_cost_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No inference cost, latency, or tokens-per-second metrics are reported for any model size. The paper does not discuss the practical cost of running Code Llama models."
    302       },
    303       "compute_budget_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No GPU hours, total compute cost, or hardware specifications are provided. The paper states training token counts (500B, 1T) but does not report the computational resources required."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "Code Llama reaches state-of-the-art performance among open models on several code benchmarks, with scores of up to 67% and 65% on HumanEval and MBPP, respectively.",
    313       "evidence": "Table 2 shows Code Llama - Instruct 70B achieves 67.8% on HumanEval pass@1 and Code Llama - Python 70B achieves 65.6% on MBPP pass@1, exceeding all other open models listed (StarCoder, CodeGen-Multi, etc.).",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Code Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP.",
    318       "evidence": "Table 2: Code Llama - Python 7B achieves 38.4% HumanEval pass@1 vs. Llama 2 70B at 30.5%, and 47.6% MBPP pass@1 vs. Llama 2 70B at 45.4%.",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "All Code Llama models outperform every other publicly available model on MultiPL-E.",
    323       "evidence": "Table 4 shows Code Llama 7B (average 26.3%) outperforms StarCoder Python 15.5B (25.3%), CodeGen-Multi 16B (13.4%), and other baselines on the multilingual average. Larger Code Llama models show even larger margins.",
    324       "supported": "strong"
    325     },
    326     {
    327       "claim": "Infilling training incurs only a small cost on autoregressive code generation performance.",
    328       "evidence": "Table 5: FIM training causes 0.6pp average decrease for 7B and 1.1pp for 13B across HumanEval and MBPP metrics, with no cost on autoregressive test set loss.",
    329       "supported": "strong"
    330     },
    331     {
    332       "claim": "Long context fine-tuning enables stable behavior on sequences of up to 100,000 tokens.",
    333       "evidence": "Figure 4a shows perplexity decreasing steadily beyond 16K tokens up to 100K with only slight increase after. Table 7 shows LCFT models outperform non-LCFT models on long-context code completion. However, 'stable behavior' is a soft claim — key retrieval accuracy degrades beyond 16K tokens (Table 17).",
    334       "supported": "moderate"
    335     },
    336     {
    337       "claim": "Initializing from Llama 2 outperforms training from scratch on code for a given compute budget.",
    338       "evidence": "Section 3.4.1 and Figure 5b: the scratch-trained 7B model at 500B tokens has loss equivalent to Code Llama 7B at ~250B tokens, showing 'the loss of the model trained from scratch is equal to the loss of Code Llama 7B at about half of its training (with 240B less training tokens).'",
    339       "supported": "strong"
    340     },
    341     {
    342       "claim": "Self-instruct data improves coding benchmark scores and training reliability for instruction-following models.",
    343       "evidence": "Table 8: self-instruct improves HumanEval pass@1 from 30.5% to 34.8% (7B) and 40.9% to 42.7% (13B). For MBPP zero-shot, it dramatically improves the 13B model from 20.4% to 40.2%, which the paper attributes to more reliable format learning.",
    344       "supported": "strong"
    345     },
    346     {
    347       "claim": "Code Llama - Instruct is safer than ChatGPT based on safety reward model scores.",
    348       "evidence": "Figure 7 shows the KDE plot of Llama 2 70B safety reward model scores where Code Llama - Instruct models have more weight in the safer part of the distribution compared to GPT-3.5 Turbo. However, this is based on a single reward model from Meta (potential bias) on a specific set of red-team prompts.",
    349       "supported": "moderate"
    350     }
    351   ],
    352   "methodology_tags": [
    353     "benchmark-eval"
    354   ],
    355   "key_findings": "Code Llama is a family of code-specialized LLMs (7B to 70B parameters) derived from Llama 2, achieving state-of-the-art performance among open models on HumanEval (67.8%), MBPP (65.6%), and MultiPL-E benchmarks. The paper demonstrates that initializing from a general-purpose LLM and progressively specializing through code training, infilling, long context fine-tuning, and instruction tuning yields large gains, with infilling and LCFT incurring only modest costs on standard benchmarks. Notably, Code Llama - Python 7B outperforms Llama 2 70B on Python code generation, demonstrating the value of domain specialization over scale alone.",
    356   "red_flags": [
    357     {
    358       "flag": "Company evaluating own product",
    359       "detail": "All authors are Meta AI employees evaluating Meta's Code Llama models. The paper was released alongside the commercial product launch. While affiliations are disclosed, no competing interests statement is included, and the safety comparison (Figure 7) uses Meta's own Llama 2 safety reward model as the evaluation metric."
    360     },
    361     {
    362       "flag": "No uncertainty quantification",
    363       "detail": "All benchmark results are reported as point estimates without confidence intervals, error bars, or variance across runs. For pass@1 with greedy decoding, this is a single deterministic run. For sampling-based metrics (pass@10, pass@100), the unbiased estimator is computed but variance of the estimator is not reported. Small differences between models could be within noise."
    364     },
    365     {
    366       "flag": "Benchmark contamination risk unaddressed",
    367       "detail": "HumanEval, MBPP, and APPS were all publicly available before Code Llama's training. The paper explicitly notes LCC overlap with training data (footnote 2) but does not analyze whether HumanEval or MBPP solutions appeared in the code training corpus. The paper acknowledges potential overlap for the CodeXGLUE infilling benchmark (Appendix E) but takes no action."
    368     },
    369     {
    370       "flag": "No limitations section",
    371       "detail": "The paper lacks a dedicated limitations section. Known tradeoffs (LCFT cost, infilling cost) are discussed as engineering decisions rather than limitations. No discussion of benchmark representativeness, potential data contamination, or the gap between benchmark performance and real-world coding utility."
    372     },
    373     {
    374       "flag": "No compute budget disclosed",
    375       "detail": "Despite training 12 models across 4 sizes with multiple training stages totaling hundreds of billions to a trillion tokens, no GPU hours, hardware specifications, or energy consumption figures are reported. This makes the work unreproducible for resource-constrained researchers."
    376     }
    377   ],
    378   "cited_papers": [
    379     {
    380       "title": "Evaluating large language models trained on code",
    381       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    382       "year": 2021,
    383       "arxiv_id": "2107.03374",
    384       "relevance": "Introduces HumanEval benchmark and Codex, the foundational code LLM paper and primary evaluation benchmark used in this work."
    385     },
    386     {
    387       "title": "Llama 2: Open foundation and fine-tuned chat models",
    388       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    389       "year": 2023,
    390       "arxiv_id": "2307.09288",
    391       "relevance": "Base model that Code Llama is built upon; provides the foundation model, instruction tuning data, and safety alignment approach."
    392     },
    393     {
    394       "title": "StarCoder: May the source be with you!",
    395       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    396       "year": 2023,
    397       "arxiv_id": "2305.06161",
    398       "relevance": "Primary open-source code LLM baseline; demonstrates open data curation practices for code models."
    399     },
    400     {
    401       "title": "Competition-level code generation with AlphaCode",
    402       "authors": ["Yujia Li", "David H. Choi", "Junyoung Chung"],
    403       "year": 2022,
    404       "arxiv_id": "2203.07814",
    405       "relevance": "Key code generation baseline using massive sampling and filtering; evaluated on competitive programming tasks."
    406     },
    407     {
    408       "title": "GPT-4 technical report",
    409       "authors": ["OpenAI"],
    410       "year": 2023,
    411       "arxiv_id": "2303.08774",
    412       "relevance": "State-of-the-art closed-source LLM baseline for code generation and general language understanding."
    413     },
    414     {
    415       "title": "InCoder: A generative model for code infilling and synthesis",
    416       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"],
    417       "year": 2023,
    418       "relevance": "Introduces causal masking for code infilling, a key training technique adopted in Code Llama."
    419     },
    420     {
    421       "title": "Efficient training of language models to fill in the middle",
    422       "authors": ["Mohammad Bavarian", "Heewoo Jun", "Nikolas Tezak"],
    423       "year": 2022,
    424       "arxiv_id": "2207.14255",
    425       "relevance": "Foundational fill-in-the-middle (FIM) training methodology directly used in Code Llama's infilling objective."
    426     },
    427     {
    428       "title": "Extending context window of large language models via positional interpolation",
    429       "authors": ["Shouyuan Chen", "Sherman Wong", "Liangjian Chen", "Yuandong Tian"],
    430       "year": 2023,
    431       "arxiv_id": "2306.15595",
    432       "relevance": "Related approach to extending LLM context length via RoPE modification, compared against Code Llama's approach."
    433     },
    434     {
    435       "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation",
    436       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    437       "year": 2023,
    438       "relevance": "Multilingual code generation benchmark used as a primary evaluation tool for Code Llama across 7 programming languages."
    439     },
    440     {
    441       "title": "Program synthesis with large language models",
    442       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    443       "year": 2021,
    444       "arxiv_id": "2108.07732",
    445       "relevance": "Introduces MBPP benchmark, one of the two primary Python code generation benchmarks used to evaluate Code Llama."
    446     },
    447     {
    448       "title": "Textbooks are all you need",
    449       "authors": ["Suriya Gunasekar", "Yi Zhang", "Jyoti Aneja"],
    450       "year": 2023,
    451       "arxiv_id": "2306.11644",
    452       "relevance": "Demonstrates that high-quality filtered training data can match larger models on code benchmarks; contrasted with Code Llama's approach."
    453     },
    454     {
    455       "title": "Training language models to follow instructions with human feedback",
    456       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    457       "year": 2022,
    458       "relevance": "RLHF methodology that underlies the instruction fine-tuning and safety alignment approach inherited from Llama 2."
    459     }
    460   ]
    461 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs