ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29937B)


      1 {
      2   "paper": {
      3     "title": "AI-Assisted Fixes to Code Review Comments at Scale",
      4     "authors": [
      5       "Chandra Maddila",
      6       "Negar Ghorbani",
      7       "James Saindon",
      8       "Parth Thakkar",
      9       "Vijayaraghavan Murali",
     10       "Rui Abreu",
     11       "Jingyue Shen",
     12       "Brian Zhou",
     13       "Nachiappan Nagappan",
     14       "Peter C. Rigby"
     15     ],
     16     "year": 2025,
     17     "venue": "Transactions on Software Engineering",
     18     "arxiv_id": "2507.13499",
     19     "doi": ""
     20   },
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No source code repository or URL is provided anywhere in the paper. The system (MetaMateCR) is an internal Meta production system with no public release."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Section 1.5 explicitly states: 'Unfortunately, we cannot release the benchmark and data set, and, as a result, we do not consider the benchmark to be a research contribution.' The training data (64k datapoints) and benchmark (206 datapoints) are internal to Meta."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications, dependency lists, or hardware details for model training/inference are provided. The paper mentions Llama 3.1 70B and 8B models but provides no training environment details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No reproduction instructions are provided. The system is proprietary to Meta and the data is internal, making full reproduction impossible. The paper describes methodology at a high level but not step-by-step instructions."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No confidence intervals or error bars are reported for any results. Offline results (Table 3) are point estimates only (e.g., '67.96%' EM). Online results (Table 5) report percentages and p-values but no confidence intervals."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Statistical significance tests are used throughout. Section 4.2.2 specifies Fisher tests for count data (ActionableToApplied, ShownToApplied) and t-tests for continuous metrics (TimeInReview, TimeSpent, WallClock). P-values are reported in Tables 4 and 5 (e.g., 'p = .029', 'p << .001')."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are reported as percentage point (pp) differences with baseline context throughout. For example, 'LargeLSFT ... ActionableToApplied rate (19.75%, p << .001) a 3.62 pp and 9.22 pp improvement over SmallLSFT and GPT-4o, respectively.' Safety trials report percentage changes (e.g., 'reviewers taking over 5% longer')."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No power analysis or sample size justification is provided. The offline benchmark is 206 examples, justified only by comparison to Google's 100 examples (Section 4.1). No justification for the safety trial sample sizes or production experiment durations."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported for any results. Offline metrics are single-run point estimates. Online metrics are aggregate percentages without confidence intervals or variance."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "GPT-4o serves as the baseline model, compared against SmallLSFT and LargeLSFT in both offline (Table 3) and online (Table 5) experiments. Safety trials compare 'No AI' control vs. 'GPT-4o AI suggestions'."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "GPT-4o was state-of-the-art when the research began in 2024 (Section 3.1: 'GPT-4o was one of the best performing models on various coding benchmarks such as MBPP, HumanEval, SWE-bench'). This is a reasonable contemporary baseline."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The progressive model comparison (GPT-4o -> SmallLSFT -> LargeLSFT) functions as an ablation, isolating the effect of fine-tuning (SmallLSFT) and scaling training data (LargeLSFT). The safety trials also ablate the UX design (showing vs. collapsing patches for reviewers)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: offline (Exact Match, Successful Patch Generation), online goal metrics (ActionableToApplied, ShownToApplied), and safety metrics (TimeInReview, TimeSpent, WallClock). Defined in Table 2."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The production experiments constitute a form of human evaluation: real engineers at Meta use the system and their acceptance/rejection of AI patches is the primary outcome metric. The benchmark was also created via human annotation by domain experts (Section 3.3)."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The offline benchmark of 206 datapoints is separate from the training data. Section 4.1 describes it as a dedicated 'test benchmark to evaluate each of the models,' curated independently from the SFT training sets (2.9K and 64K)."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No per-category breakdown is provided. Results are reported as aggregate percentages (EM, SPG, ActionableToApplied, ShownToApplied) without breakdowns by programming language, comment type, patch complexity, or other categories."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Failure cases are discussed. Section 5.1 describes GPT-4o's failure on internal coding patterns (using PHP instead of Hack functions). Section 7.1 reports the safety trial regression where reviewers took 5.5% longer, leading to a rollback and investigation. The ShownToApplied metric for LargeLSFT was not statistically significant (p=.054), which is reported honestly."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Important negative results are reported: Safety Trial Expt. 1 showed a significant regression (5.5% longer TimeInReview, 6.7% longer TimeSpent), leading to a rollback. The ShownToApplied improvement of LargeLSFT over SmallLSFT was not statistically significant (p=.054). These are prominently discussed."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims are well-supported by results. The abstract states: LargeLSFT 'creates an exact match patch 68% of the time outperforming GPT-4o by 9 percentage points' (Table 3 shows 67.96% vs 59.22%), 'ActionableToApplied rate of 19.7%, which is a 9.2pp improvement over GPT-4o' (Table 5 confirms), and the safety trial regression of 'over 5%' (Table 4 shows 5.5%)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims are justified through appropriate study designs. The safety trials use randomized controlled experiments (Section 4.2). The progressive model improvements are supported by controlled comparisons. The paper explicitly references medical trial methodology (Jadad & Enkin, 2007) for their safety trial design."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 8.1 (Generalizability) explicitly states: 'We have evaluated our Code Review Comment Resolution system only on an internal dataset. This may not be representative of general code reviewing activity in other software development scenarios.' The title specifies 'at Scale' rather than making universal claims."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 8.2 (Construct Validity) discusses 'other factors that influence code review comments and their resolution, such as code quality or team dynamics, that were not captured in our analysis.' Section 8.3 (Internal Validity) discusses that the approach only handles actionable comments, not all review comment types. The safety trial investigation identified the specific mechanism (reviewers checking AI patches) for the regression."
    138       }
    139     },
    140     "setup_transparency": {
    141       "model_versions_specified": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "GPT-4o is mentioned without a snapshot date or API version. Llama is described as 'LLaMA 3.1 70B' (Section 3.2) but the iCodeLlama variant is internal without version details. No model checkpoint dates or version identifiers are provided for any model."
    145       },
    146       "prompts_provided": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The GPT-4o baseline is described as 'calling the GPT-4o in a zero-shot fashion' (Section 5.1) but the actual prompt text is not provided. The actionability classifier uses 'a Few-Shot prompt' (Section 4.2.1) but the prompt text is not shown. Only natural language descriptions of prompting strategies are given."
    150       },
    151       "hyperparameters_reported": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "No hyperparameters are reported. Missing: temperature/sampling settings for GPT-4o and Llama models, learning rate for SFT, batch size, number of training epochs, context window sizes (128K and 16K are mentioned but not as hyperparameter specifications)."
    155       },
    156       "scaffolding_described": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 provides detailed description of the MetaMateCR system architecture (Figure 3): Phabricator integration, MetaMateCR backend orchestration, validation system (linter/test/build checks), LLM service, and the actionability classifier pipeline. The workflow from comment to patch to validation to display is well-described."
    160       },
    161       "data_preprocessing_documented": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Data preprocessing is documented. Section 3.3 describes the pipeline: mine random sample of review comment/patch pairs -> exclude diffs with failed lint/test/build -> human annotation with specific criteria (actionability, instruction quality, accuracy) -> yielding 2.9K from 5K. Section 3.4 describes scaling to 64K via a classifier trained on 7.5K labeled examples with precision/recall metrics."
    165       }
    166     },
    167     "limitations_and_scope": {
    168       "limitations_section_present": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 8 'Threats to Validity' is a dedicated section with three subsections: 8.1 Generalizability, 8.2 Construct Validity, and 8.3 Internal Validity."
    172       },
    173       "threats_to_validity_specific": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Threats are specific to this study. Section 8.1: 'specific set of code review tools, internal processes, and company culture might have influenced our experiment.' Section 8.3: 'focused only on fixing code review comments that are addressable by suggesting code changes' and 'models were trained on historical data, which may not be representative of future code software development scenarios.'"
    177       },
    178       "scope_boundaries_stated": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 8.3 explicitly states what the system does NOT handle: 'We did not consider other types of code review comments, such as question-answers, code pointers, knowledge transfer, and general discussions.' Section 8.1 bounds generalizability to Meta's internal setting."
    182       }
    183     },
    184     "data_integrity": {
    185       "raw_data_available": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "Raw data is not available. The paper explicitly states it cannot release the benchmark or dataset because the data is internal to Meta. No supplementary data is provided."
    189       },
    190       "data_collection_described": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Data collection is well-described. Section 3.3 details mining random samples of review comment/patch pairs from Meta's code review data, exclusion of failed diffs, and the human annotation process with specific criteria (actionability, instruction quality, accuracy). Section 3.4 describes scaling via an 8B classifier."
    194       },
    195       "recruitment_methods_described": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper mentions 'two domain experts' for annotation (Section 3.3) and 'contract workers with coding expertise' for the larger dataset (Section 3.4) but does not describe how these annotators were recruited, what their specific qualifications were, or whether selection might introduce bias. For the production experiments, all Meta engineers are implicitly the participants but no recruitment is described."
    199       },
    200       "data_pipeline_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The data pipeline is documented with counts at each stage: 5K random pairs -> pruning -> 2.9K human-labeled (Section 3.3); 7.5K training + 700 test for classifier -> 46K model-classified + 18K human-labeled = 64K total (Section 3.4). The benchmark pipeline: random sample -> pruning -> 206 high-quality datapoints (Section 4.1). The production funnel (Figure 2) documents Universe -> Actionable -> Shown -> Accepted stages."
    204       }
    205     },
    206     "conflicts_of_interest": {
    207       "funding_disclosed": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "No funding disclosure or acknowledgment of funding source is present. The acknowledgements section thanks individuals but does not mention funding. Given all authors are Meta employees, Meta is implicitly the funder but this is not explicitly disclosed."
    211       },
    212       "affiliations_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Author affiliations are clearly stated: 'All the authors are with Meta Platform Inc, Menlo Park, California, USA. P. C. Rigby is also with the Department of Computer Science and Software Engineering, Concordia University.' The affiliation with the company whose product is being evaluated is transparent."
    216       },
    217       "funder_independent_of_outcome": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "Meta is both the employer of all authors and the company deploying MetaMateCR. Meta has a direct financial interest in showing that their internal AI tool improves code review efficiency. The funder is not independent of the outcome."
    221       },
    222       "financial_interests_declared": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No competing interests statement or financial interests declaration is present in the paper. The authors clearly work for Meta and are evaluating Meta's product, but there is no formal declaration of conflicts."
    226       }
    227     },
    228     "contamination": {
    229       "training_cutoff_stated": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No training data cutoff date is stated for GPT-4o or the base Llama 3.1 model. The paper uses GPT-4o as a baseline evaluated on internal data, but does not discuss when GPT-4o's training data was collected."
    233       },
    234       "train_test_overlap_discussed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No discussion of potential train/test overlap. The internal benchmark could overlap with the SFT training data (both mined from Meta's code review data). The paper does not discuss whether the 206 benchmark examples were excluded from the 64K training set, though the separate curation processes suggest they were."
    238       },
    239       "benchmark_contamination_addressed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "Benchmark contamination is not addressed. For GPT-4o, the internal benchmark is unlikely to be contaminated (internal data), but for the fine-tuned Llama models, both training and benchmark data come from Meta's code review corpus. No deduplication or temporal split is described to ensure separation."
    243       }
    244     },
    245     "human_studies": {
    246       "pre_registered": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No pre-registration is mentioned for the safety trials or production experiments, despite them being randomized controlled trials on human participants (engineers)."
    250       },
    251       "irb_or_ethics_approval": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No IRB or ethics board approval is mentioned despite conducting randomized controlled experiments on tens of thousands of engineers. The experiments modified the tools engineers use and measured their behavior."
    255       },
    256       "demographics_reported": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No demographics of participating engineers are reported. The paper mentions 'tens of thousands of engineers' and that they are 'both collocated and working at multiple locations across the world' (Section 8.1) but provides no detailed demographics (experience level, team distribution, etc.)."
    260       },
    261       "inclusion_exclusion_criteria": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "No inclusion/exclusion criteria for participants are stated. The paper mentions the experiments cover 'two of Meta's largest code codebases and diffs in four major programming languages' but does not describe which engineers were included/excluded or why."
    265       },
    266       "randomization_described": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "The safety trials are described as 'randomized controlled safety experimental trials' (Section 7) but the randomization procedure is not described. No details on how engineers were assigned to control vs. treatment conditions, stratification variables, or randomization tools."
    270       },
    271       "blinding_described": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Blinding is not described. In the safety trials, it appears engineers in the test condition could see AI patches while control condition engineers could not, suggesting participants were not blinded to their condition. This is not discussed."
    275       },
    276       "attrition_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No participant attrition or dropout information is reported. The number of engineers/diffs in each experimental condition is not provided, nor is information about any participants who dropped out."
    280       }
    281     },
    282     "cost_and_practicality": {
    283       "inference_cost_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No inference cost or latency is reported for the MetaMateCR system, despite it being a production system processing thousands of code review comments per week. The paper mentions 'end-to-end inference latency requirements' (Section 3.2) as a factor in choosing the 70B model but does not quantify it."
    287       },
    288       "compute_budget_stated": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No computational budget is stated. Missing: GPU hours for fine-tuning, training time, hardware specifications for SFT on 64K datapoints with a 70B model, or API costs for GPT-4o baseline evaluation."
    292       }
    293     }
    294   },
    295   "claims": [
    296     {
    297       "claim": "LargeLSFT achieves 67.96% exact match on the offline benchmark, outperforming GPT-4o by approximately 9 percentage points.",
    298       "evidence": "Table 3 shows GPT-4o at 59.22% EM vs. LargeLSFT at 67.96% EM, a difference of 8.74 pp. Section 5.3 details these results.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "Showing AI patches to reviewers causes a statistically significant regression in review time (5.5% increase in TimeInReview, p=.029).",
    303       "evidence": "Table 4, Expt. 1 results: TimeInReview 5.5% increase (p=.029), TimeSpent 6.7% increase (p<<.001). This was a randomized controlled trial with control (no AI) vs. treatment (GPT-4o patches).",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "Collapsing AI patches for reviewers (showing only to authors) eliminates the review time regression.",
    308       "evidence": "Table 4, Expt. 2 results: TimeInReview 0.42% (p=.86), TimeSpent 1.24% (p=.66), WallClock 0.26% (p=.67). None statistically significant.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "In production, LargeLSFT achieves an ActionableToApplied rate of 19.75%, a 9.22pp improvement over GPT-4o.",
    313       "evidence": "Table 5 shows GPT-4o at 10.53% (Expt. 2) vs. LargeLSFT at 19.75% (Expt. 4), with p<<.001. However, Expts. 3 and 4 are not randomized controlled trials — they are full production rollouts compared against different time periods.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "Scaling the SFT dataset from 2.9K to 64K datapoints improves model performance, showing the importance of dataset scale.",
    318       "evidence": "Table 3 shows SmallLSFT (2.9K) at 63.11% EM vs. LargeLSFT (64K) at 67.96% EM. Table 5 shows SmallLSFT at 16.13% vs. LargeLSFT at 19.75% ActionableToApplied (p<<.001).",
    319       "supported": "strong"
    320     },
    321     {
    322       "claim": "LargeLSFT's ShownToApplied rate improvement over SmallLSFT is not statistically significant.",
    323       "evidence": "Table 5 shows SmallLSFT at 27.55% vs. LargeLSFT at 28.74% ShownToApplied with p=.054. The paper honestly reports this non-significant result.",
    324       "supported": "strong"
    325     }
    326   ],
    327   "methodology_tags": [
    328     "rct",
    329     "benchmark-eval",
    330     "case-study"
    331   ],
    332   "key_findings": "Meta's MetaMateCR system generates AI-assisted code fixes for reviewer comments at scale. The fine-tuned Llama 3.1 70B model (LargeLSFT) achieves 68% exact match on an internal benchmark, outperforming GPT-4o by 9 percentage points, with a 19.75% ActionableToApplied rate in production. Crucially, randomized controlled safety trials revealed that showing AI patches to reviewers increased review time by 5.5%, leading to a UX change where patches are only shown to authors. This demonstrates the importance of safety trials when deploying AI tools in developer workflows.",
    333   "red_flags": [
    334     {
    335       "flag": "Company evaluating own product",
    336       "detail": "All authors are Meta employees evaluating Meta's internal MetaMateCR system. While affiliations are disclosed, there is no formal conflicts of interest statement. Meta has a direct financial interest in demonstrating the effectiveness of their AI coding tools."
    337     },
    338     {
    339       "flag": "Non-reproducible evaluation",
    340       "detail": "The benchmark (206 examples), training data (64K examples), and all code are proprietary to Meta. Independent researchers cannot reproduce any results. The paper acknowledges this ('we cannot release the benchmark and data set') but it fundamentally limits verification."
    341     },
    342     {
    343       "flag": "Production experiments lack randomization",
    344       "detail": "Experiments 3 and 4 (SmallLSFT and LargeLSFT rollouts) are not randomized trials. They are sequential production rollouts compared across different time periods, introducing potential temporal confounds (seasonal effects, codebase changes, engineer turnover). The statistical comparisons between experiments use Fisher tests on data from different time periods."
    345     },
    346     {
    347       "flag": "No confidence intervals or variance measures",
    348       "detail": "Despite the paper's emphasis on statistical rigor (RCTs, p-values), no confidence intervals, error bars, or variance measures are reported for any metric. The offline benchmark is only 206 examples, making point estimates potentially unstable."
    349     },
    350     {
    351       "flag": "Missing sample sizes for safety trials",
    352       "detail": "The number of participants and diffs in each experimental condition for the safety trials is not reported. Without knowing the sample sizes, the practical significance of the reported effect sizes cannot be properly assessed."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "Resolving code review comments with machine learning",
    358       "authors": ["A. Froemmgen", "J. Austin", "P. Choy", "N. Ghelani", "L. Kharatyan", "G. Surita", "E. Khrapko", "P. Lamblin", "P.-A. Manzagol", "M. Revaj", "M. Tabachnyk", "D. Tarlow", "K. Villela", "D. Zheng", "S. Chandra", "P. Maniatis"],
    359       "year": 2024,
    360       "doi": "10.1145/3639477.3639746",
    361       "relevance": "Google's closely related system for resolving code review comments with ML, deployed at scale — direct comparison target for MetaMateCR."
    362     },
    363     {
    364       "title": "Automated code review in practice",
    365       "authors": ["U. Cihan", "V. Haratian", "A. Icoz", "M. K. Gul", "O. Devran", "E. F. Bayendur", "B. M. Ucar", "E. Tuzun"],
    366       "year": 2024,
    367       "arxiv_id": "2412.18531",
    368       "relevance": "Proposes LLM-based code review and evaluates it in an enterprise setting, directly related to AI-augmented code review."
    369     },
    370     {
    371       "title": "The llama 3 herd of models",
    372       "authors": ["A. Grattafiori et al."],
    373       "year": 2024,
    374       "arxiv_id": "2407.21783",
    375       "relevance": "Foundation model (Llama 3.1 70B) used as the base for fine-tuning in MetaMateCR."
    376     },
    377     {
    378       "title": "SWE-bench: Can language models resolve real-world github issues?",
    379       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"],
    380       "year": 2024,
    381       "arxiv_id": "2310.06770",
    382       "relevance": "Major benchmark for evaluating LLM code generation capability, used as a reference benchmark in this paper."
    383     },
    384     {
    385       "title": "AI-assisted code authoring at scale: Fine-tuning, deploying, and mixed methods evaluation",
    386       "authors": ["V. Murali", "C. Maddila", "I. Ahmad", "M. Bolin", "D. Cheng", "N. Ghorbani", "R. Fernandez", "N. Nagappan", "P. C. Rigby"],
    387       "year": 2024,
    388       "doi": "10.1145/3643774",
    389       "relevance": "Prior Meta work on AI-assisted code authoring at scale with fine-tuned LLMs, directly precedes MetaMateCR."
    390     },
    391     {
    392       "title": "Multi-line ai-assisted code authoring",
    393       "authors": ["O. Dunay", "D. Cheng", "A. Tait", "P. Thakkar", "P. C. Rigby", "A. Chiu", "I. Ahmad", "A. Ganesan", "C. Maddila", "V. Murali", "A. Tayyebi", "N. Nagappan"],
    394       "year": 2024,
    395       "doi": "10.1145/3663529.3663836",
    396       "relevance": "Prior Meta work on multi-line AI code completion, relevant to understanding the ecosystem of AI-assisted coding tools."
    397     },
    398     {
    399       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    400       "authors": ["S. Peng", "E. Kalliamvakou", "P. Cihon", "M. Demirer"],
    401       "year": 2023,
    402       "arxiv_id": "2302.06590",
    403       "relevance": "Foundational study on AI developer productivity showing 56% improvement in task completion time, cited for productivity context."
    404     },
    405     {
    406       "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers",
    407       "authors": ["Z. K. Cui", "M. Demirer", "S. Jaffe", "L. Musolff", "S. Peng", "T. Salz"],
    408       "year": 2024,
    409       "relevance": "Large-scale field experiments at Microsoft showing Copilot increased diffs by 25%, directly relevant to AI coding productivity measurement."
    410     },
    411     {
    412       "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial",
    413       "authors": ["E. Paradis", "K. Grey", "Q. Madison", "D. Nam", "A. Macvean", "V. Meimand", "N. Zhang", "B. Ferrari-Church", "S. Chandra"],
    414       "year": 2024,
    415       "arxiv_id": "2410.12944",
    416       "relevance": "Google RCT with 96 engineers measuring AI impact on development speed, directly comparable methodology (RCT in enterprise setting)."
    417     },
    418     {
    419       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    420       "authors": ["N. Jain", "K. Han", "A. Gu", "W.-D. Li", "F. Yan", "T. Zhang", "S. Wang", "A. Solar-Lezama", "K. Sen", "I. Stoica"],
    421       "year": 2024,
    422       "arxiv_id": "2403.07974",
    423       "relevance": "Contamination-free code evaluation benchmark, relevant to benchmark methodology and contamination concerns."
    424     },
    425     {
    426       "title": "AI-assisted SQL authoring at industry scale",
    427       "authors": ["C. Maddila", "N. Ghorbani", "K. Jabre", "V. Murali", "E. Kim", "P. Thakkar", "N. P. Laptev", "O. Harman", "D. Hsu", "R. Abreu", "P. C. Rigby"],
    428       "year": 2024,
    429       "arxiv_id": "2407.13280",
    430       "relevance": "Related Meta work on AI-assisted SQL authoring using similar fine-tuning methodology at industry scale."
    431     },
    432     {
    433       "title": "Evaluating large language models trained on code",
    434       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    435       "year": 2021,
    436       "arxiv_id": "2107.03374",
    437       "relevance": "HumanEval benchmark paper, foundational for LLM code evaluation methodology."
    438     }
    439   ]
    440 }

Impressum · Datenschutz