ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27380B)


      1 {
      2   "paper": {
      3     "title": "AI-Assisted Assessment of Coding Practices in Modern Code Review",
      4     "authors": [
      5       "Manushree Vijayvergiya",
      6       "Małgorzata Salawa",
      7       "Ivan Budiselić",
      8       "Dan Zheng",
      9       "Pascal Lamblin",
     10       "Marko Ivanković",
     11       "Juanjo Carin",
     12       "Mateusz Lewko",
     13       "Jovan Andonov",
     14       "Goran Petrović",
     15       "Daniel Tarlow",
     16       "Petros Maniatis",
     17       "René Just"
     18     ],
     19     "year": 2024,
     20     "venue": "AIware '24 (1st ACM International Conference on AI-Powered Software)",
     21     "arxiv_id": "2405.13565",
     22     "doi": "10.1145/3664646.3665664"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No source code or repository URL is provided. The system is internal to Google and no code release is mentioned."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No dataset is released. The paper states 'due to industrial confidentiality reasons we are unable to disclose absolute numbers' (Section 4). Training data comes from Google's internal code review system and is not shared."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions T5X framework, TPU pods, and TensorFlow but does not provide a requirements file, dependency versions, or detailed environment specifications. No reproducible environment setup is given."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No reproduction instructions are provided. The system is proprietary to Google and cannot be replicated externally."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported for any of the main results (useful ratio, comment resolution rate, URL coverage). All results are point estimates."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper states 'We did not detect any statistically significant change' in the A/B experiment (Section 4.4) but does not report the specific test used, test statistics, p-values, or sample sizes for any comparison."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No formal effect sizes are reported. The paper provides percentages (e.g., 40% resolution rate, 68% URL coverage) but these are descriptive statistics, not effect sizes with baseline context for comparative claims."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No sample size justification or power analysis is provided. The rater study uses 15 raters and ~370 comments; the comment resolution analysis uses 6000 snapshot pairs; the manual inspection uses 40 pairs. None of these sample sizes are justified."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported for any results. All metrics are single point estimates."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares AutoCommenter against human reviewer comments (Section 5.2) and traditional linters (Section 5.3). The A/B experiment (Section 4.4) includes a control group."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The baselines are appropriate: human reviewers represent the current standard, and traditional linters are the existing automated approach. The A/B experiment's control group (no AutoCommenter) is the most relevant baseline."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No ablation study is conducted. The system has multiple components (model, thresholds, URL filtering, summarization, beam search) but no systematic analysis of individual component contributions."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Multiple metrics are used: useful ratio (positive/negative feedback), comment resolution rate (~40%), URL coverage vs. human comments (68%), the share of the top-50 predicted URLs that are beyond the scope of traditional linters (66%), and precision/recall on validation/test datasets."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "An independent human rating study was conducted (Section 4.3) with 15 raters evaluating ~370 posted comments. Developer feedback via thumbs up/down buttons also constitutes ongoing human evaluation."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3.3.1 describes temporal splitting: 'We temporally split the dataset to ensure that the model has not been trained on future code-review snapshots of the code comments in the validation and test datasets.'"
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 6 provides a breakdown of the top-50 most frequently predicted URLs by best-practice type (code idioms, documentation, formatting, language, naming) and whether a linter exists. Figure 5 shows per-URL cumulative distributions."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4.3 provides detailed failure analysis from the rater study, including specific categories: 'Several topics or complex topic,' 'Importance of high-quality summaries,' 'Subjective and potentially contentious topic,' 'Systematic model error for some guidelines,' and 'Correct but low-value comments.'"
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The A/B experiment reports null results: 'We did not detect any statistically significant change in any of the following: total duration of code reviews, time developers actively spent on the code review, the number of comment-response iterations' (Section 4.4). Also reports per-language threshold failure (Section 4.1.1)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims the system is 'feasible and has a positive impact on the developer workflow.' The paper supports feasibility through deployment details and positive impact through the useful ratio (>80%) and comment resolution (~40%). The claims are appropriately hedged."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper's main causal claim (no adverse effects on code review process) is supported by a randomized A/B experiment (Section 4.4) with random assignment verified for balance. The 'slight improvement in coding speed' is reported as a conjecture, not a definitive causal claim."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper bounds its claims to Google's setting: 'in an industrial setting at Google' and 'in a large industrial setting at Google' (Sections 1, 2). It does not claim the results generalize to other companies or open-source settings."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper does not systematically discuss alternative explanations for its results. For example, the 40% comment resolution rate could be inflated by unrelated code changes. Although the authors note that 'unrelated code changes could have led to a specific comment no longer being predicted' (Section 5.1), they do not treat this as an alternative explanation for the overall claim."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper says 'a traditional transformer approach based on T5, using T5X' (Section 3.1) but does not specify the model size (number of parameters) or exact version/checkpoint used."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.1 provides the actual input/target format including the task prompt ('Check language best practices') and a complete worked example for Go. The prompt is a fixed-text code comment, and the full format is shown."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Very few hyperparameters are reported. Beam search n=4 is mentioned (Section 4.1.2), and initial confidence threshold t=0.98 is stated (Section 4.1.1). However, training hyperparameters (learning rate, batch size, number of training steps, etc.) are not reported. Context window is mentioned as 2048 tokens but labeled as a limitation."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "AutoCommenter is not an agentic system. It is a single-pass inference model that takes source code as input and outputs predicted violations. No agentic scaffolding (tool use, retry logic, multi-turn interaction) is involved."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Sections 3.2.1 and 3.2.2 describe the data pipeline: large-scale preprocessing identifies relevant code comments containing best-practice URLs, then dataset curation converts them into TensorFlow Examples. The temporal split strategy is described (Section 3.3.1). Training corpus size (~3 billion examples total, ~800k best practice examples) is stated."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are mentioned in passing (e.g., incomplete ground truth in Section 3.3.1, confidentiality constraints in Section 4) but there is no structured discussion."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No specific threats to validity are discussed. The paper mentions that precision/recall measures are noisy due to incomplete ground truth (Section 3.3.1) but does not discuss this as a formal threat to validity."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what its results do NOT show. It does not discuss limitations regarding generalizability beyond Google, the specific languages tested, or the types of best practices the system cannot handle."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw data is available. The paper explicitly states confidentiality prevents disclosure of absolute numbers (Section 4)."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.2.1 describes how training data is collected: extracting human-authored code review comments containing best-practice URLs from Google's code review system, along with corresponding source code and metadata including creation timestamps."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "For the rater study (Section 4.3), recruitment is described: '15 raters—developers from partner teams' were recruited. For the A/B experiment (Section 4.4), random assignment via SHA256 hash of developer email addresses is described, with verification that groups were balanced."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The data pipeline is documented across Sections 3.2.1-3.2.3 (preprocessing, curation, training) and Section 5.1 (comment resolution analysis with 6000 snapshot pairs, 50% absent from submitted snapshot, 80% of manual sample confirmed resolution)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding statement is provided. The acknowledgements section thanks individuals but does not disclose funding sources. Given that this is a Google product evaluated by Google employees, funding disclosure is important."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All author affiliations are clearly listed. 12 of 13 authors are Google employees, and one (René Just) is at University of Washington with a note 'Work done at Google.' The affiliation with the evaluated product's company is transparent."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Google funded this work (all but one of the authors are Google employees) and Google is the entity whose product (AutoCommenter) is being evaluated. Google has a direct financial interest in demonstrating that its code review automation is effective."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is provided. The authors are employees of the company whose product is being evaluated, but this conflict is not explicitly acknowledged as such."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not state a specific training data cutoff date. It mentions training data 'stretches before 2022' (Section 4.2) but does not give a precise cutoff. The model was described as 'state of the art' in 2022."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 3.3.1 describes temporal splitting to avoid data leakage: 'We temporally split the dataset to ensure that the model has not been trained on future code-review snapshots of the code comments in the validation and test datasets.'"
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "The temporal split strategy (Section 3.3.1) directly addresses contamination. The evaluation on full historical code reviews (Section 3.3.2) and A/B experiment (Section 4.4) use live/contemporaneous data that could not have been in training, further mitigating contamination concerns."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No pre-registration is mentioned for either the rater study or the A/B experiment."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No IRB or ethics board approval is mentioned, despite the rater study involving human participants (15 raters) and the A/B experiment involving tens of thousands of developers."
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "For the rater study, participants are described only as 'developers from partner teams' (Section 4.3). For the A/B experiment, the paper confirms groups were balanced on 'tenure, seniority, programming languages and business units' (Section 4.4) but does not report the actual demographic distribution."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No inclusion/exclusion criteria are stated for the rater study participants. For the A/B experiment, all developers were included but selection criteria for the control vs. experiment group are described only as hash-based randomization."
    268       },
    269       "randomization_described": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "The A/B experiment randomization is described: 'We randomly assigned developers to an experiment group (AutoCommenter enabled) and a control group (AutoCommenter disabled). We randomized based on the last few digits of the SHA256 hash of the developers email address' (Section 4.4)."
    273       },
    274       "blinding_described": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "For the rater study: 'We did not show the original user feedback to the rater, to avoid biasing their evaluation' (Section 4.3). The raters were blinded to the developer feedback on the comments they were rating."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No attrition information is reported for either the rater study or the A/B experiment. It is not stated whether all 15 raters completed all assigned evaluations or whether any developers dropped out of the A/B experiment."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Latency is partially reported (median 2 seconds for beam search, sub-second for greedy in IDE, Section 4.1.2), but no compute cost, API cost, or cost per prediction is reported."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total compute budget is stated. The paper mentions using TPU pods for training (Section 3.2.3) and storing checkpoints every 1000 steps, but does not quantify training time, number of TPUs, or total compute cost."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "An end-to-end system for learning and enforcing coding best practices is feasible and has a positive impact on the developer workflow.",
    301       "evidence": "The system was deployed to all developers at Google. Developer feedback useful ratio exceeded 80% after refinements (Section 4.3). Comment resolution rate is approximately 40% (Section 5.1).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "AutoCommenter's comments cover 68% of best practice documents that human reviewers reference.",
    306       "evidence": "Section 5.2 and Figure 5 show the cumulative distribution of comments per URL for automated vs. human comments. AutoCommenter has created comments for 330 distinct URLs covering 68% of historical human comments.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "For 66% of the top-50 most frequently predicted best practices, violation detection is beyond the scope of traditional static analysis.",
    311       "evidence": "Section 5.3 and Figure 6: three expert authors independently categorized the 50 most frequently predicted URLs. For 33/50, a linter does not exist and cannot easily be built.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "The comment resolution rate is approximately 40%.",
    316       "evidence": "Section 5.1: Automated analysis of 6000 snapshot pairs found comments absent in 50% of submitted snapshots. Manual inspection of 40 such pairs found 80% were directly resolved. 50% * 80% = 40%.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "AutoCommenter did not adversely affect the code review process (total duration, active time, comment-response iterations).",
    321       "evidence": "Section 4.4: A/B experiment with random assignment of about half of all developers. 'We did not detect any statistically significant change in any of the following: total duration of code reviews, time developers actively spent on the code review, the number of comment-response iterations.'",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "A slight improvement in coding speed was detected when using AutoCommenter.",
    326       "evidence": "Section 4.4: 'We did, however, detect a slight improvement in coding speed.' No magnitude, test statistic, or p-value is reported. The authors 'conjecture that the reduction in context switches to documentation leads to this positive effect.'",
    327       "supported": "weak"
    328     }
    329   ],
    330   "methodology_tags": [
    331     "rct",
    332     "case-study",
    333     "benchmark-eval"
    334   ],
    335   "key_findings": "AutoCommenter, a T5-based LLM system for detecting coding best-practice violations in code review, was deployed to tens of thousands of developers at Google. After iterative refinements including per-URL thresholds, beam search decoding, and suppression of non-actionable URLs, the system achieved over 80% useful ratio on developer feedback. An A/B experiment found no adverse effects on code review duration or iteration count, and an estimated 40% of automated comments were resolved by code authors. The system covers 68% of best practices referenced by human reviewers, with 66% of its top-50 predicted violations being beyond the scope of traditional linters.",
    336   "red_flags": [
    337     {
    338       "flag": "Company evaluating own product",
    339       "detail": "12 of 13 authors are Google employees evaluating Google's AutoCommenter system. One external author (René Just, UW) also worked at Google during this research. No conflict of interest statement is provided despite this clear conflict."
    340     },
    341     {
    342       "flag": "Confidentiality prevents verification",
    343       "detail": "The paper states 'due to industrial confidentiality reasons we are unable to disclose absolute numbers of code reviews, developers, files, comments, or distribution of duration of code reviews' (Section 4). This makes all quantitative claims unverifiable by external parties."
    344     },
    345     {
    346       "flag": "No statistical details for A/B experiment",
    347       "detail": "The A/B experiment (Section 4.4) claims no statistically significant adverse effects and a 'slight improvement in coding speed' but reports no test statistics, p-values, sample sizes, effect sizes, or confidence intervals. The reader cannot assess the power of the null results or the strength of the positive finding."
    348     },
    349     {
    350       "flag": "Small manual verification sample",
    351       "detail": "The 40% comment resolution rate is derived from a chain of estimates: 50% absence rate from 6000 automated pairs, then 80% true resolution from manual inspection of only 40 pairs. The 40-pair manual sample is small and the multiplication of estimates compounds uncertainty."
    352     },
    353     {
    354       "flag": "No limitations section",
    355       "detail": "The paper has no dedicated limitations or threats-to-validity section, which is unusual for a paper making deployment claims at this scale."
    356     }
    357   ],
    358   "cited_papers": [
    359     {
    360       "title": "Expectations, outcomes, and challenges of modern code review",
    361       "authors": ["Alberto Bacchelli", "Christian Bird"],
    362       "year": 2013,
    363       "doi": "10.1109/ICSE.2013.6606617",
    364       "relevance": "Foundational study on modern code review practices and challenges, relevant to understanding the context for AI-assisted code review."
    365     },
    366     {
    367       "title": "Automating code review activities by large-scale pre-training",
    368       "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo", "Nan Duan", "Shailesh Jannu", "Grant Jenks", "Deep Majumder", "Jared Green", "Alexey Svyatkovskiy", "Shengyu Fu", "Neel Sundaresan"],
    369       "year": 2022,
    370       "doi": "10.1145/3540250.3549081",
    371       "relevance": "Pre-trained model approach to code review automation using LLMs, directly comparable methodology to AutoCommenter."
    372     },
    373     {
    374       "title": "Code Review Automation: Strengths and Weaknesses of the State of the Art",
    375       "authors": ["Rosalia Tufano", "Ozren Dabić", "Antonio Mastropaolo", "Matteo Ciniselli", "Gabriele Bavota"],
    376       "year": 2024,
    377       "relevance": "Survey of code review automation approaches evaluating strengths and weaknesses, relevant to understanding the state of AI-assisted code review."
    378     },
    379     {
    380       "title": "Resolving Code Review Comments with Machine Learning",
    381       "authors": ["Alexander Frömmgen", "Jacob Austin", "Peter Choy"],
    382       "year": 2024,
    383       "relevance": "Companion work evaluating a live ML system for the inverse task (creating code from review comments), providing relevant methodology for evaluating deployed ML-for-code systems."
    384     },
    385     {
    386       "title": "Modern Code Review: A Case Study at Google",
    387       "authors": ["Caitlin Sadowski", "Emma Söderberg", "Luke Church", "Michal Sipko", "Alberto Bacchelli"],
    388       "year": 2018,
    389       "relevance": "Prior case study of Google's code review process, establishing the baseline environment into which AutoCommenter was deployed."
    390     },
    391     {
    392       "title": "CommentFinder: a simpler, faster, more accurate code review comments recommendation",
    393       "authors": ["Yang Hong", "Chakkrit Tantithamthavorn", "Patanamon Thongtanunam", "Aldeida Aleti"],
    394       "year": 2022,
    395       "relevance": "Alternative approach to automated code review comment recommendation, relevant as a comparison point for AI-based code review tools."
    396     },
    397     {
    398       "title": "Auger: Automatically generating review comments with pre-training models",
    399       "authors": ["Lingwei Li", "Li Yang", "Huaxi Jiang"],
    400       "year": 2022,
    401       "relevance": "Pre-training approach for automated code review comment generation, directly related to AI-assisted code review methodology."
    402     },
    403     {
    404       "title": "AutoTransform: Automated code transformation to support modern code review process",
    405       "authors": ["Patanamon Thongtanunam", "Chanathip Pornprasit", "Chakkrit Tantithamthavorn"],
    406       "year": 2022,
    407       "relevance": "Automated code transformation for code review, relevant methodology for evaluating AI tools in the code review workflow."
    408     },
    409     {
    410       "title": "Using pre-trained models to boost code review automation",
    411       "authors": ["Rosalia Tufano", "Simone Masiero", "Antonio Mastropaolo", "Luca Pascarella", "Denys Poshyvanyk", "Gabriele Bavota"],
    412       "year": 2022,
    413       "relevance": "Evaluation of pre-trained models for code review tasks, relevant methodology and baselines for AI-assisted code review research."
    414     },
    415     {
    416       "title": "Scaling up models and data with t5x and seqio",
    417       "authors": ["Adam Roberts", "Hyung Won Chung"],
    418       "year": 2023,
    419       "relevance": "The T5X framework used as the foundation for AutoCommenter's model, relevant to understanding LLM infrastructure for code tasks."
    420     },
    421     {
    422       "title": "Please fix this mutant: How do developers resolve mutants surfaced during code review?",
    423       "authors": ["Goran Petrović", "Marko Ivanković", "Gordon Fraser", "René Just"],
    424       "year": 2023,
    425       "relevance": "Prior work on developer interaction with automated code review analysis at Google, providing methodology for measuring comment resolution."
    426     }
    427   ]
    428 }

Impressum · Datenschutz