scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26326B)
      1 {
      2   "paper": {
      3     "title": "Automated Code Review In Practice",
      4     "authors": [
      5       "Umut Cihan",
      6       "Vahid Haratian",
      7       "Arda İçöz",
      8       "Mert Kaan Gül",
      9       "Ömercan Devran",
     10       "Emircan Furkan Bayendur",
     11       "Baykal Mehmet Uçar",
     12       "Eray Tüzün"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv",
     16     "arxiv_id": "2412.18531",
     17     "doi": "10.5281/zenodo.13917481"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper provides a replication package via Zenodo (https://doi.org/10.5281/zenodo.13917481), referenced in Section III-E as containing 'data analysis scripts, survey questions, and results.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The replication package at Zenodo (doi.org/10.5281/zenodo.13917481) includes survey results and data analysis scripts. The paper states in Section III-E: 'Our data analysis scripts, survey questions, and results are shared in our replication package.'"
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications (requirements.txt, library versions, Python version) are mentioned in the paper beyond a brief reference to using the pandas library."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is mentioned but no README or reproduction guide is described."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper reports point estimates (e.g., '73.8% resolved', average closure durations) without confidence intervals or error bars. A standard deviation is mentioned once (1.79, for the review ratings), but no CIs are given for the main results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper uses independent samples t-tests for closure duration comparisons (p-value < 0.001) and Poisson regression for human comment volume analysis (Section IV-C), reporting p-values for each project."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Effect sizes are reported with baseline context: closure durations changed from 5h52m to 8h20m overall, with per-project breakdowns (e.g., Project #1 from 2h48m to 4h38m). Human comments went from 0.31 to 0.28 per PR. These raw differences with baselines provide sufficient context."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or sample size justification is provided. The general opinion survey has only 22 respondents and the code review survey only 10 respondents, with no discussion of whether these are adequate sample sizes."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Standard deviation is reported once for review ratings (1.79) but variance/spread is not reported for the main quantitative results (closure durations, comment counts). Figures show averages without spread measures."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study uses a before/after design comparing metrics (closure duration, human comment volume) before and after CodeReviewBot deployment, serving as a natural baseline comparison."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baseline is the same projects' own pre-intervention data, which is the appropriate contemporary baseline for this type of industry case study."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an industry case study evaluating a single tool (CodeReviewBot). There is no multi-component system to ablate."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The study uses multiple metrics: comment resolution labels (73.8% resolved), PR closure duration, number of human reviewer comments, developer survey ratings, and commits after review."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The study includes human evaluation through two surveys: a per-PR code review survey (10 respondents rating automated comments) and a general opinion survey of 22 practitioners evaluating the tool's impact on code quality, speed, and knowledge sharing."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "This is an industry case study, not a benchmark evaluation. There is no train/test split applicable."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per project (Projects #1, #2, #3) for all major metrics: comment labels (Figure 9), closure durations (Figure 11), and human comments (Figure 12), revealing significant variation across projects."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section V discusses failure cases including unnecessary review comments (26.2% not acted upon), out-of-scope suggestions, irrelevant comments, and concerns about over-reliance. Developer quotes illustrate specific failures."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports several negative results: PR closure duration increased overall (5h52m to 8h20m), human review volume did not significantly decrease, and 26.2% of automated comments were labeled 'Won't Fix' or 'Closed'. Developers reported disadvantages including out-of-scope suggestions."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims are supported: 73.8% resolved comments (Section IV-C), increased closure duration from 5h52m to 8h20m (Section IV-C), and minor perceived code quality improvement (Section IV-B). The claims are hedged appropriately."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims (e.g., 'automated code reviews led to longer pull request closure times') but the before/after design without controlling for confounds (seasonality, team changes, project maturity) is weak for causal inference. The paper acknowledges seasonality as a threat but does not use causal identification strategies."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper appropriately bounds its findings to the specific company and tool. Section VI-C explicitly states: 'Since we used this specific tool and model, we acknowledge that other LLMs and automated code review tools might exhibit different behavior.' Section VI-D notes: 'This study might have led to different results in a different company setting.'"
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper discusses several alternative explanations in the threats to validity: seasonality effects (summer vacations), the mandatory comment resolution policy potentially causing unreliable labels, and bot accounts contaminating data. The discussion section considers that closure time increases could be due to developers engaging more with feedback rather than tool inefficiency."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'GPT-4-32K Model' (Section III-A) but does not provide a specific version identifier or snapshot date (e.g., 'gpt-4-32k-0613'). The model behavior could differ across versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The tool is based on open-source Qodo PR Agent, but the paper does not provide the actual prompts or system instructions used by CodeReviewBot. No prompt text is shown in the paper or described as being in the replication package."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens) for the GPT-4-32K API calls are reported. The paper only mentions '3,937 tokens per pull request' as average usage, not the API configuration."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper mentions CodeReviewBot is 'based on the open-source Qodo PR-Agent' but does not describe the scaffolding, workflow, or how the tool processes PRs beyond a brief flow diagram (Figure 3). The customizations Beko made are not detailed."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section III-C-4 describes data preprocessing in detail: filtering late-arriving comments using elbow evaluation (93% threshold), removing bot accounts (SonarQube), excluding deleted comments and non-main-branch PRs, and collaborative data cleaning sessions. The pipeline from raw Azure DevOps data to final dataset is documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VI 'Threats to Validity' provides a dedicated, substantive discussion of limitations organized into construct, internal, external, and conclusion validity subsections."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The threats are specific to this study: seasonality of data collection (summer vacations at Beko), one author being a manager introducing potential bias, mandatory comment resolution policy potentially causing unreliable labels, specific tool/model limitations, and two bot accounts identified in the data."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section VI-C states the results apply only to this specific tool (Qodo PR Agent) and model (GPT-4-32K). Section VI-D explicitly states 'This study might have led to different results in a different company setting' and 'we limited our scope and did not aim for statistical generalizations.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The replication package (Zenodo, doi.org/10.5281/zenodo.13917481) is referenced as containing data analysis scripts and results. However, the raw Azure DevOps data (individual PR data, comments) may be partially restricted due to industry confidentiality."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section III-C describes data collection from Azure DevOps API in detail, including what data was extracted (PRs, comments, commits, labels), the timeline (Figure 1), and the three data sources (repository data, PR surveys, general opinion surveys)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The paper describes that 238 practitioners across 10 projects had access to the tool, 22 developers who contributed to the 3 study projects were surveyed (Table III provides demographics), and PR survey recipients were authors of PRs that received automated reviews."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section III-C-4 documents the pipeline: Azure DevOps API extraction → relational database → CSV conversion → pandas processing → filtering (elbow evaluation for late comments, bot account removal, deleted comment exclusion). The pipeline stages and cleaning decisions are described with rationale."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section VIII 'Acknowledgements' states: 'This work has been supported by the ITEA4 GENIUS project, which has been funded by the national funding authorities of the participating countries.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly show four authors from Bilkent University and four from Beko (the company being studied). The paper header lists these affiliations with institutional markers."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The funder is ITEA4 GENIUS project (a European research consortium), which has no direct financial interest in whether automated code review tools succeed or fail. The funder appears independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided. Four authors are Beko employees, and the study evaluates a tool adopted by Beko, but no explicit declaration of potential conflicts or financial interests is made."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This study does not evaluate a pre-trained model's capability on a benchmark. It evaluates an automated code review tool in practice (industry deployment), so training cutoff is not applicable."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This is an industry case study examining tool impact, not a benchmark evaluation. Train/test overlap is not applicable."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is performed. The study measures real-world usage metrics and developer perceptions, so contamination is not applicable."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No mention of pre-registration (OSF, AsPredicted, or similar) is found in the paper. The study involves human participants through surveys."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of IRB or ethics board approval is found in the paper, despite involving human participants through surveys and collecting data about developer behavior."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Table III reports participant demographics for the general opinion survey: experience levels (0-2, 2-5, 5-10, 10+ years) and positions (individual contributor vs. lead/manager), with counts for each category."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "The paper states surveys were sent to 'developers who contributed to the three projects within our research scope' (Section III-D-2). For quantitative data, three projects were selected based on 'their longer duration of tool usage' while others were excluded for insufficient data."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "This is an observational case study, not an experimental study with random assignment to conditions. Randomization is not applicable."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "This is an observational case study. Blinding is not feasible — developers knew they were using CodeReviewBot, and this is inherent to the study design."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "The paper notes 38 pull requests received automated reviews during the survey period, but only 10 people completed the full survey. For the general opinion survey, 23 responses were received; 22 respondents consented to publication of their results, and 2 non-useful responses were excluded."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section V states: 'the CodeReviewBot used an average of 3,937 tokens per pull request at a cost of 0.48$', providing per-PR cost information."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The total computational budget (total API spend across all PRs, Azure DevOps hosting, etc.) is not stated. Only the per-PR token cost is provided."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "73.8% of automated code review comments were labeled as 'Resolved' by developers, indicating they implemented the tool's suggestions.",
    296       "evidence": "Section IV-C-1 reports analysis of 1,408 CodeReviewBot comments on merged PRs: 73.8% Resolved, 21.3% Won't Fix. Figure 9 shows per-project breakdown.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The overall average pull request closure duration increased significantly from 5 hours 52 minutes to 8 hours 20 minutes after CodeReviewBot introduction.",
    301       "evidence": "Section IV-C-3 reports the increase with independent samples t-test (p < 0.001). Per-project results vary: Project #1 increased (2h48m to 4h38m), Project #2 decreased (6h06m to 3h07m), Project #3 increased (20h22m to 30h51m).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The number of human review comments per pull request did not significantly change after introducing automated code review.",
    306       "evidence": "Section IV-C-4: overall human comments decreased from 0.31 to 0.28 per PR, but Poisson regression showed this was not statistically significant (p >= 0.05). Per-project trends differed (Project #1 increased, Project #2 decreased, Project #3 unchanged).",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Most practitioners perceived a minor improvement in code quality as a result of automated code reviews.",
    311       "evidence": "Section IV-B: 14 of 22 survey respondents indicated minor improvement in code quality. 68.8% perceived minor improvement per Section V discussion.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "LLM-based automated code reviews are highly useful in this company's context.",
    316       "evidence": "RQ1 answer box cites 73.8% resolved comments, 88 commits after bot reviews, and survey results showing perceived code quality improvement.",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "case-study",
    322     "observational",
    323     "qualitative"
    324   ],
    325   "key_findings": "An industry case study at Beko found that 73.8% of automated code review comments (from a Qodo PR Agent-based tool using GPT-4-32K) were labeled as resolved by developers. However, overall PR closure duration significantly increased from 5h52m to 8h20m, with substantial variation across projects. Human review volume did not significantly change. Most of the 22 surveyed practitioners perceived minor code quality improvements but raised concerns about irrelevant suggestions and potential over-reliance on automation.",
    326   "red_flags": [
    327     {
    328       "flag": "Insider evaluation",
    329       "detail": "Four of eight authors are Beko employees, including one in a managerial position. The study evaluates a tool adopted by Beko. While the paper notes results were written by non-practitioner authors, the potential for bias is significant and no formal competing interests statement is provided."
    330     },
    331     {
    332       "flag": "Very small survey sample",
    333       "detail": "The code review survey received only 10 responses and the general opinion survey only 22 responses. The paper acknowledges dissatisfied developers may have been less motivated to complete surveys, creating potential response bias, but does not quantify this threat."
    334     },
    335     {
    336       "flag": "No causal identification strategy",
    337       "detail": "The before/after comparison lacks controls for temporal confounds. The paper acknowledges seasonality (summer data collection) but does not use difference-in-differences, matching, or other quasi-experimental methods to isolate the tool's causal effect from other changes over time."
    338     },
    339     {
    340       "flag": "Contradictory project-level results",
    341       "detail": "Results vary substantially across the three projects (e.g., closure duration increased in two projects but decreased in one; human comments increased in one, decreased in another, unchanged in a third). The paper aggregates to overall statistics that may obscure the heterogeneity, and the 'highly useful' conclusion for RQ1 is based primarily on comment resolution labels, which may not reflect genuine usefulness."
    342     },
    343     {
    344       "flag": "Comment resolution label reliability",
    345       "detail": "The 73.8% 'Resolved' rate is a key finding, but the paper acknowledges developers may not have followed the labeling policy correctly ('developers might also disregard the expected labeling policy and use Resolved instead of Closed or Won't Fix'). This undermines the primary usefulness metric."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "AI-Assisted Assessment of Coding Practices in Modern Code Review",
    351       "authors": ["M. Vijayvergiya", "M. Salawa", "I. Budiselic"],
    352       "year": 2024,
    353       "relevance": "Industry study from Google on AutoCommenter, an LLM-backed automated code review system at scale — directly relevant to evaluating LLM code review tools."
    354     },
    355     {
    356       "title": "Exploring the Capabilities of LLMs for Code Change Related Tasks",
    357       "authors": ["L. Fan", "J. Liu", "Z. Liu", "D. Lo", "X. Xia", "S. Li"],
    358       "year": 2024,
    359       "arxiv_id": "2407.02824",
    360       "relevance": "Evaluates LLM capabilities on code review generation, commit message generation, and just-in-time comment update — core LLM code generation evaluation."
    361     },
    362     {
    363       "title": "Security Code Review by Large Language Models",
    364       "authors": ["J. Yu", "P. Liang", "Y. Fu", "A. Tahir", "M. Shahin", "C. Wang", "Y. Cai"],
    365       "year": 2024,
    366       "arxiv_id": "2401.16310",
    367       "relevance": "Explores LLM capabilities for security-focused code review, relevant to AI safety and code quality assessment."
    368     },
    369     {
    370       "title": "Code Review Automation: Strengths and Weaknesses of the State of the Art",
    371       "authors": ["R. Tufano", "O. Dabic", "A. Mastropaolo", "M. Ciniselli", "G. Bavota"],
    372       "year": 2024,
    373       "doi": "10.1109/TSE.2023.3348172",
    374       "relevance": "Systematic evaluation of code review automation approaches including ChatGPT, directly relevant to understanding automated code review effectiveness."
    375     },
    376     {
    377       "title": "Fine-Tuning and Prompt Engineering for Large Language Models-Based Code Review Automation",
    378       "authors": ["C. Pornprasit", "C. Tantithamthavorn"],
    379       "year": 2024,
    380       "relevance": "Investigates prompt engineering and fine-tuning for LLM-based code review, relevant to LLM methodology in SE tasks."
    381     },
    382     {
    383       "title": "AI-Powered Code Review with LLMs: Early Results",
    384       "authors": ["Z. Rasheed", "M. A. Sami", "M. Waseem", "K.-K. Kemell", "X. Wang", "A. Nguyen", "K. Systa", "P. Abrahamsson"],
    385       "year": 2024,
    386       "relevance": "Early results on AI-powered code review with LLMs, directly within the scope of automated code review evaluation."
    387     },
    388     {
    389       "title": "On the Use of ChatGPT for Code Review: Do Developers Like Reviews by ChatGPT?",
    390       "authors": ["M. Watanabe", "Y. Kashiwa", "B. Lin", "T. Hirao", "K. Yamaguchi", "H. Iida"],
    391       "year": 2024,
    392       "doi": "10.1145/3661167.3661183",
    393       "relevance": "Investigates developer reception of ChatGPT-generated code reviews in GitHub projects, directly comparable to this study's developer perception findings."
    394     },
    395     {
    396       "title": "Tales from the Trenches: Expectations and Challenges from Practice for Code Review in the Generative AI Era",
    397       "authors": ["N. Davila", "J. Melegati", "I. Wiese"],
    398       "year": 2024,
    399       "relevance": "Grey literature review on generative AI for code reviews in practice, complementary survey of the same topic area."
    400     },
    401     {
    402       "title": "CodeAgent: Collaborative Agents for Software Engineering",
    403       "authors": ["D. Tang", "K. Kim", "Y. Song", "C. Lothritz", "B. Li", "S. Ezzini", "H. Tian", "J. Klein", "T. F. Bissyande"],
    404       "year": 2024,
    405       "relevance": "LLM agent approach to software engineering tasks including code review, relevant to agentic AI workflows."
    406     },
    407     {
    408       "title": "Towards Automating Code Review Activities",
    409       "authors": ["R. Tufano", "L. Pascarella", "M. Tufano", "D. Poshyvanyk", "G. Bavota"],
    410       "year": 2021,
    411       "doi": "10.1109/ICSE43902.2021.00027",
    412       "relevance": "Foundational work on automating code review activities, cited as motivation for the time-saving potential of automated code review."
    413     },
    414     {
    415       "title": "GPT-4 Technical Report",
    416       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    417       "year": 2023,
    418       "arxiv_id": "2303.08774",
    419       "relevance": "Technical report for GPT-4, the underlying model used in the CodeReviewBot tool evaluated in this study."
    420     }
    421   ]
    422 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs