scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21638B)
      1 {
      2   "paper": {
      3     "title": "The Impact of Large Language Models (LLMs) on Code Review Process",
      4     "authors": ["Antonio Collante", "Samuel Abedu", "SayedHassan Khatoonabadi", "Ahmad Abdellatif", "Ebube Alor", "Emad Shihab"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.11034"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "The paper provides a public repository: https://github.com/acollant/GPT-Assistance-PR with scripts and dataset (Section 1, contributions)."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The dataset is shared at the same GitHub repository. The paper states 'we publicly share our scripts and dataset online.'"
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are provided in the paper. The repository is referenced but no specific instructions for replicating results are described."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
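     38         "justification": "No confidence intervals or error bars are reported for the headline results; only point estimates (medians, percentages) are provided. The sole uncertainty measure is the standard errors of the regression coefficients in Table 2."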
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Mann-Whitney U test is used to compare GPT-assisted vs non-assisted PRs (Sections 4.1, 4.2), and the multiple linear regression model in Table 2 reports p-values for its coefficients (e.g., 4.88e-11 for is_gpt-assisted)."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Effect sizes are reported as percentage reductions with baseline context: 61% reduction in merge time (9 vs 23 hours), 66.7% review time reduction, 87.5% waiting time reduction. Regression coefficients with standard errors are also provided in Table 2."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The 25,473 PR dataset size is described but not justified via power analysis. The 450-PR sample for heuristic validation uses 95% confidence / 5% margin of error, but the main analysis sample size is not justified."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No variance, standard deviation, or spread measures are reported for the main results. Only medians are given without IQR or other dispersion measures."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "GPT-non-assisted PRs serve as the baseline comparison group, matched using Manhattan distance on structural features (Section 3.3.1)."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The comparison is between contemporaneous GPT-assisted and non-assisted PRs from the same time period and platform (GitHub, collected May 2024)."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is an observational study comparing two groups, not a system with components to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple metrics are used: overall merge time, phase-specific times (review, waiting for change, change), and task type distributions across phases."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Manual annotation was performed: 450 PRs manually reviewed for heuristic development (3 annotators), 310 PRs manually labeled for task classification (RQ3, Section 4.3)."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is an observational mining study, not a predictive modeling task requiring train/test splits."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down by PR phase (submission, review, waiting, change, post-acceptance) in Table 3, and by task type (enhancement, implementation, bug fix, testing, documentation, other) in Table 4."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper discusses cases where reviewers rejected GPT-generated code (e.g., 'I will not blindly accept any AI-generated code' in Section 4.3) and phases where GPT had no impact (submission, post-acceptance)."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper reports no significant difference in the At Change phase between GPT-assisted and non-assisted PRs (Table 3, both 1 hour), and notes GPT had no detectable impact during submission and post-acceptance phases."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Abstract claims of 61% merge time reduction, 66.7% review time reduction, and 87.5% waiting time reduction are supported by results in Tables 2-3 and Figure 3. Task distribution claims (60% optimization, 26% bug fixing, 12% documentation) are supported by Table 4."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper uses causal language ('GPT-assisted PRs reduced median resolution time', 'can boost the effectiveness', 'leading to considerable time savings') but the study design is observational with matched comparisons, not a randomized experiment. The regression controls for some confounders but cannot establish causation. The authors acknowledge this in Section 6.2 but the main text uses causal framing."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title claims 'The Impact of Large Language Models on Code Review Process' broadly, but the study only examines GitHub PRs where GPT usage was explicitly mentioned. The abstract and conclusion make broad claims about 'GPT to accelerate code reviews' without bounding to the specific population of PRs that openly reference GPT."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Section 6 discusses multiple alternative explanations: task simplicity, developer experience, team workflow dynamics, and code change complexity as potential confounders. Section 6.1 specifically notes that 'other factors, such as task simplicity or team dynamics, could also influence outcomes.'"
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper measures PR merge time and phase durations but frames results as 'efficiency', 'effectiveness', and 'code quality' improvement without distinguishing between the proxy (time metrics) and the claimed outcomes. Faster reviews do not necessarily mean better reviews, but this gap is not acknowledged."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "The paper studies GPT usage by developers in the wild, not a specific model version controlled by the researchers. The paper refers generically to 'GPT' and 'ChatGPT' without controlling which version developers used."
    140       },
    141       "prompts_provided": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "The paper does not use prompting as part of its methodology — it mines existing PRs where developers used GPT externally."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters for the statistical models (e.g., R's lm function configuration) or the n-gram extraction are reported beyond the 5-gram maximum."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used in this study."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Data preprocessing is documented: filtering criteria (≥10 stars, ≥2 contributors, ≥1 event), keyword search methodology, n-gram extraction approach, heuristic refinement rounds, and the Manhattan distance matching procedure (Sections 3.1-3.3)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 6 'Threats to Validity' provides a dedicated multi-subsection discussion covering construct, internal, and external validity."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The threats are specific to this study: false negatives from keyword-based detection, GitHub-only platform bias, the ≥10 stars selection criterion introducing bias, and the inability to capture undisclosed GPT use (Section 6)."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "While threats to validity are discussed, the paper does not explicitly state what the results do NOT show. The conclusion makes broad claims without bounding them to the specific population studied (PRs that explicitly mention GPT on GitHub)."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The dataset is publicly shared at https://github.com/acollant/GPT-Assistance-PR, allowing independent verification."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3.1 describes data collection: GitHub REST API, keyword 'GPT', collected May 9, 2024, yielding 81,234 PRs filtered to 25,473. Fields collected are enumerated."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants were recruited. The study mines public GitHub repositories."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Figure 1 and Section 3 document the full pipeline: search (81,234 PRs) → filter (25,473) → heuristic labeling (1,600 GPT-assisted) → similarity matching → phase mapping. Counts at each stage are provided."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Acknowledgements section states: 'This work was supported by the NSERC CREATE grant number 555406, 2021.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Concordia University and University of Calgary. No authors are affiliated with OpenAI, the vendor of GPT."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "NSERC is a Canadian government research funding agency with no financial interest in GPT or code review outcomes."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement is included in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This is a mining study analyzing developer behavior, not evaluating a pre-trained model's capability on a benchmark."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Not applicable — no model benchmark evaluation is performed."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not applicable — no model benchmark evaluation is performed."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "This is a repository mining study with no human participants."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants — the study mines public GitHub data."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants — inclusion/exclusion criteria apply to repositories and PRs, not people."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants or experimental conditions."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants or experimental conditions."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is an observational mining study, not a system with inference costs."
    282       },
    283       "compute_budget_stated": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "This is an observational mining study with standard statistical analysis; compute budget is not a meaningful concern."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "GPT-assisted PRs reduced median resolution time by more than 61% (from 23 hours to 9 hours).",
    293       "evidence": "Multiple linear regression model (Table 2) shows is_gpt-assisted coefficient of -0.4921 with p=4.88e-11. Figure 3 shows median merge times of 9 vs 23 hours.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "GPT-assisted PRs reduce review time by 66.7% (from 3 hours to 1 hour median).",
    298       "evidence": "Table 3 shows median At Review time of 1 hour for GPT-assisted vs 3 hours for non-assisted. Mann-Whitney U test confirms statistical significance (p < 0.05).",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "GPT-assisted PRs reduce waiting time before acceptance by 87.5% (from 24 hours to 3 hours median).",
    303       "evidence": "Table 3 shows median At Waiting for Change of 3 hours for GPT-assisted vs 24 hours for non-assisted. Mann-Whitney U test confirms statistical significance.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Developers predominantly use GPT for code optimization (60%), bug fixing (26%), and documentation updates (12%).",
    308       "evidence": "Table 4 reports task frequencies from manual analysis of 310 GPT-assisted PRs. Enhancement is 60.26% at Review phase, bug fix is 25.64% at Review.",
    309       "supported": "moderate"
    310     }
    311   ],
    312   "methodology_tags": ["observational"],
    313   "key_findings": "This observational study of 25,473 GitHub PRs finds that GPT-assisted PRs have 61% shorter median merge times (9 vs 23 hours), with the largest reductions in review time (66.7%) and waiting-before-change time (87.5%). Manual analysis of 310 GPT-assisted PRs shows developers primarily use GPT for code enhancement (60%), bug fixing (26%), and documentation (12%), with minimal use for implementation from scratch or testing. However, these are correlational findings from PRs that explicitly mention GPT, not causal estimates.",
    314   "red_flags": [
    315     {
    316       "flag": "Severe selection bias in treatment group",
    317       "detail": "GPT-assisted PRs are identified by keyword mentions of 'GPT' in PR text. This captures only PRs where developers openly disclose GPT use, which is a tiny, non-representative subset. Developers who mention GPT may be more experienced, more transparent, or working on simpler tasks. The 1,600 GPT-assisted PRs out of 25,473 (6.3%) likely represent a biased sample."
    318     },
    319     {
    320       "flag": "Causal language with observational design",
    321       "detail": "The paper uses causal language ('reduced', 'boost effectiveness', 'leading to time savings') throughout, but the observational design with matched comparisons cannot establish causation. The Manhattan distance matching on 4 features is insufficient to rule out confounders like task complexity, developer skill, or project maturity."
    322     },
    323     {
    324       "flag": "No variance or uncertainty quantification on main results",
    325       "detail": "All main findings are reported as median point estimates (9 hours, 1 hour, 3 hours) without confidence intervals, IQR, or any spread measure. The reader cannot assess result stability."
    326     },
    327     {
    328       "flag": "Matching methodology is weak",
    329       "detail": "The Manhattan distance matching uses only 4 features (no_commits, PR_size, no_changed_files, project_age) and selects only the single closest match per GPT-assisted PR. Important confounders like programming language, task type, developer experience, project domain, and team size are not controlled for."
    330     },
    331     {
    332       "flag": "R² not reported for regression model",
    333       "detail": "The paper mentions calculating R² but does not report it, making it impossible to assess how much variance the model explains. The methodology section says they evaluate R² but the results section omits it."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Unveiling chatgpt's usage in open source projects: A mining-based study",
    339       "authors": ["R. Tufano", "A. Mastropaolo", "F. Pepe", "O. Dabić", "M. Di Penta", "G. Bavota"],
    340       "year": 2024,
    341       "relevance": "Directly relevant study on ChatGPT usage patterns in open source, including commit message generation and PR descriptions."
    342     },
    343     {
    344       "title": "Generative ai for pull request descriptions: Adoption, impact, and developer interventions",
    345       "authors": ["T. Xiao", "H. Hata", "C. Treude", "K. Matsumoto"],
    346       "year": 2024,
    347       "relevance": "Studies Copilot impact on PR merging likelihood, finding 1.57x higher merge rate for Copilot-assisted PRs."
    348     },
    349     {
    350       "title": "AI-powered code review with llms: Early results",
    351       "authors": ["Z. Rasheed", "M. A. Sami", "M. Waseem"],
    352       "year": 2024,
    353       "arxiv_id": "2404.18496",
    354       "relevance": "Examines LLM-powered code review agents and their dual benefit of enhancing code quality and developer learning."
    355     },
    356     {
    357       "title": "An empirical study on developers' shared conversations with chatgpt in github pull requests and issues",
    358       "authors": ["H. Hao", "K. A. Hasan", "H. Qin"],
    359       "year": 2024,
    360       "relevance": "Studies developer-ChatGPT interactions in GitHub PRs and issues, finding common request types including code generation and review."
    361     },
    362     {
    363       "title": "Automated code review in practice",
    364       "authors": ["U. Cihan", "V. Haratian", "A. İçöz"],
    365       "year": 2025,
    366       "relevance": "Empirical study of LLM-based automated code review across 4,335 PRs in industry, finding 74% application rate but longer closure times."
    367     },
    368     {
    369       "title": "Automated code review using large language models at ericsson: An experience report",
    370       "authors": ["S. Ramesh", "J. Bose", "H. Singh"],
    371       "year": 2025,
    372       "arxiv_id": "2507.19115",
    373       "relevance": "Industry case study of LLM-augmented code review combining static analysis with prompt engineering at Ericsson."
    374     },
    375     {
    376       "title": "Tales from the trenches: Expectations and challenges from practice for code review in the generative ai era",
    377       "authors": ["N. Davila", "J. Melegati", "I. Wiese"],
    378       "year": 2024,
    379       "relevance": "Identifies five challenges to adopting generative AI in code review: trustworthiness, context, misleading comments, security, token limits."
    380     },
    381     {
    382       "title": "Analyzing developer use of chatgpt generated code in open source github projects",
    383       "authors": ["B. Grewal", "W. Lu", "S. Nadi", "C.-P. Bezemer"],
    384       "year": 2024,
    385       "relevance": "Studies ChatGPT-generated code adoption in open source projects."
    386     },
    387     {
    388       "title": "AI-assisted assessment of coding practices in modern code review",
    389       "authors": ["M. Vijayvergiya", "M. Salawa"],
    390       "year": 2024,
    391       "relevance": "Describes AutoCommenter, an LLM-powered code review tool enforcing coding best practices at scale."
    392     },
    393     {
    394       "title": "How do software developers use chatgpt? an exploratory study on github pull requests",
    395       "authors": ["M. Chouchen", "N. Bessghaier", "M. Begoug"],
    396       "year": 2024,
    397       "relevance": "Exploratory study of ChatGPT usage patterns in GitHub PRs."
    398     }
    399   ]
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs