scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25302B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Generative AI for Pull Request Descriptions: Adoption, Impact, and Developer Interventions",
      6     "authors": [
      7       "Tao Xiao",
      8       "Hideaki Hata",
      9       "Christoph Treude",
     10       "Kenichi Matsumoto"
     11     ],
     12     "year": 2024,
     13     "venue": "Proc. ACM Softw. Eng.",
     14     "arxiv_id": "2402.08967",
     15     "doi": "10.1145/3643773"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All three abstract claims (growing adoption, reduced review time/higher merge likelihood, developer supplementation) are directly supported by RQ1–RQ3 results with specific numbers: 19.3 hours reduction, 1.57x merge odds ratio, 13 supplementary info categories.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper explicitly frames RQ2 as 'causal inference' and uses Entropy Balancing / Propensity Score Weighting with 17 confounding variables to construct a pseudo-balanced comparison; this is a recognized quasi-experimental method appropriate for observational causal claims.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper explicitly states in External Validity (§6.2): 'our results are not universally applicable to the broader open-source developer community, but are more pertinent to these early adopters,' bounding all conclusions to the early-adopter cohort.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Internal Validity only offers a generic disclaimer ('there may be other confounding variables not accounted for') without enumerating specific alternative explanations such as selection effects where teams using Copilot may already be more efficient or organized.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Review time and merge likelihood are used as proxies for 'review quality' and developer efficiency, but the recommendations section advocates Copilot adoption to 'amplify clarity' without discussing whether shorter review time could indicate less thorough review rather than improved quality.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6.2 is a dedicated 'Threats to validity' section covering construct, internal, and external validity with multiple specific sub-points.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats are named: bot detection methodology by Golzadeh et al. for construct validity, manual coding subjectivity with kappa measurement for internal validity, and the early-adopter limited-access cohort for external validity.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly restricts scope to early adopters of a limited-release feature (March–August 2023), 146 repositories, and notes developers less eager to adopt new technologies are not represented.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgments explicitly disclose: JSPS Grant-in-Aid JP23KJ1589, JSPS KAKENHI JP20H05706, and JST PRESTO JPMJPR22P6.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four author affiliations are disclosed on the title page: Nara Institute of Science and Technology, Shinshu University, Singapore Management University — all academic, none affiliated with GitHub.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "JSPS and JST are Japanese government research funding agencies with no financial stake in GitHub Copilot outcomes.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement appears anywhere in the paper; absence of such a declaration means this criterion is not met.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 2.1 defines Copilot for PRs and its four marker tags; 'review time' is operationally defined in Table 2 as 'time interval between PR creation time and closed time in hours'; 'early adopter' context is explained.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit RQs are stated in §1: adoption extent, impact on review time and merge likelihood, and developer adaptation patterns — the contribution is clear as an observational study of a new tool's real-world effects.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2.2 positions the work relative to LLMs for SE (Hou et al. 2023 systematic review) and PR summarization (Liu et al. 2019, Fang et al. 2022), explicitly contrasting this study's focus on real-world applicability versus prior lab-style LLM evaluation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The Data Availability section provides a live GitHub URL (https://github.com/NAIST-SE/CopilotForPRsEarlyAdoption) and a Zenodo DOI (10.5281/zenodo.10656106) explicitly including scripts.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The replication package includes lists of studied PRs, PR features for RQ2, and coding results for RQ3; the underlying PR data is from public GitHub repositories.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned anywhere in the paper; the statistical environment (R packages such as WeightIt, used for entropy balancing) is referenced by citation but not enumerated.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper provides a replication package URL but does not include step-by-step reproduction instructions within the paper itself; it is unclear from the paper alone how to re-run the analyses.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "The odds ratio for RQ2.2 is reported with a 95% CI (1.35 to 1.84); standard errors are reported for all coefficients in Table 4.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": true,
    155           "justification": "p-values are reported for all regression coefficients in Table 4; the main treatment effect has p=1.64e-17 (review time) and p<0.001 (merge likelihood).",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are explicitly reported: ATT of -19.3 hours for review time and OR=1.57 for merge likelihood, with baseline context provided.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or sample size justification is discussed; the sample size is determined entirely by what was available on GitHub during the data collection window.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Standard errors are reported for all regression coefficients in Table 4 alongside point estimates.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "54,188 non-Copilot PRs from the same 146 repositories during the same time period serve as the control/baseline group.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Control PRs were specifically selected from the same repositories during the same time window (post-Copilot introduction to August 2023) to control for temporal confounding.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "This is an observational study of a commercial tool — no ablation is applicable.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple outcome metrics are used: review time (continuous), merge likelihood (binary), marker tag distribution (RQ1), and 13 supplementary info categories + 7 editorial action categories (RQ3).",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Three raters independently coded 1,437 developer revisions to Copilot-generated content, achieving kappa=0.64 for the supplementary-information coding and kappa=0.62 for the editorial-action coding, constituting human evaluation of system outputs.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is an observational study, not a prediction task requiring a held-out test set.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Tables 5 and 6 provide per-category frequency breakdowns for supplementary information types (13 categories) and editorial actions (7 main categories with sub-types).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 5.3 and Table 6 discuss cases where developers deleted, refined, or excluded Copilot content with concrete examples (e.g., 'Nope, you didn't get it this time'), illustrating model failures.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that 96 of 146 repositories use Copilot for PRs in fewer than 50% of PRs, and that developers frequently remove or replace generated content (deletion 22.9%, exclusion 17.4%), indicating the tool often falls short.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The paper states Copilot for PRs uses 'the GPT-4 model by OpenAI' but provides no snapshot date, API version, or model hash — the GPT-4 version used is unspecified.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "Copilot for PRs is a closed commercial tool; internal prompts are proprietary and not available to the researchers or readers.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "This study evaluates a black-box commercial tool; no configurable hyperparameters (temperature, top-p) are accessible or reported.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "The study evaluates a black-box commercial service (Copilot for PRs); no custom agentic scaffolding was built by the researchers.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3 provides a detailed multi-step preprocessing pipeline: GitHub GraphQL search with time-window splitting, false positive exclusion, obsolete PR removal, bot filtering via Golzadeh et al. methods, and git-diff-based revision analysis.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The replication package at Zenodo (DOI: 10.5281/zenodo.10656106) and GitHub provides lists of studied PRs and extracted features; underlying PR data is from public GitHub.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3 describes the full collection procedure in detail: GitHub GraphQL queries, the 1000-result limit workaround via time-window halving, the copilot4prs bot identification, and each filtering step with counts.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "This study mines public GitHub data programmatically; there is no participant recruitment.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Section 3 traces the full pipeline from collection (18,858 PRs) through obsolete exclusion (18,322) through bot filtering (18,256) to the revision analysis (311 PRs with edits), with counts at each step.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This study examines adoption and impact of a deployed tool on PR metadata; it is not evaluating model capabilities on a benchmark where training cutoff would be relevant.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not applicable; the study is observational analysis of PR process metrics, not a model benchmark evaluation.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "No benchmark evaluation is conducted in this study.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "The study mines public GitHub data and conducts qualitative coding of text artifacts; no human subjects participation requires pre-registration.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "The study analyzes publicly available GitHub data without direct human subject involvement requiring IRB review.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participant demographics are applicable; the study analyzes public GitHub repositories and PR metadata.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No participant recruitment; selection criteria apply to repositories and PRs, not human subjects.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No experimental randomization of human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human subjects experiment requiring blinding.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No longitudinal human participants to drop out.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "The study observes a commercial tool in production use; researchers did not run model inference and there is no cost to report.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "The analysis involves GitHub API queries and statistical modeling (entropy balancing), which requires no notable compute budget worth disclosing.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Copilot for PRs reduces PR review time by an average of 19.3 hours (ATT, p=1.64e-17)",
    374       "evidence": "Quasi-experiment with entropy balancing on 17,177 treatment and 50,695 control merged/closed PRs, controlling for 17 covariates; Table 4 reports coefficient -19.3 ± 2.27 hours",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "PRs with Copilot descriptions are 1.57x more likely to be merged (95% CI: 1.35–1.84, p<0.001)",
    379       "evidence": "Marginal log odds ratio estimated via avg_comparisons() with entropy-balanced weights on the same 68K PR dataset; raw merge rates are 84% vs 71%",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Copilot for PRs adoption among early-adopter repositories grew steadily from March to August 2023",
    384       "evidence": "Figure 3 shows cumulative time-series of PRs using Copilot for PRs increasing monotonically from ~0 to 18,256 while repository count stabilized at ~140",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "copilot:summary is the most popular marker tag with 13,231 instances; most popular combination is summary+walkthrough (5,598 PRs)",
    389       "evidence": "Table 3 reports full frequency distribution of all four marker tags and 19 combinations extracted via regex from 18,256 PRs",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Deletion (22.9%) is the most common developer editorial action on Copilot-generated PR content",
    394       "evidence": "Qualitative coding of 1,437 revisions from 311 PRs by three raters (kappa=0.62); Table 6 shows frequency distribution of 7 main editorial categories",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Developers most commonly supplement Copilot content with static template information (22.8%) and associated links (22.7%)",
    399       "evidence": "Table 5 reports 13-category coding schema with frequencies from qualitative analysis of 311 PRs; three raters achieved kappa=0.64",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "observational",
    405     "qualitative"
    406   ],
    407   "key_findings": "Analyzing 18,256 Copilot-assisted PRs and 54,188 control PRs from 146 GitHub repositories during Copilot for PRs' early-access period, this study finds that Copilot-assisted PRs required 19.3 fewer review hours and were 1.57x more likely to be merged after controlling for 17 confounding variables via entropy balancing. Adoption grew steadily among the limited early-adopter cohort, with copilot:summary being the dominant marker. Qualitative analysis of 1,437 developer revisions reveals that developers frequently complement AI content with templates and links, and most commonly partially delete generated content, indicating that AI-generated PR descriptions serve as a starting scaffold rather than a final product.",
    408   "red_flags": [
    409     {
    410       "flag": "Self-selection bias in early adopters",
    411       "detail": "Repositories authorized for the beta feature may be fundamentally different (better organized, more experienced teams) from average projects, creating unobserved confounding that entropy balancing over observable covariates cannot eliminate."
    412     },
    413     {
    414       "flag": "Review time as quality proxy",
    415       "detail": "Shorter review time is used as an indicator of benefit, but could equally indicate reduced thoroughness — the paper does not measure actual review quality, code defects introduced, or reviewer satisfaction."
    416     },
    417     {
    418       "flag": "GPT-4 version unspecified",
    419       "detail": "The paper notes Copilot for PRs uses 'the GPT-4 model by OpenAI' with no snapshot date, making reproducibility and comparison with future studies problematic."
    420     },
    421     {
    422       "flag": "Kappa reflects multi-label difficulty, not agreement ceiling",
    423       "detail": "Kappa of 0.62–0.64 is reported as 'substantial', but 24–26 distinct combinations of multi-label codes were encountered; the authors attribute the less-than-perfect agreement to multi-label complexity rather than to coding-schema ambiguity, without validating this explanation."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Automatic generation of pull request descriptions",
    429       "relevance": "Direct predecessor work on automated PR summarization (PRSummarizer) that this study's real-world evaluation contextualizes"
    430     },
    431     {
    432       "title": "GitHub Discussions: An Exploratory Study of Early Adoption",
    433       "relevance": "Methodological template for studying early adoption of GitHub features during beta access, explicitly cited as the parallel study design"
    434     },
    435     {
    436       "title": "Work practices and challenges in pull-based development: The contributor's perspective",
    437       "relevance": "Foundational work on pull-based development that motivates the study of PR-related automation"
    438     },
    439     {
    440       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    441       "relevance": "Broad context for LLM applications in SE, reviewed 229 studies and cited to frame the study's positioning"
    442     },
    443     {
    444       "title": "On the accuracy of bot detection techniques",
    445       "relevance": "Provides the bot detection methodology used to filter bot-submitted PRs from the dataset"
    446     },
    447     {
    448       "title": "An empirical study of the impact of modern code review practices on software quality",
    449       "relevance": "Provides the code review confounding variables (PR size, files, commits) used in the propensity score model"
    450     },
    451     {
    452       "title": "Entropy balancing for causal effects: A multivariate reweighting method",
    453       "relevance": "Methodological basis for the causal inference approach used in RQ2"
    454     },
    455     {
    456       "title": "PRHAN: automated pull request description generation based on hybrid attention network",
    457       "relevance": "Prior technical work on automated PR description generation that this real-world deployment study extends"
    458     }
    459   ],
    460   "engagement_factors": {
    461     "practical_relevance": {
    462       "score": 3,
    463       "justification": "Directly actionable for developers and teams deciding whether to adopt GitHub Copilot for PRs, with quantified time savings and merge rate improvements."
    464     },
    465     "surprise_contrarian": {
    466       "score": 1,
    467       "justification": "Results confirm intuitive expectations that AI-assisted descriptions help; no counterintuitive findings emerge."
    468     },
    469     "fear_safety": {
    470       "score": 0,
    471       "justification": "No AI risk or safety concerns are raised; the paper is neutral on AI displacement of developer work."
    472     },
    473     "drama_conflict": {
    474       "score": 1,
    475       "justification": "Mild tension in showing developers frequently delete or override AI-generated content, suggesting the tool is imperfect, but no strong controversy angle."
    476     },
    477     "demo_ability": {
    478       "score": 3,
    479       "justification": "Copilot for PRs is a live GitHub product that any authorized user can try immediately."
    480     },
    481     "brand_recognition": {
    482       "score": 2,
    483       "justification": "GitHub Copilot is a widely recognized product; the study of its PR feature benefits from strong brand recognition among developers."
    484     }
    485   },
    486   "hn_data": {
    487     "threads": [],
    488     "top_points": 0,
    489     "total_points": 0,
    490     "total_comments": 0
    491   }
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs