scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (25908B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Experience with GitHub Copilot for Developer Productivity at Zoominfo",
      6     "authors": [
      7       "Gal Bakal",
      8       "Ali Dasdan",
      9       "Yaniv Katz",
     10       "Michael Kaufman",
     11       "Guy Levin"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2501.13282",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims (33% suggestion acceptance, 20% line acceptance, 72% satisfaction, 400+ developers) are all supported by the results in Sections 7-10.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper implies Copilot improves productivity (title: 'Developer Productivity') but the study design cannot establish causality. The paper acknowledges this: 'Once we establish a reliable causality between these metrics and the GitHub Copilot usage, we are planning to report the results in a subsequent paper.' However, claims like '90% respondents stated that GitHub Copilot reduces the amount of time' present self-reported causal attributions without causal design.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper is consistently scoped to ZoomInfo's experience. The title says 'at Zoominfo', conclusions reference 'enterprise-scale deployment', and it acknowledges that 'exact figures naturally differ due to such reasons as types of tasks, interview questions vs. production work, programming languages, students vs. developers.'",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are discussed for the observed acceptance rates or satisfaction scores. For example, novelty effect, self-selection bias in voluntary adoption, or Hawthorne effect are not considered.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures suggestion acceptance rates (33%) and self-reported time savings (20%), then frames these as evidence that 'GitHub Copilot usage significantly contributed to the productivity of our developers.' Acceptance rate is a proxy for engagement, not productivity. Self-reported time savings are subjective estimates. The paper does not discuss what 'productivity' actually entails or how these metrics map to it. The conclusion claims '100s of 1000s of lines of production code contributed' without discussing whether more lines equals more productivity.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 11 'Limitations: Observed and Potential' provides substantive discussion of both observed and anticipated limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations section discusses product limitations of Copilot (contextual understanding, security concerns, creativity) but does not discuss threats to the validity of the study itself — e.g., self-selection bias, survey response bias, Hawthorne effect, or limitations of acceptance rate as a productivity proxy.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "While the paper is implicitly scoped to ZoomInfo, it does not explicitly state what the results do NOT show or what populations/settings are excluded.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure or acknowledgments section. All authors are ZoomInfo employees but no statement about funding.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are clearly identified as ZoomInfo employees with ZoomInfo email addresses and affiliation.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "ZoomInfo has a financial interest in the outcome — they are evaluating a tool they purchased and deployed. Positive results justify the investment. This conflict is not acknowledged.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Acceptance rate is precisely defined in Section 6. Developer productivity is grounded in DORA/SPACE/DevEx frameworks in Section 3. DevSat is defined as a net-sentiment calculation.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states its contribution is a detailed case study of Copilot's enterprise deployment addressing five stated research questions, contributing to knowledge about AI-assisted development at scale.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 12 is an extensive related work review with 37 references, comparing acceptance rates to GitHub and Google reports, and contrasting with the ANZ Bank study (the closest prior comparable work).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code, analysis scripts, or repository links are provided.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No raw acceptance rate data, survey responses, or datasets are released. Only aggregated figures and tables are shown in the paper.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment or dependency specifications are provided. The paper describes using GitHub Copilot for Business but provides no technical setup details for reproducing the analysis.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No reproduction instructions are provided. The deployment process is described narratively but there are no steps to replicate the analysis.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates (e.g., 33% acceptance rate, 72% satisfaction) with no uncertainty quantification.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used. Claims like language-specific performance differences and weekend vs weekday patterns are made without any tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No formal effect sizes are reported. While raw percentages are given (33% suggestion acceptance, 20% line acceptance), there are no standardized effect sizes and no baseline comparisons to put them in context.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for sample sizes. The trial used 126 engineers with 72 survey respondents (57% response rate) and the satisfaction survey sample size is not stated. No power analysis or justification for adequacy.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Standard deviations are reported in the daily data table (Fig. 2): 'The standard deviations are close to the half of these numbers in each case.' Averages, standard deviations, and medians are shown for daily metrics.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baseline comparisons are included. There is no pre-Copilot measurement or control group comparison. The paper acknowledges difficulty measuring productivity impact and defers causal analysis to future work.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "No baselines are included at all, so there are no contemporary baselines either; the criterion is unmet rather than inapplicable.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "This is a deployment case study evaluating a third-party tool, not a system with removable components.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are reported: suggestion acceptance rate, line acceptance rate, per-language breakdowns, per-editor breakdowns, developer satisfaction scores, time savings, task completion, and quality perception.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Developer satisfaction surveys provide human evaluation of the tool's impact. Quarterly pulse surveys with Likert scales gauge satisfaction, productivity perceptions, and qualitative feedback.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a benchmark evaluation study. There is no test set.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by programming language (Fig. 5-7, 12 languages), by IDE (Fig. 8, JetBrains vs VS Code), and by time period (weekday vs weekend).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 11 (Limitations) discusses observed failures: struggles with domain-specific logic, security concerns, creativity limitations, and includes negative developer feedback quotes.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports limitations including low acceptance rates for HTML/CSS/JSON/SQL, need for code modification (3/5 initial participants), and includes negative developer feedback. It also cites related work showing 55-92% test failure rates.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper states 'GitHub Copilot uses a version of Codex' but does not specify which version of Copilot or the underlying model was used during the evaluation period. No API version or snapshot date.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "The paper evaluates GitHub Copilot as a black-box IDE tool. Developers use it through inline code completion, not explicit prompting.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": false,
    249           "answer": false,
    250           "justification": "GitHub Copilot is used as a black-box tool with no user-configurable hyperparameters relevant to report.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool. Authors cannot describe internal scaffolding they have no access to.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "No documentation of how acceptance rate data was collected, filtered, or processed. The paper mentions excluding languages with small numbers but does not specify the threshold or criteria. Survey data processing is also undocumented.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw data is available. Only aggregated tables and figures are presented.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described: acceptance rates from GitHub Copilot's telemetry dashboard for Nov 14 - Dec 9 2024, survey data from quarterly pulse surveys using Likert scales, and trial survey from the two-week trial phase with 72 respondents.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Section 5.2 describes the trial recruitment: 'structured recruitment process', 'stratified voluntary sampling' across specializations, experience levels, locations, and tech stacks. 126 engineers (32% of developers) participated.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The pipeline from raw telemetry/survey data to reported figures is not documented. How acceptance rates were aggregated across developers, how satisfaction scores were computed, and filtering decisions are not detailed.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This paper does not evaluate a model's capability on a benchmark. It measures acceptance rates of an IDE tool in production use.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "No benchmark evaluation. Contamination is not applicable to measuring acceptance rates of a deployed tool.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "No benchmark evaluation is performed.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration mentioned. The study involves human participants (400+ developers and survey respondents).",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics board approval is mentioned, despite collecting survey data and behavioral data from human participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Minimal demographics. The paper mentions geographic distribution (US, Europe, India, Israel) and total developer count (~400) but does not report experience levels, gender, roles, or other demographic characteristics of participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Section 5.2 describes prerequisites: completion of security training, written acknowledgment of compliance requirements, and commitment to provide feedback. The trial was stratified across specializations, experience levels, and locations.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "This is an observational deployment study, not an experimental study with randomized assignment to conditions.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "Not an experimental study. Blinding is not feasible when evaluating a visible IDE tool.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": true,
    352           "justification": "Trial survey response rate is reported: 72 respondents out of 126 participants (57% response rate). The adoption curve (Fig. 1) shows license uptake over time.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No cost information is reported. The cost of GitHub Copilot for Business licenses for 400+ developers is not disclosed, nor is any cost-benefit analysis provided.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": false,
    365           "answer": false,
    366           "justification": "This is a deployment study of a third-party SaaS tool, not a compute-intensive research experiment.",
    367           "source": "opus"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "GitHub Copilot achieves a 33% suggestion acceptance rate and 20% line acceptance rate across 400+ developers over a 26-day production window",
    375       "evidence": "Figure 2 shows day-by-day telemetry with means and standard deviations; Figure 4 shows aggregate rates with upward trend lines",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "72% developer satisfaction score — the highest among all measured tools in the development toolchain",
    380       "evidence": "Figure 9 shows DevSat survey results from quarterly pulse surveys beginning Q2 2024, with GitHub Copilot leading all other tools listed",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "GitHub Copilot significantly contributed to developer productivity",
    385       "evidence": "Inferred from acceptance rates and self-reported satisfaction; no control group or causal design; authors explicitly state causality with DORA metrics is unestablished",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "90% of survey respondents report approximately 20% time savings when using GitHub Copilot",
    390       "evidence": "Stated in Section 10 from the developer satisfaction survey; self-reported without objective verification or pre/post comparison",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Acceptance rates for top languages (TypeScript, Java, Python, JavaScript) are ~30%, while HTML/CSS/JSON/SQL have substantially lower rates",
    395       "evidence": "Figures 5 and 7 show per-language breakdown with specific rates; 14-32% range across the top 12 languages",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Over 75,000 lines of AI-generated code were accepted into production in the 26-day study window, with hundreds of thousands total since deployment",
    400       "evidence": "Figure 2 shows cumulative line counts for the study period; total production impact stated in abstract and Section 7",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "observational",
    406     "case-study",
    407     "qualitative"
    408   ],
    409   "key_findings": "GitHub Copilot achieved a 33% suggestion acceptance rate and 20% line acceptance rate across 400+ developers at ZoomInfo over a 26-day production window, consistent with industry benchmarks from GitHub and Google. Developer satisfaction reached 72% (highest among all measured tools), with 90% of survey respondents reporting ~20% perceived time savings. The paper follows a structured four-phase deployment methodology with explicit governance requirements. However, all productivity conclusions rest on proxy metrics and self-report surveys with no control group, no pre/post comparison, and no causal design — the authors explicitly defer causal claims about DORA metrics to a future paper.",
    410   "red_flags": [
    411     {
    412       "flag": "No control group",
    413       "detail": "No comparison between Copilot users and non-users, and no pre-Copilot baseline, making productivity claims unverifiable against a counterfactual."
    414     },
    415     {
    416       "flag": "Self-evaluation bias",
    417       "detail": "All authors are ZoomInfo employees evaluating a tool their organization deployed and paid for, with an organizational interest in justifying the decision."
    418     },
    419     {
    420       "flag": "Volunteer selection bias",
    421       "detail": "Trial participants self-selected via voluntary application; those most enthusiastic about AI tools are over-represented, inflating satisfaction and acceptance metrics."
    422     },
    423     {
    424       "flag": "Proxy-outcome conflation",
    425       "detail": "Acceptance rate and self-reported time savings are used to assert productivity improvement despite authors acknowledging causality with objective metrics is unestablished."
    426     },
    427     {
    428       "flag": "Unverified self-reported outcomes",
    429       "detail": "The 20% time savings figure comes from developer surveys with no objective task-timing verification, blind conditions, or comparison group."
    430     },
    431     {
    432       "flag": "Model version unspecified",
    433       "detail": "GitHub Copilot version during the study period is not identified, preventing comparison with other studies or replication."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Measuring GitHub Copilot's impact on productivity",
    439       "relevance": "GitHub's own measurement study establishing acceptance rate as a productivity proxy — the core metric adopted by this paper"
    440     },
    441     {
    442       "title": "The impact of AI tool on engineering at ANZ bank: an empirical study on GitHub Copilot within corporate environment",
    443       "relevance": "Closest prior comparable work: corporate deployment with ~1,000 engineers, controlled experiment, reports 40-50% task-time improvement"
    444     },
    445     {
    446       "title": "The SPACE of developer productivity: There's more to it than you think",
    447       "relevance": "Foundational multidimensional framework for developer productivity measurement referenced in Section 3"
    448     },
    449     {
    450       "title": "GitHub Copilot AI pair programmer: Asset or liability?",
    451       "relevance": "Early evaluation of Copilot correctness vs. human programmers on algorithmic tasks"
    452     },
    453     {
    454       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    455       "relevance": "Quantifies project-level productivity gains (6.5%) and integration time costs from Copilot use in open source"
    456     },
    457     {
    458       "title": "Evaluating large language models trained on code (Codex)",
    459       "relevance": "Describes the foundational model behind GitHub Copilot and its limitations including over-reliance and security implications"
    460     },
    461     {
    462       "title": "DevEx: What actually drives productivity: The developer-centric approach to measuring and improving productivity",
    463       "relevance": "Developer experience framework complementing DORA/SPACE, used in Section 3 productivity discussion"
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Provides a concrete enterprise rollout playbook and real acceptance-rate benchmarks practitioners can compare against their own Copilot deployment."
    470     },
    471     "surprise_contrarian": {
    472       "score": 0,
    473       "justification": "All findings (33% acceptance, 20% time savings, high satisfaction) confirm widely reported industry numbers with no unexpected results."
    474     },
    475     "fear_safety": {
    476       "score": 0,
    477       "justification": "Security and IP risks are listed as speculative future concerns, not demonstrated or analyzed."
    478     },
    479     "drama_conflict": {
    480       "score": 0,
    481       "justification": "The paper is a straightforwardly positive customer report on a vendor's product, with no controversy, critique, or challenge to any claims."
    482     },
    483     "demo_ability": {
    484       "score": 0,
    485       "justification": "No code, tools, or reproducible artifacts are released; results are internal telemetry and surveys."
    486     },
    487     "brand_recognition": {
    488       "score": 2,
    489       "justification": "GitHub Copilot is a widely known product, though ZoomInfo itself is not a major tech brand in the developer community."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs