ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24476B)


      1 {
      2   "scan_version": 3,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "Experience with GitHub Copilot for Developer Productivity at Zoominfo",
      6     "authors": [
      7       "Gal Bakal",
      8       "Ali Dasdan",
      9       "Yaniv Katz",
     10       "Michael Kaufman",
     11       "Guy Levin"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv preprint"
     15   },
     16   "methodology_tags": [
     17     "observational",
     18     "case-study"
     19   ],
     20   "key_findings": "ZoomInfo deployed GitHub Copilot to 400+ developers via a four-phase rollout. Average suggestion acceptance rate was 33% and line acceptance rate was 20%, consistent with industry reports. Developer satisfaction reached 72%, with 90% reporting time savings (median 20% reduction). Language-specific acceptance rates varied from 14% to 32%, with general-purpose languages (TypeScript, Java, Python) performing best.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No source code, analysis scripts, or repository links are provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No raw acceptance rate data, survey responses, or datasets are released. Only aggregated figures and tables are shown in the paper."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment or dependency specifications are provided. The paper describes using GitHub Copilot for Business but provides no technical setup details for reproducing the analysis."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No reproduction instructions are provided. The deployment process is described narratively but there are no steps to replicate the analysis."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates (e.g., 33% acceptance rate, 72% satisfaction) with no uncertainty quantification."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are used. Claims like language-specific performance differences and weekend vs weekday patterns are made without any tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No formal effect sizes are reported. While raw percentages are given (33% acceptance, 20% line acceptance), there are no standardized effect sizes or baseline comparisons with context."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for the sample sizes. The trial used 126 engineers with 72 survey respondents (57% response rate), and the satisfaction survey sample size is not stated. No power analysis or other argument for adequacy is provided."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Standard deviations are reported in the daily data table (Fig. 2): 'The standard deviations are close to the half of these numbers in each case.' Averages, standard deviations, and medians are shown for daily metrics."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No baseline comparisons are included. There is no pre-Copilot measurement or control group comparison. The paper acknowledges difficulty measuring productivity impact and defers causal analysis to future work."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No baselines are included at all, so contemporaneity is moot."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "This is a deployment case study evaluating a third-party tool, not a system with removable components."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are reported: suggestion acceptance rate, line acceptance rate, per-language breakdowns, per-editor breakdowns, developer satisfaction scores, time savings, task completion, and quality perception."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Developer satisfaction surveys provide human evaluation of the tool's impact. Quarterly pulse surveys with Likert scales gauge satisfaction, productivity perceptions, and qualitative feedback."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "This is not a benchmark evaluation study. There is no test set."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by programming language (Fig. 5-7, 12 languages), by IDE (Fig. 8, JetBrains vs VS Code), and by time period (weekday vs weekend)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 11 (Limitations) discusses observed failures: struggles with domain-specific logic, security concerns, creativity limitations, and includes negative developer feedback quotes."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports limitations including low acceptance rates for HTML/CSS/JSON/SQL, need for code modification (3/5 initial participants), and includes negative developer feedback. It also cites related work showing 55-92% test failure rates."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims (33% suggestion acceptance, 20% line acceptance, 72% satisfaction, 400+ developers) are all supported by the results in Sections 7-10."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper implies Copilot improves productivity (title: 'Developer Productivity') but the study design cannot establish causality. The paper acknowledges this: 'Once we establish a reliable causality between these metrics and the GitHub Copilot usage, we are planning to report the results in a subsequent paper.' However, claims like '90% respondents stated that GitHub Copilot reduces the amount of time' present self-reported causal attributions without causal design."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper is consistently scoped to ZoomInfo's experience. The title says 'at Zoominfo', conclusions reference 'enterprise-scale deployment', and it acknowledges that 'exact figures naturally differ due to such reasons as types of tasks, interview questions vs. production work, programming languages, students vs. developers.'"
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are discussed for the observed acceptance rates or satisfaction scores. For example, novelty effect, self-selection bias in voluntary adoption, or Hawthorne effect are not considered."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper measures suggestion acceptance rates (33%) and self-reported time savings (20%), then frames these as evidence that 'GitHub Copilot usage significantly contributed to the productivity of our developers.' Acceptance rate is a proxy for engagement, not productivity. Self-reported time savings are subjective estimates. The paper does not discuss what 'productivity' actually entails or how these metrics map to it. The conclusion claims '100s of 1000s of lines of production code contributed' without discussing whether more lines equals more productivity."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper states 'GitHub Copilot uses a version of Codex' but does not specify which version of Copilot or the underlying model was used during the evaluation period. No API version or snapshot date."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The paper evaluates GitHub Copilot as a black-box IDE tool. Developers use it through inline code completion, not explicit prompting."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "GitHub Copilot is used as a black-box tool with no user-configurable hyperparameters relevant to report."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool. Authors cannot describe internal scaffolding they have no access to."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No documentation of how acceptance rate data was collected, filtered, or processed. The paper mentions excluding languages with small numbers but does not specify the threshold or criteria. Survey data processing is also undocumented."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 11 'Limitations: Observed and Potential' provides substantive discussion of both observed and anticipated limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The limitations section discusses product limitations of Copilot (contextual understanding, security concerns, creativity) but does not discuss threats to the validity of the study itself — e.g., self-selection bias, survey response bias, Hawthorne effect, or limitations of acceptance rate as a productivity proxy."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While the paper is implicitly scoped to ZoomInfo, it does not explicitly state what the results do NOT show or what populations/settings are excluded."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data is available. Only aggregated tables and figures are presented."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Data collection is described: acceptance rates from GitHub Copilot's telemetry dashboard for Nov 14 - Dec 9 2024, survey data from quarterly pulse surveys using Likert scales, and trial survey from the two-week trial phase with 72 respondents."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 5.2 describes the trial recruitment: 'structured recruitment process', 'stratified voluntary sampling' across specializations, experience levels, locations, and tech stacks. 126 engineers (32% of developers) participated."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The pipeline from raw telemetry/survey data to reported figures is not documented. How acceptance rates were aggregated across developers, how satisfaction scores were computed, and filtering decisions are not detailed."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The paper contains no funding disclosure or acknowledgments section. All authors are ZoomInfo employees, but no statement about funding is made."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are clearly identified as ZoomInfo employees with ZoomInfo email addresses and affiliation."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "ZoomInfo has a financial interest in the outcome — they are evaluating a tool they purchased and deployed. Positive results justify the investment. This conflict is not acknowledged."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper does not evaluate a model's capability on a benchmark. It measures acceptance rates of an IDE tool in production use."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No benchmark evaluation. Contamination is not applicable to measuring acceptance rates of a deployed tool."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No benchmark evaluation is performed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No pre-registration mentioned. The study involves human participants (400+ developers and survey respondents)."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No IRB or ethics board approval is mentioned, despite collecting survey data and behavioral data from human participants."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "Minimal demographics. The paper mentions geographic distribution (US, Europe, India, Israel) and total developer count (~400) but does not report experience levels, gender, roles, or other demographic characteristics of participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "Section 5.2 describes prerequisites: completion of security training, written acknowledgment of compliance requirements, and commitment to provide feedback. The trial was stratified across specializations, experience levels, and locations."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is an observational deployment study, not an experimental study with randomized assignment to conditions."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Not an experimental study. Blinding is not feasible when evaluating a visible IDE tool."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Trial survey response rate is reported: 72 respondents out of 126 participants (57% response rate). The adoption curve (Fig. 1) shows license uptake over time."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No cost information is reported. The cost of GitHub Copilot for Business licenses for 400+ developers is not disclosed, nor is any cost-benefit analysis provided."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a deployment study of a third-party SaaS tool, not a compute-intensive research experiment."
    297       }
    298     }
    299   },
    300   "claims": [
    301     {
    302       "claim": "Average acceptance rate for suggestions is 33% and for lines is 20%",
    303       "evidence": "Fig. 2 and Fig. 4 show daily data from Nov 14 to Dec 9, 2024 with averages, standard deviations, and medians (Section 7).",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "GitHub Copilot has the highest developer satisfaction at 72% among all surveyed tools",
    308       "evidence": "Fig. 9 shows developer satisfaction statistics from quarterly pulse surveys, with Copilot leading other tools like Jenkins, SonarQube, ArgoCD (Section 10).",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "90% of respondents stated that GitHub Copilot reduces the amount of time to complete tasks with a median reduction of 20%",
    313       "evidence": "Self-reported survey data in Section 10. No objective measurement of time savings.",
    314       "supported": "weak"
    315     },
    316     {
    317       "claim": "Acceptance rates are consistent with industry reports from GitHub and other companies",
    318       "evidence": "Section 7 references GitHub's report [34] and Google's Q3 earnings [18], noting alignment. However, this is a descriptive comparison, not a formal test.",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "General-purpose languages (TypeScript, Java, Python) have higher acceptance rates than markup/query languages (HTML, CSS, JSON, SQL)",
    323       "evidence": "Fig. 5 and Fig. 7 show per-language data for 12 languages over a 26-day period (Section 8).",
    324       "supported": "moderate"
    325     }
    326   ],
    327   "red_flags": [
    328     {
    329       "flag": "Customer evaluating its own purchase",
    330       "detail": "All authors are ZoomInfo employees evaluating a tool ZoomInfo purchased. Positive results justify the procurement decision. This conflict is not acknowledged."
    331     },
    332     {
    333       "flag": "Acceptance rate as productivity proxy",
    334       "detail": "The paper uses acceptance rate as a 'better predictor of perceived productivity' based on GitHub's own recommendation [34]. However, acceptance rate measures adoption of suggestions, not actual productivity improvement. A developer could accept bad suggestions or reject good ones after reading them."
    335     },
    336     {
    337       "flag": "Self-reported productivity gains without baseline",
    338       "detail": "Claims like '90% report time savings' and '77% report quality improvement' are self-reported perceptions with no pre-deployment baseline, no control group, and no objective measurement."
    339     },
    340     {
    341       "flag": "Self-selection bias in trial",
    342       "detail": "The trial used 'stratified voluntary sampling' — participants self-selected, likely biasing toward developers already enthusiastic about AI tools. The 57% survey response rate introduces additional response bias."
    343     },
    344     {
    345       "flag": "No study validity threats discussed",
    346       "detail": "The limitations section (Section 11) discusses limitations of GitHub Copilot as a product, not limitations of the study methodology. Threats like novelty effect, Hawthorne effect, self-selection bias, and response bias are not addressed."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "The impact of AI tool on engineering at ANZ bank an empirical study on GitHub Copilot within corporate environment",
    352       "authors": [
    353         "S. Chatterjee",
    354         "C.L. Liu",
    355         "G. Rowland",
    356         "T. Hogarth"
    357       ],
    358       "year": 2024,
    359       "arxiv_id": "2402.05636",
    360       "relevance": "Similar enterprise deployment study of Copilot with controlled experiments and surveys at a bank."
    361     },
    362     {
    363       "title": "Evaluating large language models trained on code",
    364       "authors": [
    365         "M. Chen",
    366         "J. Tworek"
    367       ],
    368       "year": 2021,
    369       "arxiv_id": "2107.03374",
    370       "relevance": "Codex paper — the model behind GitHub Copilot, introduces HumanEval benchmark."
    371     },
    372     {
    373       "title": "Measuring GitHub Copilot's impact on productivity",
    374       "authors": [
    375         "A. Ziegler",
    376         "E. Kalliamvakou"
    377       ],
    378       "year": 2024,
    379       "relevance": "GitHub's own productivity measurement study; establishes acceptance rate as productivity proxy used in this paper."
    380     },
    381     {
    382       "title": "An empirical evaluation of GitHub Copilot's code suggestions",
    383       "authors": [
    384         "N. Nguyen",
    385         "S. Nadi"
    386       ],
    387       "year": 2022,
    388       "relevance": "Evaluates Copilot correctness (60% Java, 30% JavaScript) on LeetCode problems."
    389     },
    390     {
    391       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    392       "authors": [
    393         "F. Song",
    394         "A. Agarwal",
    395         "W. Wen"
    396       ],
    397       "year": 2024,
    398       "arxiv_id": "2410.02091",
    399       "relevance": "Measures Copilot's impact on OSS productivity (6.5% improvement) and integration time (42% increase)."
    400     },
    401     {
    402       "title": "Using GitHub Copilot for test generation in Python: An empirical study",
    403       "authors": [
    404         "K. El Haji",
    405         "C. Brandt",
    406         "A. Zaidman"
    407       ],
    408       "year": 2024,
    409       "relevance": "Evaluates Copilot for test generation, finding high failure rates (55-92%)."
    410     },
    411     {
    412       "title": "Evaluating the code quality of AI-assisted code generation tools: An empirical study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT",
    413       "authors": [
    414         "B. Yetistiren",
    415         "I. Ozsoy",
    416         "M. Ayerdem",
    417         "E. Tuzun"
    418       ],
    419       "year": 2023,
    420       "arxiv_id": "2304.10778",
    421       "relevance": "Comparative evaluation of multiple AI code generation tools including Copilot on HumanEval."
    422     },
    423     {
    424       "title": "GitHub Copilot AI pair programmer: Asset or liability?",
    425       "authors": [
    426         "A.M. Dakhel",
    427         "V. Majdinasab"
    428       ],
    429       "year": 2022,
    430       "arxiv_id": "2206.15331",
    431       "relevance": "Evaluates Copilot on algorithmic problems, finding it underperforms human programmers."
    432     },
    433     {
    434       "title": "On the robustness of code generation techniques: An empirical study on GitHub Copilot",
    435       "authors": [
    436         "A. Mastropaolo",
    437         "L. Pascarella"
    438       ],
    439       "year": 2023,
    440       "arxiv_id": "2302.00438",
    441       "relevance": "Studies robustness of Copilot to semantically equivalent prompts, finding ~50% inconsistency."
    442     },
    443     {
    444       "title": "The SPACE of developer productivity: There's more to it than you think",
    445       "authors": [
    446         "N. Forsgren",
    447         "M.-A. Storey",
    448         "C. Maddila",
    449         "T. Zimmermann"
    450       ],
    451       "year": 2021,
    452       "relevance": "Developer productivity measurement framework referenced for contextualizing productivity claims."
    453     },
    454     {
    455       "title": "The impact of large language models on open-source innovation: Evidence from GitHub Copilot",
    456       "authors": [
    457         "D. Yeverechyahu",
    458         "R. Mayya",
    459         "G. Oestreicher-Singer"
    460       ],
    461       "year": 2024,
    462       "arxiv_id": "2409.08379",
    463       "relevance": "Studies Copilot's impact on OSS collaboration, finding more maintenance than development contributions."
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Provides a concrete enterprise rollout playbook and real acceptance-rate benchmarks practitioners can compare against their own Copilot deployment."
    470     },
    471     "surprise_contrarian": {
    472       "score": 0,
    473       "justification": "All findings (33% acceptance, 20% time savings, high satisfaction) confirm widely reported industry numbers with no unexpected results."
    474     },
    475     "fear_safety": {
    476       "score": 0,
    477       "justification": "Security and IP risks are listed as speculative future concerns, not demonstrated or analyzed."
    478     },
    479     "drama_conflict": {
    480       "score": 0,
    481       "justification": "The paper is a straightforward positive endorsement of a vendor's product, with no controversy, critique, or challenge to any claims."
    482     },
    483     "demo_ability": {
    484       "score": 0,
    485       "justification": "No code, tools, or reproducible artifacts are released; results are internal telemetry and surveys."
    486     },
    487     "brand_recognition": {
    488       "score": 2,
    489       "justification": "GitHub Copilot is a widely known product, though ZoomInfo itself is not a major tech brand in the developer community."
    490     }
    491   }
    492 }

Impressum · Datenschutz