ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25485B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Experience with GitHub Copilot for Developer Productivity at Zoominfo",
      6     "authors": [
      7       "Gal Bakal",
      8       "Ali Dasdan",
      9       "Yaniv Katz",
     10       "Michael Kaufman",
     11       "Guy Levin"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2501.13282",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims (33% suggestion acceptance, 20% lines acceptance, 72% satisfaction, four-phase methodology, 400+ developers, language-specific variations) are directly supported by Figures 2, 4, 9, and the methodology sections.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper asserts GitHub Copilot 'significantly contributed to productivity' and that '90% report time savings', but the study is purely observational with no control group, no pre/post comparison, and Section 6 explicitly defers causality to a future paper.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are mostly bounded to 'medium-scale enterprise deployment' at Zoominfo, with explicit caveats that DORA metric causality is future work and that results align with (rather than supersede) prior industry reports.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are considered for observed acceptance rates or satisfaction scores — Hawthorne effect, selection bias (voluntary, enthusiastic participants), or Zoominfo's organizational investment in the tool's success are never discussed.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Section 6 explicitly acknowledges acceptance rate is used as a proxy because 'the impact of GitHub Copilot on developer productivity seems difficult to measure' and cites the GitHub paper recommending it as a 'better predictor of perceived productivity.'",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 11 'Limitations: Observed and Potential' is a dedicated section listing contextual understanding failures, security concerns, creativity limits, and a set of potential future limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Section 11 discusses limitations of the tool (domain-specific logic, security), not threats to the study's validity — selection bias, Hawthorne effect, voluntary participation skew, and lack of control group are never mentioned.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly scopes to 'medium-scale enterprise deployment' and states that causal relationships with DORA metrics are not yet established and will be reported separately.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source or acknowledgment section is present; the GitHub Copilot licenses were purchased by Zoominfo but this is not framed as a disclosure.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list Zoominfo affiliation and Zoominfo email addresses on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Zoominfo employees are evaluating a paid tool their company deployed; the organization has a financial and reputational interest in a positive outcome, making it not independent.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 3 defines 'developer productivity' as output per input unit; Section 6 defines 'acceptance rate of shown suggestions' precisely; Section 10 defines 'DevSat' as a net-sentiment score.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction lists five explicit research questions and frames the contribution as a medium-scale enterprise deployment case study filling a gap in empirical evidence.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 12 is a comprehensive related work section that compares findings to GitHub's own productivity paper, ANZ Bank deployment, open-source studies, code correctness studies, and tool comparisons, situating the work within the literature.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No analysis scripts, survey instruments, or data processing code are released; the paper only references a ServiceNow workflow and GitHub's telemetry dashboard.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The acceptance rate telemetry and developer survey response data are not publicly released.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": false,
    135           "answer": false,
    136           "justification": "This is an observational deployment study of a commercial tool; no experimental environment, dependencies, or software stack requiring specification exists.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": false,
    141           "answer": false,
    142           "justification": "Reproduction of a specific company's internal deployment study is not feasible, making this criterion not applicable.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Standard deviations are reported in Figure 2 for daily aggregate counts, but no confidence intervals are computed for the main reported acceptance rates (33%, 20%) or satisfaction score (72%).",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used despite comparative claims (e.g., language-to-language acceptance rate differences, IDE comparisons, satisfaction score claims).",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Acceptance rates (33% suggestions, 20% lines), time savings (20% median reduction), and satisfaction (72%) are reported as percentages with industry comparison context from GitHub and Google.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The trial used 126 of 400+ engineers ('about 32%') but no power analysis or justification for why this sample size is sufficient is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Figure 2 explicitly reports standard deviations for all daily metrics including suggestion counts and acceptance rates across the 26-day period.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "There is no within-study control condition; informal references to GitHub's and Google's reported acceptance rates serve as external comparisons but not controlled baselines.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "No proper baselines are included in the study design, making this criterion not applicable.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "The study evaluates a single commercial tool as a monolith; ablation is not applicable.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The study uses suggestion acceptance rate, lines acceptance rate, developer satisfaction (DevSat), qualitative survey free-text, and per-language and per-IDE breakdowns.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Section 10 presents developer satisfaction surveys (Likert scale + free-form) where developers directly evaluate Copilot's outputs and impact on their work.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is a production deployment observational study, not a prediction task; held-out test sets are not applicable.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by programming language (Figures 5-7, 12 languages) and by IDE (Figure 8, JetBrains vs VS Code).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 11 describes observed failures (domain-specific logic, security risks, creativity limitations) and the qualitative section includes a negative developer quote and reports on cases requiring modification.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Lower acceptance rates for HTML, CSS, JSON, SQL are explicitly flagged and unexplained; qualitative negatives are quoted; 92% of generated tests failing outside test suites is cited from related work.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper refers only to 'GitHub Copilot' by marketing name without specifying any model version, snapshot date, or which underlying LLM version was active during Nov-Dec 2024.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "GitHub Copilot is evaluated as a black-box IDE plugin; no custom prompts are constructed or controlled by the researchers.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": false,
    249           "answer": false,
    250           "justification": "Commercial black-box tool evaluation — hyperparameters are not accessible or configurable by the researchers.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding; GitHub Copilot is used as a standard IDE plugin without custom orchestration.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "The paper states data comes from GitHub Copilot's telemetry dashboard but does not document how weekend/weekday splits were computed, how languages were categorized, or how partial acceptances were handled beyond a brief definition.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw telemetry data or survey response data is made publicly available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper describes data collection: telemetry from GitHub Copilot dashboard over Nov 14-Dec 9 2024 (26 days), and quarterly developer satisfaction surveys with Likert scale questions since Q2 2024.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Section 5.2 describes stratified voluntary sampling with explicit prerequisites (security training, compliance acknowledgments), formal application process, and tracking via unique participant identifiers for the 126-person trial.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "The path from GitHub Copilot telemetry API to the reported figures is not documented; no data extraction, aggregation, or analysis scripts are described or released.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This study measures developer acceptance rates in production use, not model capability on benchmarks; training cutoff is irrelevant.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not a benchmark evaluation; train-test overlap is not applicable.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "No benchmark evaluation is conducted; contamination is not applicable.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned; this was an internal corporate evaluation, not a pre-registered academic study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics approval is mentioned despite collecting developer behavior data and survey responses from human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Only geographic distribution (US, Europe, India, Israel) and broad technical role stratification are mentioned; no age, gender, years of experience, or other standard demographic breakdowns are reported.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Section 5.2 lists explicit inclusion criteria: completion of security training, written acknowledgment of five compliance documents, and commitment to provide structured post-trial feedback.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "Participation was voluntary with stratified sampling; no randomization of participants to treatment/control conditions was used.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding was possible or attempted; all participants knew they were using and being evaluated on GitHub Copilot.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": true,
    352           "justification": "The paper reports 126 trial participants and 72 survey respondents, explicitly noting a 57% response rate, which constitutes attrition reporting.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "License procurement is mentioned but per-query cost, latency, or total inference cost is never reported.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": false,
    365           "answer": false,
    366           "justification": "No model training or self-hosted inference; compute budget is not applicable for a commercial SaaS tool evaluation.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Average acceptance rate of 33% for suggestions and 20% for lines of code over a 26-day production period",
    375       "evidence": "Figures 2 and 4 show daily telemetry from Nov 14 to Dec 9, 2024, with averages computed across ~400 developers",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Developer satisfaction with GitHub Copilot is 72%, the highest among all evaluated tools",
    380       "evidence": "Figure 9 shows quarterly developer satisfaction survey results comparing GitHub Copilot against Jenkins, SonarQube, ArgoCD, and Backstage",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "90% of surveyed developers report that GitHub Copilot reduces task completion time, with a median reduction of 20%",
    385       "evidence": "Section 10 reports this from developer satisfaction surveys; self-reported, no objective time measurement",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Top four languages (TypeScript, Java, Python, JavaScript) sustain approximately 30% acceptance rates",
    390       "evidence": "Figures 5-7 show the per-language breakdown; these four languages also cover ~80-85% of total suggestions",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "HTML, CSS, JSON, and SQL show meaningfully lower acceptance rates than general-purpose languages",
    395       "evidence": "Figures 5 and 7 show rates ranging from 14% to 32%, with HTML/CSS/JSON/SQL at the lower end; no statistical test confirms significance",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "GitHub Copilot significantly contributed to developer productivity at Zoominfo",
    400       "evidence": "Acceptance rates and satisfaction surveys are cited; authors explicitly acknowledge in Section 6 that causality has not been established and is deferred to future work",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "observational",
    406     "case-study",
    407     "qualitative"
    408   ],
    409   "key_findings": "A four-phase deployment of GitHub Copilot across 400+ Zoominfo developers yielded consistent acceptance rates of 33% (suggestions) and 20% (lines) over a 26-day production window in late 2024, with high developer satisfaction (72% DevSat, highest among evaluated tools). Language-specific variations were observed, with general-purpose languages achieving ~30% acceptance while HTML, CSS, JSON, and SQL underperformed; IDE differences were also noted (VS Code had ~50% higher lines acceptance rate than JetBrains). Developer surveys report 20% median time savings and high satisfaction with boilerplate/test generation, but causal attribution to actual productivity remains unestablished pending DORA metric analysis.",
    410   "red_flags": [
    411     {
    412       "flag": "Causal productivity claim without causal design",
    413       "detail": "The paper claims Copilot 'significantly contributed to productivity' but uses only observational acceptance rates with no control group, pre/post design, or counterfactual. The authors themselves defer causal claims to future work."
    414     },
    415     {
    416       "flag": "Self-evaluating company employees",
    417       "detail": "All authors are Zoominfo employees evaluating a tool their company paid for and deployed; no independence mechanism, no competing interests statement."
    418     },
    419     {
    420       "flag": "Voluntary participant selection bias",
    421       "detail": "Trial participants were volunteers who applied and met compliance prerequisites — systematically more enthusiastic about the tool than average developers, biasing satisfaction and acceptance results upward."
    422     },
    423     {
    424       "flag": "Model version unspecified",
    425       "detail": "The paper refers to 'GitHub Copilot' throughout without specifying any model version or snapshot date, making the evaluation unreproducible and temporally ambiguous."
    426     },
    427     {
    428       "flag": "No IRB for human study",
    429       "detail": "Developer behavior and survey data were collected from human participants with no mention of ethics review or IRB approval."
    430     },
    431     {
    432       "flag": "No statistical significance testing",
    433       "detail": "Language-to-language and IDE comparisons are presented as factual differences without any tests of statistical significance."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Measuring GitHub Copilot's Impact on Productivity",
    439       "relevance": "Foundational paper (Ziegler et al., CACM 2024) establishing acceptance rate as the primary productivity proxy metric — directly adopted by this study"
    440     },
    441     {
    442       "title": "The Impact of AI Tool on Engineering at ANZ Bank: An Empirical Study on GitHub Copilot within Corporate Environment",
    443       "relevance": "Most directly comparable prior work: similar enterprise deployment, ~1000 engineers, controlled experiment design, reports 40-50% productivity boost"
    444     },
    445     {
    446       "title": "The SPACE of Developer Productivity: There's More to It Than You Think",
    447       "relevance": "Framework paper defining multidimensional developer productivity metrics used to contextualize what Copilot's acceptance rates do and don't measure"
    448     },
    449     {
    450       "title": "The Impact of Generative AI on Collaborative Open-Source Software Development: Evidence from GitHub Copilot",
    451       "relevance": "Quantifies Copilot's effect on open-source project productivity (+6.5% code contributions) with the negative finding of +42% integration time"
    452     },
    453     {
    454       "title": "GitHub Copilot AI Pair Programmer: Asset or Liability?",
    455       "relevance": "Empirical evaluation of Copilot on algorithmic tasks, finding performance below human programmers — provides contrast to enterprise deployment positive results"
    456     },
    457     {
    458       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    459       "relevance": "Finds ~60% correctness for Java and ~30% for JavaScript on LeetCode problems — directly relevant benchmark for interpreting Zoominfo's language-specific acceptance rates"
    460     },
    461     {
    462       "title": "DevEx: What Actually Drives Productivity",
    463       "relevance": "Developer experience framework providing the conceptual basis for developer satisfaction as a productivity metric alongside DORA metrics"
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 3,
    469       "justification": "Directly actionable for engineering leaders evaluating Copilot: four-phase deployment methodology, compliance framework, and language-specific acceptance benchmarks are immediately applicable."
    470     },
    471     "surprise_contrarian": {
    472       "score": 1,
    473       "justification": "Findings largely confirm GitHub's own reported acceptance rates and prior enterprise studies; no surprising reversals or counter-intuitive findings beyond the unexplained weekend rate increase."
    474     },
    475     "fear_safety": {
    476       "score": 1,
    477       "justification": "Security risks from auto-generated code are mentioned in limitations, but treated as a process concern rather than a serious safety finding."
    478     },
    479     "drama_conflict": {
    480       "score": 0,
    481       "justification": "No controversy; the tone is positive throughout, with the company evaluating its own successful deployment."
    482     },
    483     "demo_ability": {
    484       "score": 2,
    485       "justification": "GitHub Copilot is a widely available commercial product that practitioners can immediately try using the same IDE plugins described."
    486     },
    487     "brand_recognition": {
    488       "score": 2,
    489       "justification": "GitHub Copilot is a high-recognition product; Zoominfo is a publicly traded enterprise software company with broad name recognition in B2B circles."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }

Impressum · Datenschutz