ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23790B)


      1 {
      2   "paper": {
      3     "title": "Usage, Effects and Requirements for AI Coding Assistants in the Enterprise: An Empirical Study",
      4     "authors": ["Maja Vukovic", "Rangeet Pan", "Tin Kam Ho", "Rahul Krishna", "Raju Pavuluri", "Michele Merler"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.20112"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No code or analysis scripts are released. No repository URL is provided."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "Survey responses are not released. The 35-paper corpus is described but no structured dataset is provided."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No environment or dependency information provided. The paper mentions using Gemini 2.5 Pro and Claude Sonnet 4 for LLM-based summarization but provides no setup details."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No reproduction instructions are provided. The survey questionnaire is included in Appendix A, but there are no instructions for reproducing the analysis."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Results are reported as raw counts and percentages (e.g., '88% perceived productivity gains', '44 out of 57 reported at least 25% increase') with no confidence intervals or error bars."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper makes comparative claims across divisions and experience levels (e.g., differences in motivations across divisions, Figure 6) but uses no statistical tests to determine whether differences are significant."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Productivity gains are reported as self-reported percentages (25%, 50%, 100%) without any standardized effect sizes or baseline context beyond raw numbers."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "N=57 for the primary survey with no justification for sample size, no power analysis, and no discussion of whether this sample is sufficient for the subgroup comparisons attempted."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No variance, standard deviation, or spread measures are reported. Only raw counts and percentages are given."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper compares its own 57-person survey results against 35 prior surveys (Section 3), providing a baseline of existing knowledge about AI coding assistant usage."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The 35 reviewed surveys are from 2023-2025, which are contemporary for the topic. Selection criteria explicitly required recency."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is a survey study, not a system with components to ablate."
     76       },
     77       "multiple_metrics": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "This is a descriptive survey, not an evaluation with metrics."
     81       },
     82       "human_evaluation": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "The paper itself is a survey collecting human opinions; there is no system output to evaluate."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Not applicable to survey research."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are broken down by division (Figures 6, 7), years of experience (Figure 3b), programming language proficiency (Figure 3c), and feature categories (Figure 9)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 4.3.6 specifically discusses users who perceived no gain (13 of 57, 22%), including detailed reasons: limitations of the assistant, correctness issues, prompting overhead."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper reports that 22% of users found no productivity gain, that AI tools made debugging harder for some, and that experienced developers see discrepancies between perceived and measured productivity."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims the paper surveys 57 developers and reviews 35 surveys, and discusses requirements for AI coding assistants. These are all supported by the paper content."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper uses causal language like 'AI coding assistants are helping drive the productivity' (Section 5) from observational self-report data without addressing confounds such as selection bias (users who adopt tools may already be more productive) or placebo effects."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The title says 'in the Enterprise' broadly, but the sample is 57 developers from a single company (IBM). The paper does not bound its conclusions to this specific organizational context."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "No alternative explanations are discussed for the findings. For example, the high perceived productivity could reflect novelty effects, self-selection bias, or demand characteristics from surveying within the employer organization."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper measures self-reported perceived productivity gains and presents them as actual productivity improvements without distinguishing between perceived and measured productivity. The related work acknowledges 'discrepancies between perceived and measured productivity' (Section 2) but does not apply this insight to its own findings."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper mentions using 'Gemini 2.5 pro and Claude Sonnet 4' for LLM-based summarization of surveys but provides no version IDs or snapshot dates."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Full prompts used for LLM-based analysis of the 35 surveys are provided in Tables 1 and 2 (Supplementary Material), including the complete extraction instructions for both Gemini and Claude."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "No hyperparameters (temperature, top-p, etc.) are reported for the LLM-based analysis pipeline."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The LLMs are prompted directly for information extraction."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3.1 describes the search and selection criteria for the 35 surveys, and Section 3.2 describes the LLM-based extraction and human verification methodology. The pipeline is shown in Figure 1. Filtering criteria (recency, practitioner surveys, SE-specific, publicly available) are stated, with counts (approximately 50 → 35)."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no dedicated limitations or threats-to-validity section. The paper ends with a brief conclusion (Section 5) that mentions future work but does not discuss study limitations."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "No specific threats to validity are discussed anywhere in the paper."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that findings from 57 IBM employees may not generalize to other enterprises."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Neither the survey responses nor the extracted data from the 35 reviewed papers are made available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.1 states the survey was launched in May 2025 within a tech company, collected via an online form with 25 questions and 57 responses. The full questionnaire is in Appendix A."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper says 57 responses were received from within a 'tech company' but does not describe how participants were recruited, whether it was voluntary, how they were invited, or what the response rate was."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper does not describe how free-text survey responses were coded into categories. Section 4.3.2 mentions responses were 'manually mapped' to themes but the mapping criteria and process are not documented."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding or acknowledgments section is present in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All six authors are clearly listed as IBM Research, Yorktown Heights. The paper surveys IBM employees about AI coding assistants including IBM's own watsonx Code Assistant."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "IBM Research employees surveyed IBM employees about AI coding assistants including IBM's watsonx Code Assistant. IBM has a financial interest in positive findings about enterprise AI coding assistants."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present. IBM authors evaluating IBM's own product (watsonx Code Assistant is prominently featured in Figure 5) without declaring conflicts."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate a pre-trained model on any benchmark. It is a survey study."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Not applicable — no model benchmark evaluation is performed."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not applicable — no model benchmark evaluation is performed."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No pre-registration is mentioned for this survey study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No IRB or ethics board approval is mentioned despite surveying 57 human participants."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Detailed demographics are provided: business unit breakdown (Figure 3a), years of SE experience (Figure 3b), programming language proficiency (Figure 3c), IDE usage (Figure 4)."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No inclusion or exclusion criteria are stated for participant selection. The paper says 57 responses were received but does not describe who was eligible or how participants were selected."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "This is a cross-sectional survey, not an experimental study with randomized conditions."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "Not applicable to a cross-sectional survey."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Some questions have different response counts (e.g., '35 respondents to this question' in Section 4.3.3 vs. 57 total) but no systematic reporting of attrition or explanation for missing responses."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a survey paper; cost reporting is not applicable."
    282       },
    283       "compute_budget_stated": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "This is a survey paper; compute budget is not applicable."
    287       }
    288     },
    289     "survey_methodology": {
    290       "prisma_or_structured_protocol": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper describes a web search process (Section 3.1) with filtering criteria but does not follow PRISMA or any named review protocol. No flow diagram with counts at each filtering stage is provided. The search terms are listed but not as reproducible database queries."
    294       },
    295       "quality_assessment_of_sources": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The 35 reviewed surveys are not assessed for methodological quality. All are treated equally regardless of sample size (ranging from 10 to 17,420), study design, or rigor."
    299       },
    300       "publication_bias_discussed": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No discussion of publication bias. The paper does not consider whether the 35 surveyed studies skew toward positive findings about AI coding assistants."
    304       }
    305     }
    306   },
    307   "claims": [
    308     {
    309       "claim": "88% of surveyed developers perceived productivity gains when using AI coding assistants",
    310       "evidence": "Section 4.3.3: '88% of them perceived productivity gains when using AI Coding assistant. Among the 57 responding users, 44 of them reported at least a 25% increase in productivity.'",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "73.7% of practitioners reported that AI code assistants transformed their development process",
    315       "evidence": "Section 4.3.5: '42 out of 57 (73.7%), reported that the use of AI code assistants has transformed their development process.'",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "There is no 'one size fits all' motivating use case across users of AI coding assistants",
    320       "evidence": "Section 4.3.2 and Figure 6 show different motivation distributions across divisions (Research focused on productivity, Sales balanced with code quality, Consulting emphasizing code understanding).",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "Over 65% of respondents replaced Google Search and StackOverflow with AI coding assistants",
    325       "evidence": "Section 4.3.3: 'Over 65% respondents reported that they previously relied on Google Search, followed by StackOverflow.'",
    326       "supported": "weak"
    327     },
    328     {
    329       "claim": "Most existing surveys are dominated by ChatGPT and GitHub Copilot, leaving understanding of other assistants limited",
    330       "evidence": "Section 3.3 and Figure 2a show ChatGPT and GitHub Copilot dominate the distribution of studied tools across 35 surveys.",
    331       "supported": "strong"
    332     }
    333   ],
    334   "methodology_tags": ["observational", "meta-analysis", "qualitative"],
    335   "key_findings": "A survey of 57 enterprise developers at IBM found 88% perceived productivity gains from AI coding assistants, with notable variation in motivations and valued features across business divisions. A companion review of 35 prior surveys revealed that existing research is dominated by ChatGPT and GitHub Copilot studies, focuses on short-term productivity rather than long-term quality, and draws from homogeneous participant pools. Users requested deeper context awareness, full customization, and evolution toward autonomous agent capabilities as key requirements for future AI coding assistants.",
    336   "red_flags": [
    337     {
    338       "flag": "Company evaluating its own product",
    339       "detail": "All six authors are from IBM Research. IBM's watsonx Code Assistant is the most-used tool among the surveyed 57 IBM employees (Figure 5). The paper does not acknowledge this conflict of interest."
    340     },
    341     {
    342       "flag": "Single-company non-representative sample presented broadly",
    343       "detail": "The 57 respondents are all from a single tech company (IBM), yet the title claims 'Enterprise' generality. No limitations section acknowledges this restriction. The sample is heavily skewed toward experienced developers (42% have >20 years of experience)."
    344     },
    345     {
    346       "flag": "Self-reported productivity treated as actual productivity",
    347       "detail": "The paper reports perceived productivity gains (88%) as evidence of actual productivity improvement. Section 2 acknowledges 'discrepancies between perceived and measured productivity for experienced developers' but does not apply this caveat to its own findings."
    348     },
    349     {
    350       "flag": "No limitations section",
    351       "detail": "The paper has no dedicated limitations or threats-to-validity section despite multiple obvious threats: single-company sample, self-selection bias, demand characteristics from employer-administered survey, self-reported outcomes."
    352     },
    353     {
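    354       "flag": "Inconsistent denominators in reported percentages",
    355       "detail": "Section 4.3.3 states '44 out of 57 users (97.8% of the users who answered on the perceived gain)'. 44/57 = 77.2%, not 97.8%; the 97.8% figure matches 44/45, which suggests a denominator of 45 respondents who answered that question, but this denominator is not stated in the quoted passage and the two bases are mixed in a single sentence."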
    356     },
    357     {
    358       "flag": "LLM-assisted analysis without validation methodology",
    359       "detail": "The paper used Gemini 2.5 Pro and Claude Sonnet 4 to extract information from 35 papers, then 'manually verified' the results. No inter-rater reliability, no description of what manual verification entailed, and no error rate reported."
    360     }
    361   ],
    362   "cited_papers": [
    363     {
    364       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    365       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    366       "year": 2025,
    367       "arxiv_id": "2507.09089",
    368       "relevance": "RCT finding discrepancies between perceived and measured productivity for experienced developers using AI coding tools."
    369     },
    370     {
    371       "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace",
    372       "authors": ["Jenna Butler", "Jina Suh", "Sankeerti Haniyur", "Constance Hadley"],
    373       "year": 2024,
    374       "arxiv_id": "2410.18334",
    375       "relevance": "RCT of GitHub Copilot finding increased belief in usefulness but no statistically significant changes in objective telemetry metrics."
    376     },
    377     {
    378       "title": "How Much Does AI Impact Development Speed? an Enterprise-Based Randomized Controlled Trial",
    379       "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"],
    380       "year": 2025,
    381       "doi": "10.1109/ICSE-SEIP66354.2025.00060",
    382       "relevance": "Enterprise RCT finding developers were ~21% faster with AI but statistical significance reduced after controlling for other factors."
    383     },
    384     {
    385       "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
    386       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir"],
    387       "year": 2025,
    388       "doi": "10.1145/3716848",
    389       "relevance": "Empirical study of security vulnerabilities in Copilot-generated code in real GitHub projects."
    390     },
    391     {
    392       "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers",
    393       "authors": ["Zheyuan Cui", "Mert Demirer", "Sonia Jaffe"],
    394       "year": 2025,
    395       "doi": "10.2139/ssrn.4945566",
    396       "relevance": "Three field experiments showing less experienced developers benefit more from AI coding tools."
    397     },
    398     {
    399       "title": "Do Users Write More Insecure Code with AI Assistants?",
    400       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    401       "year": 2023,
    402       "doi": "10.1145/3576915.3623157",
    403       "relevance": "CCS study finding participants with AI access wrote significantly less secure code and were more overconfident."
    404     },
    405     {
    406       "title": "Examining the Use and Impact of an AI Code Assistant on Developer Productivity and Experience in the Enterprise",
    407       "authors": ["Justin D. Weisz", "Shraddha Kumar", "Michael Muller"],
    408       "year": 2025,
    409       "arxiv_id": "2412.06603",
    410       "relevance": "Large-scale enterprise study of IBM watsonx Code Assistant finding unevenly distributed productivity gains."
    411     },
    412     {
    413       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    414       "authors": ["Agnia Sergeyuk", "Yaroslav Golubev", "Timofey Bryksin", "Iftekhar Ahmed"],
    415       "year": 2025,
    416       "doi": "10.1016/j.infsof.2024.107610",
    417       "relevance": "Large survey (481 programmers) on AI coding assistant usage patterns and reasons for non-adoption."
    418     },
    419     {
    420       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    421       "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"],
    422       "year": 2024,
    423       "doi": "10.1145/3597503.3608128",
    424       "relevance": "410-developer survey on usability factors and challenges of AI programming assistants."
    425     },
    426     {
    427       "title": "Measuring GitHub Copilot's Impact on Productivity",
    428       "authors": ["Albert Ziegler", "Eirini Kalliamvakou"],
    429       "year": 2024,
    430       "doi": "10.1145/3633453",
    431       "relevance": "Large-scale study (17,420 users) measuring Copilot's productivity impact through acceptance rates."
    432     },
    433     {
    434       "title": "Maybe We Need Some More Examples: Individual and Team Drivers of Developer GenAI Tool Use",
    435       "authors": ["Courtney Miller", "Rudrajit Choudhuri"],
    436       "year": 2025,
    437       "arxiv_id": "2507.21280",
    438       "relevance": "Study finding organizational expectations for rapid AI productivity gains limited by uneven adoption across teams."
    439     }
    440   ]
    441 }

Impressum · Datenschutz