scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (27203B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Usage, Effects and Requirements for AI Coding Assistants in the Enterprise: An Empirical Study",
      6     "authors": [
      7       "Maja Vukovic",
      8       "Rangeet Pan",
      9       "Tin Kam Ho",
     10       "Rahul Krishna",
     11       "Raju Pavuluri",
     12       "Michele Merler"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv",
     16     "arxiv_id": "2601.20112",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims the paper surveys 57 developers and reviews 35 surveys, and discusses requirements for AI coding assistants. These are all supported by the paper content.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper uses causal language like 'AI coding assistants are helping drive the productivity' (Section 5) from observational self-report data without addressing confounds such as selection bias (users who adopt tools may already be more productive) or placebo effects.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title says 'in the Enterprise' broadly, but the sample is 57 developers from a single company (IBM). The paper does not bound its conclusions to this specific organizational context.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed for the findings. For example, the high perceived productivity could reflect novelty effects, self-selection bias, or demand characteristics from surveying within the employer organization.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper measures self-reported perceived productivity gains and presents them as actual productivity improvements without distinguishing between perceived and measured productivity. The related work acknowledges 'discrepancies between perceived and measured productivity' (Section 2) but does not apply this insight to its own findings.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no dedicated limitations or threats-to-validity section. The paper ends with a brief conclusion (Section 5) that mentions future work but does not discuss study limitations.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that findings from 57 IBM employees may not generalize to other enterprises.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding or acknowledgments section is present in the paper.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors are clearly listed as IBM Research, Yorktown Heights. The paper surveys IBM employees about AI coding assistants including IBM's own watsonx Code Assistant.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "IBM Research employees surveyed IBM employees about AI coding assistants including IBM's watsonx Code Assistant. IBM has a financial interest in positive findings about enterprise AI coding assistants.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present. IBM authors evaluating IBM's own product (watsonx Code Assistant is prominently featured in Figure 5) without declaring conflicts.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "'Productivity' is used interchangeably for perceived and actual productivity without definition. 'Enterprise' is not defined or bounded relative to academic or small-business contexts.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The introduction explicitly states the paper contributes a cross-divisional survey of AI coding assistant usage and derives requirements for future assistants.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 provides substantive related work discussion and Section 3 conducts a meta-analysis of 35 prior surveys, actively situating the contribution relative to existing gaps.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No code or analysis scripts are released. No repository URL is provided.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "Survey responses are not released. The 35-paper corpus is described but no structured dataset is provided.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment or dependency information is provided. The paper mentions using Gemini 2.5 Pro and Claude Sonnet 4 for LLM-based summarization but provides no setup details.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No reproduction instructions are provided. The survey questionnaire is included in Appendix A, but there are no instructions for reproducing the analysis.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Results are reported as raw counts and percentages (e.g., '88% perceived productivity gains', '44 out of 57 reported at least 25% increase') with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper makes comparative claims across divisions and experience levels (e.g., differences in motivations across divisions, Figure 6) but uses no statistical tests to determine whether differences are significant.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "Productivity gains are reported as self-reported percentages (25%, 50%, 100%) without any standardized effect sizes or baseline context beyond raw numbers.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "N=57 for the primary survey with no justification for sample size, no power analysis, and no discussion of whether this sample is sufficient for the subgroup comparisons attempted.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or spread measures are reported. Only raw counts and percentages are given.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The paper compares its own 57-person survey results against 35 prior surveys (Section 3), providing a baseline of existing knowledge about AI coding assistant usage.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The 35 reviewed surveys are from 2023-2025, which are contemporary for the topic. Selection criteria explicitly required recency.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is a survey study, not a system with components to ablate.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": false,
    200           "answer": false,
    201           "justification": "This is a descriptive survey, not an evaluation with metrics.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "The paper itself is a survey collecting human opinions; there is no system output to evaluate.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Not applicable to survey research.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by division (Figures 6, 7), years of experience (Figure 3b), programming language proficiency (Figure 3c), and feature categories (Figure 9).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 4.3.6 specifically discusses users who perceived no gain (13 of 57, 22%), including detailed reasons: limitations of the assistant, correctness issues, prompting overhead.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that 22% of users found no productivity gain and that AI tools made debugging harder for some, and it acknowledges in its related work (Section 2) that prior studies found discrepancies between perceived and measured productivity for experienced developers.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "The paper mentions using 'Gemini 2.5 pro and Claude Sonnet 4' for LLM-based summarization of surveys but provides no version IDs or snapshot dates.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Full prompts used for LLM-based analysis of the 35 surveys are provided in Tables 1 and 2 (Supplementary Material), including the complete extraction instructions for both Gemini and Claude.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No hyperparameters (temperature, top-p, etc.) are reported for the LLM-based analysis pipeline.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The LLMs are prompted directly for information extraction.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3.1 describes the search and selection criteria for the 35 surveys, and Section 3.2 describes the LLM-based extraction and human verification methodology. The pipeline is shown in Figure 1. Filtering criteria (recency, practitioner surveys, SE-specific, publicly available) are stated, with counts (approximately 50 → 35).",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Neither the survey responses nor the extracted data from the 35 reviewed papers are made available.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 4.1 states the survey was launched in May 2025 within a tech company, collected via an online form with 25 questions and 57 responses. The full questionnaire is in Appendix A.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "The paper says 57 responses were received from within a 'tech company' but does not describe how participants were recruited, whether it was voluntary, how they were invited, or what the response rate was.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The paper does not describe how free-text survey responses were coded into categories. Section 4.3.2 mentions responses were 'manually mapped' to themes but the mapping criteria and process are not documented.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "This paper does not evaluate a pre-trained model on any benchmark. It is a survey study.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable — no model benchmark evaluation is performed.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable — no model benchmark evaluation is performed.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for this survey study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics board approval is mentioned despite surveying 57 human participants.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Detailed demographics are provided: business unit breakdown (Figure 3a), years of SE experience (Figure 3b), programming language proficiency (Figure 3c), IDE usage (Figure 4).",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "No inclusion or exclusion criteria are stated for participant selection. The paper says 57 responses were received but does not describe who was eligible or how participants were selected.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "This is a cross-sectional survey, not an experimental study with randomized conditions.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "Not applicable to a cross-sectional survey.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "Some questions have different response counts (e.g., '35 respondents to this question' in Section 4.3.3 vs. 57 total) but no systematic reporting of attrition or explanation for missing responses.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": false,
    360           "answer": false,
    361           "justification": "This is a survey paper; cost reporting is not applicable.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": false,
    366           "answer": false,
    367           "justification": "This is a survey paper; compute budget is not applicable.",
    368           "source": "opus"
    369         }
    370       },
    371       "survey_methodology": {
    372         "prisma_or_structured_protocol": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "The paper describes a web search process (Section 3.1) with filtering criteria but does not follow PRISMA or any named review protocol. No flow diagram with counts at each filtering stage is provided. The search terms are listed but not as reproducible database queries.",
    376           "source": "opus"
    377         },
    378         "quality_assessment_of_sources": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The 35 reviewed surveys are not assessed for methodological quality. All are treated equally regardless of sample size (ranging from 10 to 17,420), study design, or rigor.",
    382           "source": "opus"
    383         },
    384         "publication_bias_discussed": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No discussion of publication bias. The paper does not consider whether the 35 surveyed studies skew toward positive findings about AI coding assistants.",
    388           "source": "opus"
    389         }
    390       }
    391     }
    392   },
    393   "claims": [
    394     {
    395       "claim": "88% of surveyed IBM enterprise developers perceived productivity gains from AI coding assistants.",
    396       "evidence": "Survey of 57 IBM developers; 50 (88%) answered yes to a binary productivity gain question. No objective measurement or control group.",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "44 out of 57 users reported at least a 25% increase in productivity.",
    401       "evidence": "Figure 8 and Section 4.3.3. The parenthetical '97.8% of users who answered on perceived gain' implies only ~45 of 57 answered this sub-question, making the '44 out of 57' framing misleading.",
    402       "supported": "weak"
    403     },
    404     {
    405       "claim": "Usage motivations and desired features differ significantly across enterprise divisions.",
    406       "evidence": "Figures 6 and 7 show cross-division distributions; e.g., Research cites productivity exclusively while Consulting emphasizes code understanding. Differences are descriptive, no statistical tests.",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "Prior survey literature is dominated by ChatGPT and GitHub Copilot, with significant gaps in diversity, longitudinal design, and agentic workflow coverage.",
    411       "evidence": "Meta-analysis of 35 surveys; Figure 2a shows 12+ surveys covering ChatGPT. Gap analysis in Section 3.3 is systematic and well-supported.",
    412       "supported": "strong"
    413     },
    414     {
    415       "claim": "73.7% of users report that AI coding assistants transformed their development process.",
    416       "evidence": "Section 4.3.5: '42 out of 57 (73.7%) reported transformation.' Self-report only; no behavioral validation.",
    417       "supported": "weak"
    418     },
    419     {
    420       "claim": "Future AI coding assistants require full repository awareness, deep customization, and autonomous software lifecycle management.",
    421       "evidence": "Section 4.3.7 and Figure 9 aggregate free-text user responses into short-term and long-term feature themes. Qualitative and internally consistent.",
    422       "supported": "moderate"
    423     }
    424   ],
    425   "methodology_tags": [
    426     "observational",
    427     "qualitative",
    428     "meta-analysis"
    429   ],
    430   "key_findings": "A survey of 57 IBM enterprise developers found 88% perceived productivity gains from AI coding assistants, with 44 reporting at least 25% improvement, primarily in code generation, understanding, and web-search replacement. A concurrent meta-analysis of 35 prior surveys reveals the literature is dominated by ChatGPT and GitHub Copilot, uses homogeneous participant pools, focuses on immediate tasks over long-term outcomes, and entirely lacks agentic workflow analysis. Cross-divisional analysis shows motivation and valued features vary substantially by role, arguing against one-size-fits-all assistants. Future requirements center on full repository context, compliance-aware customization, and autonomous software development lifecycle support.",
    431   "red_flags": [
    432     {
    433       "flag": "Single-company, non-random sample generalized to 'enterprise'",
    434       "detail": "All 57 participants are from IBM, yet the paper's title and conclusions frame findings as representative of enterprise settings broadly. IBM employees have atypical tool access (including IBM's own watsonx) and organizational context."
    435     },
    436     {
    437       "flag": "IBM studying its own product without conflict-of-interest disclosure",
    438       "detail": "IBM Research authors survey IBM employees, with watsonx Code Assistant included in the tool list. No competing interests statement is provided despite direct financial interest in positive findings."
    439     },
    440     {
    441       "flag": "Perceived productivity conflated with measured productivity",
    442       "detail": "The paper cites Becker et al. [5] on discrepancies between perceived and measured productivity for experienced developers, yet relies entirely on self-reported perceived gains without any objective measurement or acknowledgment of this limitation for its own data."
    443     },
    444     {
    445       "flag": "No statistical testing on cross-group comparisons",
    446       "detail": "All division-level and experience-level differences are presented as descriptive bar charts without significance tests, confidence intervals, or effect sizes. Per-division subgroups are very small (5% ≈ 3 people)."
    447     },
    448     {
    449       "flag": "No limitations section",
    450       "detail": "The paper has no dedicated limitations or threats-to-validity section despite clear threats: single-company sampling, voluntary response bias, social desirability, unknown response rate, and LLM extraction quality."
    451     },
    452     {
    453       "flag": "LLM-based systematic review without quality validation",
    454       "detail": "Two frontier LLMs are used to extract structured data from 35 papers with only manual verification afterward. No inter-rater agreement between LLM extractions or between LLM and human is reported."
    455     },
    456     {
    457       "flag": "Response rate not reported",
    458       "detail": "The paper does not disclose how many people were invited, the overall response rate, or how many declined, making it impossible to assess non-response bias."
    459     }
    460   ],
    461   "cited_papers": [
    462     {
    463       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    464       "relevance": "410-developer survey across 57 countries on AI assistant usability; central reference for the meta-analysis and gap identification"
    465     },
    466     {
    467       "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace",
    468       "relevance": "RCT of GitHub Copilot with 106 developers; shows subjective productivity beliefs increased but objective telemetry did not"
    469     },
    470     {
    471       "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers",
    472       "relevance": "Large-scale enterprise field experiments; key benchmark for productivity effect size claims"
    473     },
    474     {
    475       "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
    476       "relevance": "Google enterprise RCT (96 engineers) showing 21% speed improvement; direct comparator for this survey's perceived gains"
    477     },
    478     {
    479       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    480       "relevance": "Becker et al. finding discrepancy between perceived and measured productivity — cited in related work but not applied to authors' own data"
    481     },
    482     {
    483       "title": "Experience with GitHub Copilot for Developer Productivity at Zoominfo",
    484       "relevance": "Industry case study with objective acceptance rate and productivity data; enterprise comparator"
    485     },
    486     {
    487       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    488       "relevance": "481-developer survey; one of the larger prior perception studies in the meta-analysis"
    489     },
    490     {
    491       "title": "Do Users Write More Insecure Code with AI Assistants?",
    492       "relevance": "Security study showing AI-assisted users wrote less secure code; context for enterprise security concerns"
    493     },
    494     {
    495       "title": "Examining the Use and Impact of an AI Code Assistant on Developer Productivity and Experience in the Enterprise",
    496       "relevance": "IBM watsonx Code Assistant study by overlapping author group; important prior work context"
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 3,
    502       "justification": "Directly addresses enterprise tool adoption: cross-divisional usage patterns, feature requirements, and what practitioners actually want — highly actionable for engineering leaders."
    503     },
    504     "surprise_contrarian": {
    505       "score": 1,
    506       "justification": "Mostly confirms existing consensus on perceived productivity gains; division-level variation is mildly novel but not a surprising or contrarian finding."
    507     },
    508     "fear_safety": {
    509       "score": 1,
    510       "justification": "Mentions security concerns and over-reliance risks inherited from prior literature but adds no new safety findings from the primary study."
    511     },
    512     "drama_conflict": {
    513       "score": 1,
    514       "justification": "IBM studying its own enterprise tool creates a latent conflict-of-interest angle, but the paper does not acknowledge or frame it in a way that would generate controversy."
    515     },
    516     "demo_ability": {
    517       "score": 1,
    518       "justification": "Survey findings with no interactive component or tool to try."
    519     },
    520     "brand_recognition": {
    521       "score": 2,
    522       "justification": "IBM Research is a well-known institution; the survey covers major recognized tools (Copilot, ChatGPT, Cursor, Windsurf) familiar to practitioners."
    523     }
    524   },
    525   "hn_data": {
    526     "threads": [],
    527     "top_points": 0,
    528     "total_points": 0,
    529     "total_comments": 0
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs