scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23175B)
      1 {
      2   "paper": {
      3     "title": "An Empirical Study of Generative AI Adoption in Software Engineering",
      4     "authors": ["Görkem Giray", "Onur Demirörs", "Marcos Kalinowski", "Daniel Mendez"],
      5     "year": 2025,
      6     "venue": "arXiv (submitted to ACM)",
      7     "arxiv_id": "2512.23327",
      8     "doi": "10.48550/arXiv.2512.23327"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The paper states 'The questionnaire, the collected data, and the quantitative and qualitative data analysis artifacts, including Python scripts... are available in our online open science repository [to be published on Zenodo].' This is a promise of future release, not an actual release — no URL is provided."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The availability statement cited for code_released also covers the collected data: it reads '[to be published on Zenodo]', which is a future promise, not an actual release."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency details are provided for the analysis scripts."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are included. The paper describes methods but does not provide a guide to replicate the analysis."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper consistently reports bootstrap confidence intervals for percentages, e.g., 'P = 79.44% [79.27, 79.61]' (Section 4.2), using S=1,000 bootstrap resamples as described in Section 3.5."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper reports descriptive statistics and confidence intervals but does not perform significance tests for any comparative claims (e.g., differences between groups or experience levels)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Results are presented as percentages with confidence intervals but without effect size measures for comparisons."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No power analysis or formal justification for the sample size of 204 is provided. The paper does not discuss whether 204 is adequate for the analyses conducted."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "For experience, the paper reports the mean, median, and IQR (Q1=5.0, Q3=17.0) in Section 4.1. Bootstrap confidence intervals provide spread measures for survey percentages."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper systematically compares its findings against prior surveys (Table 1) and industry reports (StackOverflow, Capgemini, DORA, MIT) throughout Section 5 Discussion."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Comparisons are made against contemporary sources: StackOverflow 2025, Google DORA 2025, MIT 2025, Capgemini 2024 — all recent and relevant."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "Survey study with no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "This is a descriptive survey, not an evaluation of a system. Percentages and confidence intervals describe the sample; there are no competing evaluation metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Not applicable — the study is a survey collecting self-reported data, not evaluating a system's outputs."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Survey study — no train/test split applicable."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by SE activity (Figure 9), tool (Figure 10), usage frequency (Figure 11), benefits (Figure 12), challenges (Figure 16), country (Figure 1), role (Figure 4), organization size (Figure 6), and more."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Challenges and reasons for non-adoption are extensively discussed (Sections 4.2.1 and 4.3.5), including specific failure modes like hallucinations, validation overhead, and context understanding failures."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that ~20% do not use GenAI, 58% don't use objective metrics, ~9% disagree that quality improves, and 1% report negative productivity impact. Challenges are reported in detail."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about 80% adoption, benefits (cycle time, quality, productivity), challenges (incorrect outputs, prompt engineering, validation overhead), and institutionalization patterns are all supported by the results in Sections 4.2-4.5."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper uses causal language in places, e.g., 'GenAI tools enable me to achieve better quality' (Q18), 'reduction in cycle time' as a 'benefit' of GenAI. The cross-sectional survey design cannot support causal inference, and this is not explicitly acknowledged in the framing of benefits."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 6 (Threats to Validity) explicitly avoids generalizability claims: 'we avoided further generalizability claims throughout the paper due to the aforementioned limitations. Replications should be conducted.' The paper frames results as 'from our sample.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The Discussion (Section 5) considers alternative interpretations, e.g., that perceived productivity gains may be unreliable due to lack of objective metrics, that non-adopters may face skill gaps rather than tool trust issues, and cites Becker et al. (2025) showing AI can slow experienced developers."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly distinguishes perceived from objective productivity/quality: 'Despite the high perceived productivity and quality improvements, objective measurement of GenAI's impact seems very limited' (Section 5) and 'these reported perceived improvements become even more questionable' given limited metric usage (Section 4.3.4)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "Survey study — no AI models are evaluated."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "Survey study — no prompting is used."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Survey study — no model hyperparameters applicable."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used in this survey study."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.5 describes the data cleaning pipeline: 10 responses removed for declining consent, 9 removed for not providing a valid SE activity, completeness check on remaining 204 responses. Qualitative coding procedures are described in detail including selective and open coding approaches."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 'Threats to Validity' provides a dedicated and substantive discussion of face/content validity, criterion validity, construct validity, and reliability threats."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 discusses specific threats: convenience/snowball sampling bias, the risk of misunderstood questions, geographic over-representation (limited through purposive sampling — data collection in Brazil/Türkiye was stopped at ~20), and the lack of random sampling, which the authors address via bootstrapping."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states it focuses on 'GenAI for SE' (not SE for GenAI), that units of analysis are 'software professionals... not organizations or projects' (Section 3.4), and avoids generalizability claims due to non-probability sampling."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Data is promised for Zenodo '[to be published]' but not actually available. No working URL or archive is provided."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3.4 describes the data collection in detail: online questionnaire on surveyjs.io, May-November 2025, using convenience + purposive + snowball sampling via emails and LinkedIn."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 3.4 describes three sampling strategies: convenience (professional network, LinkedIn), purposive (targeting different countries, max 2 per organization), and snowball sampling through contacts."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.5 documents the pipeline: 223 responses received → 10 removed (consent declined) → 9 removed (no valid SE activity) → 204 used. Qualitative coding process is described with initial cycles, theme extraction, and validation by third/fourth authors."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or funding sources."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Eindhoven University of Technology, Izmir Institute of Technology, PUC-Rio, Blekinge Institute of Technology, and fortiss."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "Survey study — no pre-trained model is evaluated on any benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Survey study — no pre-trained model is evaluated on any benchmark."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Survey study — no pre-trained model is evaluated on any benchmark."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section 3.2 states: 'The Research Ethics Committee at Izmir Institute of Technology approved the questionnaire.'"
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Section 4.1 provides extensive demographics: country (37 countries, Figure 1), education field and degree (Figure 2), experience distribution (Figure 3), roles (Figure 4), sector (Figure 5), organization and team size (Figure 6), and project management approach (Figure 7)."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Section 3.4 states the target population is 'professionals performing SE-related activities.' Section 3.5 describes exclusion: 10 removed for declining consent, 9 removed for not providing a valid SE activity. The screening question C1 gates entry."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "Cross-sectional survey — no experimental conditions or randomization applicable."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "Cross-sectional survey — blinding is not applicable."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Section 3.5 reports: 223 total responses, 10 removed for declining consent, 9 removed for quality (no valid SE activity), leaving 204 for analysis. Per-question N values vary and are reported (e.g., N=162 for users, N=38 for non-users)."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Survey study — no computational method with inference costs."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey study — no significant computation required."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Approximately 80% of SE practitioners use GenAI tools in their SE activities.",
    294       "evidence": "P = 79.44% [79.27, 79.61] from 204 respondents (Section 4.2). Aligns with StackOverflow 2025 (84% using or planning to use).",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Implementation is the primary use case for GenAI in SE (71%), followed by verification & validation (24%).",
    299       "evidence": "Figure 9, Section 4.2.2. Based on 162 GenAI-using respondents reporting ~2 tasks each, coded to ISO/IEC/IEEE 12207 processes.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Approximately 95% of GenAI-using practitioners report a productivity increase.",
    304       "evidence": "Section 4.3.2, Figure 13: 43% report 50% time reduction, 27% report 75% reduction, 26% report moderate increase. Only 3.5% report no gain and 1.2% negative impact.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "82% of respondents agree that GenAI tools enhance their work quality.",
    309       "evidence": "Section 4.3.3, Figure 14: 46.72% strongly agree, 35.21% somewhat agree. Based on 5-point Likert scale (N=161).",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Objective measurement of productivity and quality remains very limited — 58% of respondents use no objective metric.",
    314       "evidence": "Section 4.3.4, Figure 15: 58.15% [57.85, 58.45] explicitly stated no objective metrics used. Only story points (19%) and velocity (12%) reported as common metrics.",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "Incorrect outputs/hallucinations are the top challenge (48%), followed by prompt engineering difficulties (31%).",
    319       "evidence": "Section 4.3.5, Figure 16: inaccurate output at 47.70%, interaction/prompting at 31.48%, validation overhead at 25.89%. Based on N=130 coded responses.",
    320       "supported": "moderate"
    321     },
    322     {
    323       "claim": "79% of practitioners expect GenAI to redefine rather than replace their roles; 62% disagree that GenAI will replace them.",
    324       "evidence": "Section 4.5, Figure 18: Based on Likert-scale responses (N=198 for each of the two items).",
    325       "supported": "moderate"
    326     }
    327   ],
    328   "methodology_tags": ["observational", "qualitative"],
    329   "key_findings": "Survey of 204 SE practitioners from 37 countries finds ~80% adoption of GenAI tools, primarily for implementation (71%) and verification/validation (24%). While 95% report perceived productivity gains and 82% report quality improvements, 58% use no objective metrics to measure these claims. Top challenges are hallucinations (48%), prompt engineering (31%), and output validation overhead (26%). Most practitioners (79%) expect role redefinition rather than replacement.",
    330   "red_flags": [
    331     {
    332       "flag": "Self-reported benefits without objective measurement",
    333       "detail": "95% claim productivity gains and 82% claim quality improvements, but 58% of respondents use no objective metrics. The paper acknowledges this tension but the headline findings (especially in the abstract) still lead with perceived benefits."
    334     },
    335     {
    336       "flag": "Non-probability sampling with narrow CIs",
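    337       "detail": "The bootstrap confidence intervals are extremely narrow (e.g., [79.27, 79.61] for the adoption rate). For a proportion near 80% with N=204, sampling variability alone implies a 95% interval of roughly ±5.5 percentage points; the reported half-width of ~0.17 points is consistent with an interval on the mean of the S=1,000 bootstrap estimates (±5.5/√1000) rather than on the estimate itself. In addition, bootstrapping quantifies sampling variability of the estimator, not coverage of the target population. With convenience + snowball sampling, the real uncertainty is much larger than these CIs suggest. The paper acknowledges sampling limitations but the narrow CIs may give a false sense of precision."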
    338     },
    339     {
    340       "flag": "Potential selection bias toward GenAI enthusiasts",
    341       "detail": "Survey distributed through professional networks, LinkedIn, and snowball sampling to SE practitioners. People who respond to a survey about GenAI adoption are likely more engaged with GenAI than non-respondents, potentially inflating the 80% adoption figure."
    342     },
    343     {
    344       "flag": "Data and code not yet released",
    345       "detail": "The paper promises open science data on Zenodo but the repository is '[to be published]'. Claims of transparency cannot be verified without the actual data release."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Copiloting the Future: How Generative AI Transforms Software Engineering",
    351       "authors": ["L. Banh", "F. Holldack", "G. Strobel"],
    352       "year": 2025,
    353       "relevance": "Survey of GenAI impact on SE across 17 European companies, covering use cases, benefits, and challenges."
    354     },
    355     {
    356       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    357       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    358       "year": 2025,
    359       "arxiv_id": "2507.09089",
    360       "relevance": "Empirical study finding AI can slow experienced developers, contradicting perceived productivity gains — key contrarian evidence."
    361     },
    362     {
    363       "title": "A large-scale survey on the usability of ai programming assistants: Successes and challenges",
    364       "authors": ["J. T. Liang", "C. Yang", "B. A. Myers"],
    365       "year": 2024,
    366       "relevance": "Survey of 410+ developers on programming assistant usability, covering reasons for non-use and challenges."
    367     },
    368     {
    369       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    370       "authors": ["A. Sergeyuk", "Y. Golubev", "T. Bryksin", "I. Ahmed"],
    371       "year": 2025,
    372       "relevance": "Survey of 481 developers across 71 countries on AI coding assistant usage patterns and perceptions."
    373     },
    374     {
    375       "title": "Navigating the complexity of generative ai adoption in software engineering",
    376       "authors": ["D. Russo"],
    377       "year": 2024,
    378       "relevance": "Survey of 100 practitioners on GenAI adoption complexity in SE, covering all knowledge areas."
    379     },
    380     {
    381       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    382       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    383       "year": 2023,
    384       "arxiv_id": "2302.06590",
    385       "relevance": "Controlled experiment showing Copilot enables 55.8% faster task completion, a key productivity claim in the field."
    386     },
    387     {
    388       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    389       "authors": ["F. Song", "A. Agarwal", "W. Wen"],
    390       "year": 2024,
    391       "arxiv_id": "2410.02091",
    392       "relevance": "Evaluates Copilot's impact on OSS project-level productivity, finding benefits especially for core developers."
    393     },
    394     {
    395       "title": "Toward Effective AI Support for Developers: A survey of desires and concerns",
    396       "authors": ["M. Khemka", "B. Houck"],
    397       "year": 2024,
    398       "relevance": "Large-scale Microsoft survey (737 developers) on AI support desires and concerns, including trust and accuracy issues."
    399     },
    400     {
    401       "title": "Productivity assessment of neural code completion",
    402       "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li"],
    403       "year": 2022,
    404       "relevance": "GitHub Copilot productivity study surveying 2,000+ developers on perceived productivity impact."
    405     },
    406     {
    407       "title": "How secure is AI-generated code: a large-scale comparison of large language models",
    408       "authors": ["N. Tihanyi", "T. Bisztray", "M. A. Ferrag"],
    409       "year": 2025,
    410       "relevance": "Finds 62% of LLM-generated code is vulnerable, directly relevant to security challenges of GenAI adoption."
    411     },
    412     {
    413       "title": "Perceived Impact of AI-Based Tooling on Software Development Code Quality",
    414       "authors": ["B. Martinović", "R. Rozić"],
    415       "year": 2025,
    416       "relevance": "Survey finding AI tool users 67% likely to produce vulnerable code vs 27% without, relevant to quality impact claims."
    417     },
    418     {
    419       "title": "Naming the pain in machine learning-enabled systems engineering",
    420       "authors": ["M. Kalinowski", "D. Mendez", "G. Giray"],
    421       "year": 2025,
    422       "relevance": "Large-scale survey on ML-enabled systems engineering challenges, related methodology and authorship overlap."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs