ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (26209B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Empirical Study of Generative AI Adoption in Software Engineering",
      6     "authors": [
      7       "G. Giray",
      8       "Onur Demirörs",
      9       "Marcos Kalinowski",
     10       "Daniel Méndez"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2512.23327",
     15     "doi": "10.48550/arXiv.2512.23327"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about 80% adoption, benefits (cycle time, quality, productivity), challenges (incorrect outputs, prompt engineering, validation overhead), and institutionalization patterns are all supported by the results in Sections 4.2-4.5.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper uses causal language in places, e.g., 'GenAI tools enable me to achieve better quality' (Q18), 'reduction in cycle time' as a 'benefit' of GenAI. The cross-sectional survey design cannot support causal inference, and this is not explicitly acknowledged in the framing of benefits.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 6 (Threats to Validity) explicitly avoids generalizability claims: 'we avoided further generalizability claims throughout the paper due to the aforementioned limitations. Replications should be conducted.' The paper frames results as 'from our sample.'",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The Discussion (Section 5) considers alternative interpretations, e.g., that perceived productivity gains may be unreliable due to lack of objective metrics, that non-adopters may face skill gaps rather than tool trust issues, and cites Becker et al. (2025) showing AI can slow experienced developers.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes perceived from objective productivity/quality: 'Despite the high perceived productivity and quality improvements, objective measurement of GenAI's impact seems very limited' (Section 5) and 'these reported perceived improvements become even more questionable' given limited metric usage (Section 4.3.4).",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 'Threats to Validity' provides a dedicated and substantive discussion of face/content validity, criterion validity, construct validity, and reliability threats.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 6 discusses specific threats: convenience/snowball sampling bias, risk of misunderstood questions, purposive sampling to limit geographic over-representation (stopped collecting in Brazil/Türkiye at ~20), and the random sampling limitation addressed via bootstrapping.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states it focuses on 'GenAI for SE' (not SE for GenAI), that units of analysis are 'software professionals... not organizations or projects' (Section 3.4), and avoids generalizability claims due to non-probability sampling.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or funding sources.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed: Eindhoven University of Technology, Izmir Institute of Technology, PUC-Rio, Blekinge Institute of Technology, and fortiss.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding information is disclosed, so independence cannot be assessed.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 2.1 explicitly defines AI, GenAI, SE for GenAI, and GenAI for SE; SE activities are mapped to the ISO/IEC/IEEE 12207 taxonomy providing precise vocabulary.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper states its aim to 'provide an overview of the status of GenAI adoption in SE' via four explicit RQs, and Table 1 shows the 16 dimensions this study covers that prior work does not fully address.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 1 systematically compares this study against 17 prior surveys across 16 dimensions; Section 5 explicitly contrasts findings with StackOverflow Developer Survey, Capgemini, Google DORA, McKinsey, and Liang et al.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The paper states 'The questionnaire, the collected data, and the quantitative and qualitative data analysis artifacts, including Python scripts... are available in our online open science repository [to be published on Zenodo].' This is a promise of future release, not an actual release — no URL is provided.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The paper states that the collected data is available in an online open science repository '[to be published on Zenodo]', which is a future promise, not an actual release.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No environment specifications, requirements files, or dependency details are provided for the analysis scripts.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are included. The paper describes methods but does not provide a guide to replicate the analysis.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "The paper consistently reports bootstrap confidence intervals for percentages, e.g., 'P = 79.44% [79.27, 79.61]' (Section 4.2), using S=1,000 bootstrap resamples as described in Section 3.5.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper reports descriptive statistics and confidence intervals but does not perform significance tests for any comparative claims (e.g., differences between groups or experience levels).",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Results are presented as percentages with confidence intervals but without effect size measures for comparisons.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or formal justification for the sample size of 204 is provided. The paper does not discuss whether 204 is adequate for the analyses conducted.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "For experience, the paper reports mean, median, IQR (Q1=5.0, Q3=17.0) in Section 4.1. Bootstrap confidence intervals provide spread measures for survey percentages.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The paper systematically compares its findings against prior surveys (Table 1) and industry reports (StackOverflow, Capgemini, DORA, MIT) throughout Section 5 Discussion.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Comparisons are made against contemporary sources: StackOverflow 2025, Google DORA 2025, MIT 2025, Capgemini 2024 — all recent and relevant.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Survey study with no system components to ablate.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": false,
    198           "answer": false,
    199           "justification": "This is a descriptive survey, not an evaluation of a system. Percentages and confidence intervals describe the sample; there are no competing evaluation metrics.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Not applicable — the study is a survey collecting self-reported data, not evaluating a system's outputs.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Survey study — no train/test split applicable.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by SE activity (Figure 9), tool (Figure 10), usage frequency (Figure 11), benefits (Figure 12), challenges (Figure 16), country (Figure 1), role (Figure 4), organization size (Figure 6), and more.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Challenges and reasons for non-adoption are extensively discussed (Sections 4.2.1 and 4.3.5), including specific failure modes like hallucinations, validation overhead, and context understanding failures.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that ~20% do not use GenAI, 58% don't use objective metrics, ~9% disagree that quality improves, and 1% report negative productivity impact. Challenges are reported in detail.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": false,
    236           "answer": false,
    237           "justification": "Survey study — no AI models are evaluated.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "Survey study — no prompting is used.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "Survey study — no model hyperparameters applicable.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used in this survey study.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.5 describes the data cleaning pipeline: 10 responses removed for declining consent, 9 removed for not providing a valid SE activity, completeness check on remaining 204 responses. Qualitative coding procedures are described in detail including selective and open coding approaches.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Data is promised for Zenodo '[to be published]' but not actually available. No working URL or archive is provided.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.4 describes the data collection in detail: online questionnaire on surveyjs.io, May-November 2025, using convenience + purposive + snowball sampling via emails and LinkedIn.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 3.4 describes three sampling strategies: convenience (professional network, LinkedIn), purposive (targeting different countries, max 2 per organization), and snowball sampling through contacts.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Section 3.5 documents the pipeline: 223 responses received → 10 removed (consent declined) → 9 removed (no valid SE activity) → 204 used. Qualitative coding process is described with initial cycles, theme extraction, and validation by third/fourth authors.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "Survey study — no pre-trained model is evaluated on any benchmark.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Survey study — no pre-trained model is evaluated on any benchmark.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Survey study — no pre-trained model is evaluated on any benchmark.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "Section 3.2 states: 'The Research Ethics Committee at Izmir Institute of Technology approved the questionnaire.'",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Section 4.1 provides extensive demographics: country (37 countries, Figure 1), education field and degree (Figure 2), experience distribution (Figure 3), roles (Figure 4), sector (Figure 5), organization and team size (Figure 6), and project management approach (Figure 7).",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333           "justification": "Section 3.4 states the target population is 'professionals performing SE-related activities.' Section 3.5 describes exclusion: 10 removed for declining consent, 9 removed for not providing a valid SE activity. The screening question C1 gates entry.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "Cross-sectional survey — no experimental conditions or randomization applicable.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "Cross-sectional survey — blinding is not applicable.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": true,
    351           "justification": "Section 3.5 reports: 223 total responses, 10 removed for declining consent, 9 removed for quality (no valid SE activity), leaving 204 for analysis. Per-question N values vary and are reported (e.g., N=162 for users, N=38 for non-users).",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "Survey paper — no computational method with inference costs.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "Survey paper — no significant computation required.",
    366           "source": "opus"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Approximately 80% of SE practitioners use GenAI tools for their SE activities",
    374       "evidence": "162 of 204 respondents reported GenAI use; bootstrapped P=79.44% [79.27, 79.61] from convenience/snowball sample in 37 countries",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Implementation is the primary GenAI use case (71%), far exceeding V&V (24%) and personal assistance (23%)",
    379       "evidence": "Frequency analysis of free-text Q13 responses coded to ISO 12207 taxonomy; N=162 with bootstrapped CIs in Figure 9",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "~95% of GenAI users report productivity increases, with 43% estimating an 8-hour task now takes 4 hours",
    384       "evidence": "Self-reported Likert scale Q17 with N=162; no objective measurement—58% of respondents explicitly use no objective metrics to verify this",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Incorrect output/hallucination is the dominant challenge (47.7%), followed by prompt engineering difficulties (31.5%) and validation overhead (25.9%)",
    389       "evidence": "Free-text Q20 (N=130) coded via grounded theory and mapped to ISO/IEC 25059 quality characteristics; reviewed by all four authors",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "58% of practitioners use no objective metric to measure size, productivity, or quality",
    394       "evidence": "Direct survey question Q19 with N=115 valid responses; explicitly acknowledged as undermining self-reported benefit claims",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "79% of practitioners expect GenAI to redefine rather than replace their roles within five years",
    399       "evidence": "Likert Q24 with N=198; bootstrapped CIs reported; aligns with Capgemini (59% no displacement concern) and StackOverflow (64%) surveys",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "54% of practitioners anticipate job market contraction despite personal job security confidence",
    404       "evidence": "Likert Q25 with N=194; creates tension with Q24 finding about individual role preservation",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "observational",
    410     "qualitative"
    411   ],
    412   "key_findings": "A questionnaire-based survey of 204 SE practitioners from 37 countries found that ~80% use GenAI tools, primarily for implementation (71%), with ChatGPT dominating (62%). Despite ~95% reporting perceived productivity increases and 82% perceiving quality gains, 58% use no objective metrics to verify these claims—a critical gap the authors explicitly flag. The top challenge is incorrect or hallucinated output (48%), followed by prompt engineering difficulties (32%) and output validation overhead (26%). Most practitioners (79%) expect GenAI to redefine rather than replace their roles, though 54% anticipate broader job market contraction from efficiency gains, revealing a divergence between individual and market-level outlooks.",
    413   "red_flags": [
    414     {
    415       "flag": "Self-report productivity inflation",
    416       "detail": "All productivity and quality improvement claims are self-reported with no objective verification; 58% of respondents explicitly admit using no metrics, yet the paper presents '95% report productivity increases' as a headline result without adequate qualification."
    417     },
    418     {
    419       "flag": "Non-probability sampling biases toward adopters",
    420       "detail": "Convenience and snowball sampling via authors' professional LinkedIn/email networks likely overrepresents GenAI enthusiasts and early adopters; practitioners who avoid GenAI tools or are skeptical are systematically underrepresented, inflating adoption rates."
    421     },
    422     {
    423       "flag": "Misleadingly narrow bootstrapped CIs",
    424       "detail": "Bootstrapped CIs are extremely narrow (e.g., [79.27, 79.61]) reflecting within-sample precision, but bootstrapping cannot correct for non-representative sampling; the false precision may be misread as population-level accuracy."
    425     },
    426     {
    427       "flag": "Data and code not yet available",
    428       "detail": "Data availability statement says 'to be published on Zenodo'—the questionnaire, raw data, and Python analysis scripts are not verifiable at submission time, making independent replication impossible."
    429     },
    430     {
    431       "flag": "No pre-registration",
    432       "detail": "A survey with 31 questions and multiple open-text coding schemes was not pre-registered, allowing post-hoc selection of which patterns and categories to highlight without a verifiable analysis plan."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges",
    438       "relevance": "Direct comparator surveying 410 developers on programming assistant use; finding on non-use due to functional failures (54%) contrasts with this study's skill/time barrier finding"
    439     },
    440     {
    441       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    442       "relevance": "Closest comparator; 481 developers in 71 countries on programming assistant usage—Table 1 uses it as a benchmark for coverage comparison"
    443     },
    444     {
    445       "title": "Productivity assessment of neural code completion",
    446       "relevance": "Seminal survey of 2,047 Copilot users on perceived productivity; used as baseline comparison for scale and methodology"
    447     },
    448     {
    449       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    450       "relevance": "RCT showing 55.8% faster task completion with Copilot; referenced as a more rigorous counterpart to self-reported productivity estimates"
    451     },
    452     {
    453       "title": "Navigating the complexity of generative AI adoption in software engineering",
    454       "relevance": "Prior survey on GenAI adoption in SE across 100 practitioners; Table 1 systematically compares coverage against this study"
    455     },
    456     {
    457       "title": "Toward Effective AI Support for Developers: A survey of desires and concerns",
    458       "relevance": "Large survey of 737 Microsoft developers on AI support desires; compared directly in non-adoption analysis (trust gap vs. skill gap finding)"
    459     },
    460     {
    461       "title": "Rethinking software engineering in the era of foundation models: A curated catalogue of challenges",
    462       "relevance": "Foundational framing paper on FMware engineering challenges; provides theoretical context for organizational governance findings"
    463     },
    464     {
    465       "title": "Sampling in software engineering research: A critical review and guidelines",
    466       "relevance": "Methodological reference used to justify sampling approach and bootstrapping decisions in Section 3.4"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 3,
    472       "justification": "Directly actionable for software teams, CTOs, and policy makers making GenAI adoption decisions—covers tools, use cases, governance mechanisms, and workforce impacts across 37 countries."
    473     },
    474     "surprise_contrarian": {
    475       "score": 1,
    476       "justification": "The 58% no-metrics finding is notable, and the skill-gap vs. trust-gap non-adoption dichotomy adds nuance, but most headline numbers confirm existing industry surveys."
    477     },
    478     "fear_safety": {
    479       "score": 2,
    480       "justification": "Raises concerns about overreliance and skill atrophy, 54% expecting job market contraction, and cloud-based security/privacy risks from proprietary code exposure."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "No major controversy; results broadly align with StackOverflow and Capgemini industry reports, with the tension between individual confidence and market contraction concern being the main narrative hook."
    485     },
    486     "demo_ability": {
    487       "score": 0,
    488       "justification": "Survey paper with no tool, system, or interactive component to demonstrate."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "Authors from established academic institutions (TU/e, Izmir Institute of Technology, PUC-Rio, BTH/fortiss) but not famous industry labs; no brand recognition beyond academic SE circles."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [],
    497     "top_points": 0,
    498     "total_points": 0,
    499     "total_comments": 0
    500   }
    501 }

Impressum · Datenschutz