scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23827B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "An Empirical Study of Generative AI Adoption in Software Engineering",
      6     "authors": [
      7       "G. Giray",
      8       "Onur Demirörs",
      9       "Marcos Kalinowski",
     10       "Daniel Méndez"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2512.23327",
     15     "doi": "10.48550/arXiv.2512.23327"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All major abstract claims (80% adoption, cycle time reduction, quality improvement, hallucination challenges, institutionalization gaps) are directly supported by survey results in the paper.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper makes no causal claims; all findings are explicitly framed as self-reported perceptions (e.g., 'practitioners report,' 'perceived productivity change').",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 6 explicitly states 'we avoided further generalizability claims throughout the paper' and recommends replications; results are attributed to the sample throughout.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss social desirability bias, self-selection bias among GenAI adopters, or alternative explanations for the strongly positive productivity perceptions reported by 95% of users.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly flags that 'perceived improvements become even more questionable' given limited objective measurement, and dedicates RQ2.4 to showing that 58% of practitioners use no objective metrics.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 'Threats to Validity' is a dedicated section covering face/content, criterion, construct, and reliability validity.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific mitigations are named (pilot study with 5 SE professionals, social scientist validation, bootstrapping with S=1000, purposive sampling capped at ~20 responses per country, IRB approval).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states it represents software professionals in its non-probability sample and that 'replications should be conducted to further strengthen the statistical generalizability.'",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source or acknowledgment of financial support appears anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list full institutional affiliations (TU/e, Izmir Institute of Technology, PUC-Rio, Blekinge Institute of Technology, fortiss) on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding was disclosed, making this criterion not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no competing interests or financial disclosure statement; only AI use in writing is declared.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 2.1 explicitly defines AI, GenAI, SE for GenAI, and GenAI for SE, with the study scope limited to the latter.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The objective is explicitly stated: 'provide an overview of the status of GenAI adoption in SE' across four structured research questions (RQ1–RQ4).",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 1 systematically compares this study to 17 prior questionnaire-based studies across 16 coverage dimensions, explicitly identifying gaps the current study fills.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The Data Availability statement says scripts are 'available in our online open science repository [to be published on Zenodo]' — not yet released.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Raw survey data is also promised '[to be published on Zenodo]' — not currently available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": false,
    134           "answer": false,
    135           "justification": "This is a questionnaire survey; no software environment or dependencies are required to replicate the study design.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The questionnaire structure is described but no step-by-step instructions for replicating the full data collection and analysis pipeline are provided in the paper.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Bootstrapped 95% CIs are reported consistently for all percentage findings, e.g., 'P = 79.44% [79.27, 79.61]'.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": false,
    154           "answer": false,
    155           "justification": "The paper reports descriptive statistics and does not make formal comparative claims between subgroups that would require significance testing.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": false,
    160           "answer": false,
    161           "justification": "No formal effect size statistics (Cohen's d, odds ratios, etc.) are reported; the study is purely descriptive.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No power analysis or formal sample size justification is provided; the authors simply explain that 204 usable responses were obtained.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Bootstrapped confidence intervals serve as a spread measure and are reported for all main results throughout the paper.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": false,
    180           "answer": false,
    181           "justification": "This is a descriptive survey study; there is no experimental system or treatment being compared against a baseline.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": false,
    186           "answer": false,
    187           "justification": "Not applicable; no experimental baselines are involved.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Not applicable; no system components are being evaluated.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The study uses Likert scales, free-text coding, binary questions, and continuous experience measurements across multiple constructs.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "The study is itself a human survey; the criterion for human evaluation of system outputs does not apply.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "Not a prediction task; no held-out set is relevant.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by SE activity (Figure 9), tool (Figure 10), challenge type (Figure 16), organization size, role, and country.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Non-adoption reasons (Figure 8) and challenges (Figure 16, covering 16 categories) are extensively discussed with quantification.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that 20% don't use GenAI tools, that 58% use no objective productivity metrics, and that validated code quality studies contradict practitioners' positive perceptions.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": false,
    236           "answer": false,
    237           "justification": "The study surveys practitioners about tools they use; the researchers themselves do not employ LLMs as part of the study methodology.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "No LLM prompting is part of the research methodology; survey questions are provided in Table 2.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "No model inference is performed by the researchers; not applicable.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used in this survey study.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.5 documents quality checks: removal of 10 non-consent responses, 9 responses lacking valid SE activity, completeness verification, and qualitative coding procedures.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw data is promised '[to be published on Zenodo]' but is not currently available for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.4 describes three sampling techniques (convenience, purposive, snowball), distribution channels (LinkedIn, email), data period (May–Nov 2025), and country-level response caps.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Convenience sampling via professional network, purposive sampling with max two participants per organization, and snowball sampling are all described with rationale.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from collection to analysis is documented: quality filtering (Section 3.5), bootstrapping (S=1000), grounded theory coding with two independent reviewers, and ISO/IEC 12207 taxonomy mapping.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This is a practitioner survey; no model capabilities are being benchmarked, making training cutoff irrelevant.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not applicable; no benchmark evaluation is performed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "Not applicable; no model benchmarking is conducted.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "No pre-registration is mentioned anywhere in the paper.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": true,
    320           "answer": true,
    321           "justification": "Section 3.2 states: 'The Research Ethics Committee at Izmir Institute of Technology approved the questionnaire.'",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": true,
    326           "answer": true,
    327           "justification": "Extensive demographics reported: country (37 countries), education field and degree, years of experience, role, sector, organization size, team size, and project management approach.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": true,
    332           "answer": true,
    333         "justification": "Screening question C1 (consent) and the exclusion of responses failing to provide a valid SE activity (Q11=Yes without Q13 answer) are explicitly documented.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "This is an observational survey with no experimental randomization; not applicable.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "Blinding is not feasible or relevant in a self-report questionnaire study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": true,
    350           "answer": true,
    351           "justification": "223 responses received; 10 removed for non-consent, 9 for not providing valid SE activity, leaving 204 for analysis — fully documented in Section 3.5.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No AI inference is performed by the researchers as part of the methodology; cost is irrelevant.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": false,
    364           "answer": false,
    365           "justification": "A questionnaire study has no meaningful computational budget to report.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Approximately 80% of SE practitioners use GenAI tools for SE activities.",
    374       "evidence": "162 of 204 respondents (P=79.44% [79.27, 79.61]) reported using GenAI tools.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Approximately 95% of GenAI users report a productivity increase.",
    379       "evidence": "Q17 responses: 43% report 50% time reduction, 27% report 75% reduction, 26% report moderate increase; only 4.6% neutral or negative.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "82% of respondents perceive quality improvement in their work when using GenAI tools.",
    384       "evidence": "46.7% 'strongly agree' and 35.2% 'somewhat agree' that GenAI enables better quality (N=161).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Incorrect or unreliable outputs (hallucinations) are the dominant challenge, affecting 47.7% of users.",
    389       "evidence": "Figure 16 shows 'Inaccurate output/Hallucination' at P=47.70% [47.42, 47.98], far ahead of any other challenge.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "58% of SE practitioners do not use any objective metric to measure productivity or quality.",
    394       "evidence": "RQ2.4 results: 58.15% [57.85, 58.45] explicitly state no objective metric is used (N=115).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "79% of practitioners expect GenAI to redefine rather than replace their roles within five years.",
    399       "evidence": "Q24 responses: 79% agree that GenAI will redefine their role; only 11% disagree (N=198).",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "ChatGPT dominates the GenAI tool landscape with 62% usage among SE professionals.",
    404       "evidence": "Figure 10: ChatGPT at P=62.38% [62.14, 62.61], followed by Copilot (19.85%) and Gemini (19.08%).",
    405       "supported": "strong"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "observational",
    410     "qualitative"
    411   ],
    412   "key_findings": "A questionnaire survey of 204 SE practitioners across 37 countries found that approximately 80% use GenAI tools, primarily for implementation tasks, with ChatGPT dominating. Practitioners widely perceive productivity and quality benefits, with 95% reporting time savings, but 58% use no objective metrics to verify these gains. The dominant challenge is incorrect/hallucinated outputs (48%), followed by prompt engineering difficulty and validation overhead. Institutionalization is uneven — most organizations provide tool access but fewer invest in training or governance — and practitioners largely expect role redefinition over replacement, with moderate concern about job market contraction.",
    413   "red_flags": [
    414     {
    415       "flag": "Self-report bias",
    416       "detail": "All productivity and quality findings are based on self-perception, not objective measurement. The paper acknowledges this gap but still leads with these as key results without adequate caveats in the abstract."
    417     },
    418     {
    419       "flag": "Non-probability sampling with network bias",
    420       "detail": "Convenience + snowball sampling via authors' personal and LinkedIn networks biases toward respondents with ties to the authors' countries (USA 24, Brazil 21, Turkey 19). Despite purposive sampling controls, the sample is not representative."
    421     },
    422     {
    423       "flag": "Data not yet released",
    424       "detail": "The Data Availability statement marks the data and scripts as '[to be published on Zenodo]' — at time of submission, no independent verification of results is possible."
    425     },
    426     {
    427       "flag": "AI-assisted writing undisclosed in methods",
    428       "detail": "The authors declare that Gemini, ChatGPT, and NotebookLM were used for summarization, rephrasing, and producing analysis scripts, but the extent of AI involvement in framing findings is unclear."
    429     },
    430     {
    431       "flag": "No pre-registration",
    432       "detail": "Research questions and hypotheses were not pre-registered, raising potential for selective reporting of the most favorable findings from an internationally distributed survey."
    433     },
    434     {
    435       "flag": "Social desirability in productivity estimates",
    436       "detail": "The reported rates (95% of users reporting productivity increases and 82% reporting quality improvement) are implausibly high and consistent with social desirability bias in self-report surveys, which is not discussed as a threat."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges",
    442       "relevance": "Direct comparator study surveying 410 developers across 57 countries on programming assistant usability and non-use reasons"
    443     },
    444     {
    445       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    446       "relevance": "Survey of 481 developers in 71 countries on coding assistant use, directly compared in Table 1"
    447     },
    448     {
    449       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    450       "relevance": "Key productivity evidence paper (55.8% task completion speedup) that this survey's perceived productivity claims are compared against"
    451     },
    452     {
    453       "title": "Navigating the complexity of generative AI adoption in software engineering",
    454       "relevance": "Prior survey of 100 SE professionals across multiple countries, direct predecessor in this literature"
    455     },
    456     {
    457       "title": "Toward Effective AI Support for Developers: A survey of desires and concerns",
    458       "relevance": "Survey of 737 Microsoft developers — large industry study used as comparison point for adoption and concern patterns"
    459     },
    460     {
    461       "title": "Productivity assessment of neural code completion",
    462       "relevance": "GitHub Copilot survey of 2,047 developers — largest comparator, explicitly cited for usage pattern comparisons"
    463     },
    464     {
    465       "title": "Sampling in software engineering research: A critical review and guidelines",
    466       "relevance": "Methodological foundation paper for the sampling design choices justified in Section 3.4"
    467     },
    468     {
    469       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    470       "relevance": "Counterpoint study finding AI can slow experienced developers, cited as nuancing the positive productivity narrative"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 3,
    476       "justification": "Directly actionable for organizations and practitioners deciding how to govern, train for, and adopt GenAI tools."
    477     },
    478     "surprise_contrarian": {
    479       "score": 1,
    480       "justification": "Mostly confirms expected adoption patterns; the contrast between high perceived productivity and lack of objective measurement is a mild insight, not shocking."
    481     },
    482     "fear_safety": {
    483       "score": 1,
    484       "justification": "Mentions job market contraction concerns (54%) and security/hallucination risks, but frames them moderately without alarming conclusions."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "The 'replace vs. redefine' framing and job market concern angle provides a mild debate hook but no strong controversy."
    489     },
    490     "demo_ability": {
    491       "score": 0,
    492       "justification": "Survey paper with no artifact, tool, or demo that a reader can try."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Authors are from established academic institutions, with no industry lab involved; prominent tools (ChatGPT, GitHub Copilot) are mentioned throughout, lending familiarity."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [],
    501     "top_points": 0,
    502     "total_points": 0,
    503     "total_comments": 0
    504   }
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs