scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22924B)
      1 {
      2   "paper": {
      3     "title": "From Gains to Strains: Modeling Developer Burnout with GenAI Adoption",
      4     "authors": ["Zixuan Feng", "Sadia Afroz", "Anita Sarma"],
      5     "year": 2025,
      6     "venue": "ICSE-SEIS 2026",
      7     "arxiv_id": "2510.07435",
      8     "doi": "10.1145/3786581.3786934"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["observational", "qualitative"],
     13   "key_findings": "GenAI adoption heightens developer burnout by increasing job demands (organizational pressure and workload), while job resources (autonomy, learning resources) and positive AI perceptions mitigate burnout (R²=0.398, N=442). Larger organizations impose more organizational pressure but provide more learning resources; senior developers have more autonomy and access to resources. Burnout itself showed no significant association with developer characteristics (role, seniority, org size), suggesting it is broadly experienced. Qualitative findings reveal three workflow shifts: euphoria-to-stress, apprenticeship-to-copy-paste, and hidden collaboration costs from AI-generated content review burden.",
     14   "claims": [
     15     {
     16       "claim": "Job demands related to GenAI adoption (organizational pressure and workload) are positively associated with developer burnout (β=0.398, p<.001)",
     17       "evidence": "PLS-SEM structural model evaluation in Section 5.2, Table 3, Figure 2. Path coefficient β=0.398, SD=0.044, 95% CI (0.313, 0.483), p=0.000. 5,000 bootstrap subsamples.",
     18       "supported": "strong"
     19     },
     20     {
     21       "claim": "Job resources (autonomy and learning resources) are negatively associated with burnout (β=−0.360, p<.001)",
     22       "evidence": "PLS-SEM structural model, Table 3. Path coefficient β=−0.360, SD=0.042, 95% CI (−0.445, −0.278), p=0.000.",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "Favorable AI perceptions are negatively associated with burnout (β=−0.246, p<.001)",
     27       "evidence": "PLS-SEM structural model, Table 3. Path coefficient β=−0.246, SD=0.048, 95% CI (−0.334, −0.148), p=0.000.",
     28       "supported": "strong"
     29     },
     30     {
     31       "claim": "Developer characteristics (role, org size, seniority) are not significantly associated with burnout",
     32       "evidence": "OLS regression in Table 4, Section 6.2. None of the three predictors reached significance for the Burnout dependent variable after BH correction.",
     33       "supported": "moderate"
     34     },
     35     {
     36       "claim": "Organizational pressure is higher for frequent coders (β=0.51, p<.05) and in larger organizations (β=0.41, p<.001)",
     37       "evidence": "OLS regression, Table 4, Section 6.2.",
     38       "supported": "strong"
     39     },
     40     {
     41       "claim": "Larger organizations provide more learning resources (β=0.28, p<.001) but less autonomy (β=−0.13, p<.001)",
     42       "evidence": "OLS regression, Table 4, Section 6.2.",
     43       "supported": "strong"
     44     }
     45   ],
     46   "checklist": {
     47     "artifacts": {
     48       "code_released": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No source code or analysis scripts are released. The paper references a supplementary document on Zenodo (ref [3]) but this contains the questionnaire and additional tables, not analysis code."
     52       },
     53       "data_released": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No survey response data is released. Only aggregated results are presented. The Zenodo supplement contains the questionnaire and supplementary tables, not raw data."
     57       },
     58       "environment_specified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No environment specifications provided. The paper mentions using SmartPLS 4 and JASP but does not provide versions or environment details for reproduction."
     62       },
     63       "reproduction_instructions": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No step-by-step reproduction instructions are provided. The methodology is described in detail but there are no scripts or commands to replicate the analysis."
     67       }
     68     },
     69     "statistical_methodology": {
     70       "confidence_intervals_or_error_bars": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "95% confidence intervals are reported for all path coefficients in Table 3 (e.g., H1: 95% CI (0.313, 0.483))."
     74       },
     75       "significance_tests": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Bootstrapping with 5,000 subsamples for PLS-SEM p-values (Table 3), and Benjamini-Hochberg corrected p-values for the OLS regressions (Table 4)."
     79       },
     80       "effect_sizes_reported": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Standardized path coefficients (β) are reported in Table 3 and regression coefficients in Table 4. R²=0.398 for burnout is reported with context that 0.30-0.40 represents moderate explanatory power. Power analysis assumed medium effect size f²=0.15."
     84       },
     85       "sample_size_justified": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Power analysis using G*Power is reported in Section 5.1 (Step 2): minimum required N=119 for 3 predictors, medium effect size f²=0.15, α=0.05, power=0.95. Actual N=442 far exceeds this."
     89       },
     90       "variance_reported": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard deviations are reported for path coefficients in Table 3 (e.g., SD=0.044 for H1). The bootstrapping approach with 5,000 subsamples provides variance estimates."
     94       }
     95     },
     96     "evaluation_design": {
     97       "baselines_included": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The study compares against the theoretical JD-R model framework and uses AI-perception as a control variable. The model is assessed against established thresholds for reliability, validity, and fit (SRMR, R², Q²)."
    101       },
    102       "baselines_contemporary": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "Not applicable — this is a survey-based empirical study testing a theoretical model, not a system evaluation with competing baselines."
    106       },
    107       "ablation_study": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "Not applicable — the object of study is a theoretical model (JD-R), not a multi-component technical system. The formative HOC/LOC structure serves a similar analytical purpose."
    111       },
    112       "multiple_metrics": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Multiple evaluation metrics used: R², Q² (predictive relevance), SRMR (model fit), AVE, Cronbach's α, composite reliability (ρa, ρc), HTMT, VIF, outer loadings, and outer weights."
    116       },
    117       "human_evaluation": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "Not applicable — this is a survey study about human experience, not a system producing outputs that need human evaluation."
    121       },
    122       "held_out_test_set": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "Not applicable — this is a survey-based study, not a predictive modeling task requiring train/test splits."
    126       },
    127       "per_category_breakdown": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Results broken down by construct (Organizational Pressure, Workload, Autonomy, Learning Resources, Burnout) and by developer characteristics (role, org size, seniority) in Table 4. Learning resource types broken down in Table 5."
    131       },
    132       "failure_cases_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Hypotheses H4a and H4c are explicitly reported as unsupported (no significant associations found). The paper discusses where the model's predictions were not confirmed."
    136       },
    137       "negative_results_reported": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "H4a (developer characteristics → burnout) and H4c (characteristics → workload) found no significant associations, reported transparently in Section 6.2."
    141       }
    142     },
    143     "claims_and_evidence": {
    144       "abstract_claims_supported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Abstract claims ('GenAI adoption heightens burnout by increasing job demands, while job resources and positive perceptions mitigate these effects') are directly supported by the PLS-SEM results in Table 3 with significant path coefficients."
    148       },
    149       "causal_claims_justified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper explicitly states in Section 7 that hypotheses 'propose associations between different constructs rather than causal relationships, as the present study is a cross-sectional sample study.' The hypotheses and results are framed with 'associated with' wording, although the abstract itself uses causal verbs ('heightens', 'mitigate')."
    153       },
    154       "generalization_bounded": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 7 acknowledges 'no single sample can capture the entire global software workforce' and describes the sample boundaries (442 practitioners from 56 OSS communities). The 90% male skew is noted. Cross-sectional limitation is stated."
    158       },
    159       "alternative_explanations_discussed": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 7 discusses specific confounds: 'participants who were already experiencing higher burnout (e.g., due to upcoming deadlines) might adopt AI differently,' and 'trust in AI systems or perceived productivity, may have influenced results but were not directly measured.'"
    163       },
    164       "proxy_outcome_distinction": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper is explicit that burnout is measured through 4 Likert-scale indicators capturing different facets (workload manageability, exhaustion, cynicism, reduced efficacy) as reflective indicators of the latent burnout construct. The measurement model is thoroughly validated. The paper frames its measurement at appropriate granularity."
    168       }
    169     },
    170     "setup_transparency": {
    171       "model_versions_specified": {
    172         "applies": false,
    173         "answer": false,
    174         "justification": "Not applicable — this paper does not use any AI/LLM models in its methodology. It studies GenAI adoption's effects on humans."
    175       },
    176       "prompts_provided": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "Not applicable — the paper does not use prompting. It is a survey study."
    180       },
    181       "hyperparameters_reported": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "PLS-SEM parameters reported: 5,000 bootstrap subsamples, significance level α=0.05, medium effect size f²=0.15. SmartPLS 4 used with blindfolding for Q². JASP used for EFA."
    185       },
    186       "scaffolding_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "Not applicable — no agentic scaffolding is used in this study."
    190       },
    191       "data_preprocessing_documented": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The paper describes receiving 688 responses, excluding invalid responses to reach N=442. EFA was used to verify factor structure. Composite scores were computed using outer weights from the measurement model. Coding procedures for qualitative data described with IRR (Jaccard index 90%, 92%)."
    195       }
    196     },
    197     "limitations_and_scope": {
    198       "limitations_section_present": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 7 is a dedicated 'Limitations' section with substantive discussion of cross-sectional design, method choice (PLS-SEM vs CB-SEM), sample representativeness, and confounding factors."
    202       },
    203       "threats_to_validity_specific": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 7 discusses specific threats: pre-existing burnout could bias responses, trust in AI was not directly measured as a confound, the choice of PLS-SEM over CB-SEM is justified for their model structure, and the cross-sectional design prevents causal inference."
    207       },
    208       "scope_boundaries_stated": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The paper states results should be 'interpreted as a theoretical starting point,' acknowledges cross-sectional design prevents causal claims, notes PLS-SEM was chosen because the model includes formative and reflective constructs, and bounds generalization to the sample characteristics."
    212       }
    213     },
    214     "data_integrity": {
    215       "raw_data_available": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Raw survey responses are not made available. Only aggregated statistics and model outputs are presented."
    219       },
    220       "data_collection_described": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Section 4 describes the survey instrument development (one month, three researchers), IRB approval, 5-point Likert scales, survey duration (5-8 minutes), two-week availability window, and the complete questionnaire is in the supplementary document."
    224       },
    225       "recruitment_methods_described": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Section 4 describes recruitment from 56 OSS communities spanning diverse domains (Microsoft, Google, Netflix, Red Hat, IBM, Kubernetes, Hugging Face, Python, TensorFlow). Email invitations were sent with consent forms, GDPR compliance, and IRB approval."
    229       },
    230       "data_pipeline_documented": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The pipeline is documented: 688 responses → exclusion of invalid responses → 442 final participants. EFA validated factor structure. PLS-SEM measurement model assessed. Qualitative coding: two authors independently coded 20% for IRR, then split remaining. However, criteria for 'invalid responses' are not detailed."
    234       }
    235     },
    236     "conflicts_of_interest": {
    237       "funding_disclosed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Acknowledgments section states: 'This work was supported by NSF Grant No. 2303043.'"
    241       },
    242       "affiliations_disclosed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "All three authors are from Oregon State University. The study does not evaluate any product from their institution, so no product-related conflict exists."
    246       },
    247       "funder_independent_of_outcome": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "NSF is an independent government funding agency with no financial stake in whether GenAI causes burnout or not."
    251       },
    252       "financial_interests_declared": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No competing interests or financial disclosure statement is present in the paper."
    256       }
    257     },
    258     "contamination": {
    259       "training_cutoff_stated": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not applicable — this paper does not evaluate a pre-trained model's capability on any benchmark. It is a survey study of developer burnout."
    263       },
    264       "train_test_overlap_discussed": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not applicable — no pre-trained model evaluation involved."
    268       },
    269       "benchmark_contamination_addressed": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Not applicable — no benchmark evaluation of a pre-trained model."
    273       }
    274     },
    275     "human_studies": {
    276       "pre_registered": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry) found in the paper."
    280       },
    281       "irb_or_ethics_approval": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 4 states: 'The survey began with the university's Institutional Review Board (IRB) approved consent form.'"
    285       },
    286       "demographics_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Table 1 reports detailed demographics: gender (90% men, 10% gender minorities), role (11 categories), organization size (4 levels), and experience (4 levels). N=442."
    290       },
    291       "inclusion_exclusion_criteria": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper describes broad recruitment targeting 56 OSS communities but does not state specific inclusion/exclusion criteria for who qualified as a valid participant. The reduction from 688 to 442 responses mentions 'excluding invalid responses' without stating the criteria."
    295       },
    296       "randomization_described": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "Not applicable — this is a cross-sectional survey study, not an experimental study with treatment/control conditions."
    300       },
    301       "blinding_described": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "Not applicable — this is a cross-sectional survey, not an experimental study where blinding would be relevant."
    305       },
    306       "attrition_reported": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "688 responses received, 442 retained after excluding invalid responses (attrition of 35.8%). For qualitative analysis, N=221 responded to the open-ended question, and N=361 responded to the resources question."
    310       }
    311     },
    312     "cost_and_practicality": {
    313       "inference_cost_reported": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "Not applicable — this is a survey study, not a computational method with inference costs."
    317       },
    318       "compute_budget_stated": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "Not applicable — this is a survey study with no significant computational requirements."
    322       }
    323     }
    324   },
    325   "red_flags": [
    326     {
    327       "flag": "Recruitment bias toward OSS communities",
    328       "detail": "Participants were recruited from 56 OSS communities including major tech companies (Microsoft, Google, Netflix). This likely overrepresents developers in large, tech-forward organizations and underrepresents developers in non-tech industries, small consultancies, or regions with different AI adoption patterns. The 90% male composition also limits generalizability."
    329     },
    330     {
    331       "flag": "Invalid response exclusion criteria unstated",
    332       "detail": "246 responses (35.8%) were excluded as 'invalid' without stating the exclusion criteria. This is a substantial proportion and the filtering criteria could meaningfully affect results."
    333     },
    334     {
    335       "flag": "Single-item measurement for learning resources",
    336       "detail": "Learning Resources is measured with a single Likert item, which the authors acknowledge cannot be assessed for reliability or included in EFA. This is a weak measurement for a construct that plays a key role in the structural model."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    342       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    343       "year": 2025,
    344       "relevance": "Randomized controlled trial finding that experienced open-source developers took 19% longer to complete tasks when using AI tools, directly relevant to AI productivity claims evaluation."
    345     },
    346     {
    347       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    348       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    349       "year": 2023,
    350       "relevance": "Key RCT on GitHub Copilot productivity impact, foundational study in the AI developer tools space."
    351     },
    352     {
    353       "title": "How much does AI impact development speed? An enterprise RCT",
    354       "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"],
    355       "year": 2025,
    356       "relevance": "Enterprise RCT measuring AI's impact on development speed, directly relevant to productivity claims."
    357     },
    358     {
    359       "title": "Accelerate State of DevOps 2024: A Decade with DORA",
    360       "authors": ["Derek DeBellis", "Kevin M. Storer", "Amanda Lewis"],
    361       "year": 2024,
    362       "relevance": "Large-scale survey (39,000 developers) finding only 2.1% productivity rise per 25% AI adoption increase, with 7.2% decline in delivery stability."
    363     },
    364     {
    365       "title": "Navigating the complexity of generative AI adoption in software engineering",
    366       "authors": ["Daniel Russo"],
    367       "year": 2024,
    368       "relevance": "Study on complexity of GenAI adoption in SE, relevant to adoption intent and developer experience research."
    369     },
    370     {
    371       "title": "Predicting attrition among software professionals: Antecedents and consequences of burnout and engagement",
    372       "authors": ["Bianca Trinkenreich", "Fabio Santos", "Klaas-jan Stol"],
    373       "year": 2024,
    374       "relevance": "Uses JD-R model for software professional burnout prediction, direct methodological predecessor."
    375     },
    376     {
    377       "title": "The SPACE of AI: Real-World Lessons on AI's Impact on Developers",
    378       "authors": ["Brian Houck", "Travis Lowdermilk", "Cody Beyer", "Steven Clarke", "Ben Hanrahan"],
    379       "year": 2025,
    380       "relevance": "Uses SPACE framework to evaluate AI's impact on developer productivity, relevant to productivity measurement."
    381     },
    382     {
    383       "title": "What Needs Attention? Prioritizing Drivers of Developers' Trust and Adoption of Generative AI",
    384       "authors": ["Rudrajit Choudhuri", "Bianca Trinkenreich", "Rahul Pandita"],
    385       "year": 2025,
    386       "relevance": "Studies trust and adoption intentions for GenAI among developers, directly relevant to adoption factors."
    387     },
    388     {
    389       "title": "Canaries in the Coal Mine? Six Facts about the Recent Employment Effects of Artificial Intelligence",
    390       "authors": ["Erik Brynjolfsson", "Bharat Chandar", "Ruyu Chen"],
    391       "year": 2025,
    392       "relevance": "Documents employment effects of AI including 13-20% decline in entry-level cognitive jobs."
    393     },
    394     {
    395       "title": "Burnout in software engineering: A systematic mapping study",
    396       "authors": ["Tien Rahayu Tulili", "Andrea Capiluppi", "Ayushi Rastogi"],
    397       "year": 2023,
    398       "relevance": "Systematic mapping of burnout research in SE, directly relevant to understanding developer well-being."
    399     },
    400     {
    401       "title": "A Review on Vibe Coding: Fundamentals, State-of-the-art, Challenges and Future Directions",
    402       "authors": ["Partha Pratim Ray"],
    403       "year": 2025,
    404       "relevance": "Reviews vibe coding phenomenon referenced in this paper's discussion of shifting developer learning practices."
    405     }
    406   ]
    407 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs