ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (25403B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Developer Productivity with GenAI",
      6     "authors": [
      7       "Sadia Afroz",
      8       "Zixuan Feng",
      9       "Katie Kimura",
     10       "Bianca Trinkenreich",
     11       "Igor Steinmacher",
     12       "Anita Sarma"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2510.24265",
     17     "doi": "10.48550/arXiv.2510.24265"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 'limited overall productivity change' and a 'productivity paradox,' both supported by the survey results showing most responses in the neutral/no-change range across SPACE dimensions.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper uses language like 'GenAI adoption affects developer productivity' (RQ) and 'GenAI tools streamline routine coding tasks' (Observation 4), implying causal relationships from a cross-sectional survey that cannot establish causation. Self-selection bias (more productive developers may adopt AI more) is not addressed.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'Developer Productivity with GenAI' is broad, but the sample is drawn from 56 OSS communities (not representative of all developers). The paper does not bound its claims to OSS developers or acknowledge this limitation.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations such as self-selection bias (productive developers adopt AI more), novelty effects, or confounding variables like experience level or task complexity driving the observed patterns.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 6 explicitly discusses the gap between perceived speed and actual productivity: 'Does faster task completion truly represent progress, or merely shift effort without improving outcomes?' The paper acknowledges it measures perceptions, not objective productivity.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 6) acknowledges self-reported data limitations in one sentence but does not provide substantive discussion.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No specific threats to validity are discussed. The paper only mentions 'such perceptions may not fully align with objective productivity outcomes' without addressing specific threats like self-selection bias, response bias, or sample representativeness.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No explicit scope boundaries are stated. The paper does not specify what its results do NOT show or what populations/settings are excluded from its claims.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All authors' affiliations are clearly listed: Oregon State University, Colorado State University, Northern Arizona University.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are operationalized through the SPACE framework (5 dimensions with 20 items), and GenAI adoption is measured by usage frequency (Never/Rarely/Sometimes/Often/Always). Definitions are sufficient for interpretation.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "RQ stated explicitly: 'How does GenAI adoption affect developer productivity across multiple dimensions?' Contribution is first large-scale (n=415) survey using SPACE framework for GenAI impact assessment.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 discusses productivity frameworks (DevEx, DORA, SPACE). Section 5 situates work relative to prior GenAI studies (Dohmke, Paradis, Tong). Positioning as comprehensive multidimensional view vs narrow task-focused studies is clear.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL or code archive is provided in the paper. The supplementary material reference [1] points to a Zenodo DOI for the appendix/questionnaire, but no analysis code is released.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "No survey response data is released. The supplementary material [1] contains only the questionnaire appendix, not the collected data.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment or dependency specifications are provided. The analysis approach (descriptive statistics, stacked bar charts, violin plots) is described but no software environment is specified.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No confidence intervals or error bars are reported. The paper presents only descriptive statistics (medians, percentages) via stacked bar charts and violin plots with no uncertainty quantification.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are used. The paper compares frequent vs. non-frequent AI users using only visual inspection of stacked bar charts and violin plots, with no formal tests despite making comparative claims.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No effect sizes are reported. Differences between groups are described qualitatively ('slightly higher') without quantifying the magnitude.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The sample size of 415 is reported but not justified. No power analysis or rationale for why this N is sufficient is provided.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Violin plots show distribution shapes but no standard deviations, IQRs, or other formal spread measures are reported in text or tables.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The study compares frequent AI users against non-frequent AI users as a baseline comparison group across all SPACE dimensions.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": false,
    188           "answer": false,
    189           "justification": "Not applicable — this is a survey study, not a system evaluation. The comparison groups are contemporary by design.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "Not applicable — no system with components to ablate.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "The study measures productivity across 5 SPACE dimensions with 20 individual items (S1-S4, P1-P3, A1-A7, C1-C4, E1-E2).",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "Not applicable — this is a survey study measuring perceptions, not evaluating system outputs.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Not applicable — no ML model or test set involved.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by each SPACE dimension and by individual survey items within each dimension (Figures 1-6).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "The paper discusses where GenAI fails to help: no improvement in collaboration (Observation 5), no improvement in test success or learning pace (Observation 3), continued developer fatigue (Observation 2).",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The core finding is essentially negative: GenAI adoption has not produced substantial productivity changes. The paper reports limited impact across most SPACE dimensions.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": false,
    238           "answer": false,
    239           "justification": "Not applicable — the study surveys developers about their GenAI usage generally; it does not use specific AI models in experiments.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": false,
    244           "answer": false,
    245           "justification": "Not applicable — no prompting is used in this study.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": false,
    250           "answer": false,
    251           "justification": "Not applicable — no AI models are run as part of this study.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "Not applicable — no agentic scaffolding is used.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": false,
    263           "justification": "The paper states 688 responses were received and 'after removing invalid entries, we kept 415 responses' but does not describe the criteria for determining invalidity or the filtering process.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw survey response data is not made available. Only aggregated results are presented in the figures.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.1.2 describes the data collection procedure: email invitations to 56 OSS communities, two-week collection window, anonymization per GDPR and IRB approval, $50 raffle incentive.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Section 3.1.2 describes recruitment from 56 OSS communities including organizational repositories (IBM, Oracle, Google, Adobe), infrastructure projects, AI projects, and data science communities.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The pipeline from 688 to 415 responses is mentioned but the filtering criteria for 'removing invalid entries' are not described. The 273 removed responses (40%) are unexplained.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "Not applicable — this is a survey study that does not evaluate any pre-trained model on a benchmark.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable — no model evaluation on benchmarks.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable — no benchmark evaluation.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry is provided.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": true,
    323           "justification": "Section 3.1 states 'The protocol was approved by our university's IRB.'",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Section 3.1.2 reports demographics: gender (90.6% men), organization size (57.83% large/extra-large), experience (82.17% >5 years), roles (36.87% full-stack, 16.87% backend, 15.42% data/ML).",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "No explicit inclusion or exclusion criteria are stated. The paper describes recruiting from OSS communities but does not specify who was eligible or what made an entry 'invalid' (273 responses removed).",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "Not applicable — this is a cross-sectional survey, not an experimental study with randomized assignment.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "Not applicable — cross-sectional survey, not an experimental study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": true,
    353           "justification": "The paper reports 688 responses received and 415 kept after removing invalid entries, documenting attrition from initial responses to final sample.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": false,
    360           "answer": false,
    361           "justification": "Not applicable — this is a survey study with no computational method.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": false,
    366           "answer": false,
    367           "justification": "Not applicable — survey study.",
    368           "source": "opus"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Frequent GenAI users perceive faster code output (LOC) but no improvement in test quality",
    376       "evidence": "Figure 3: 72.7% of frequent users report more LOC per day; but majority report 'no change' or worse on P2 (test pass rate) and P3 (API methods learned)",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "GenAI has minimal impact on team communication and collaboration",
    381       "evidence": "Figure 5: >75% of developers report no change across all communication items (C1-C4) regardless of usage frequency",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Overall productivity change from GenAI adoption is limited and not transformative",
    386       "evidence": "Figure 1: Median scores for frequent users remain within neutral ('no-change') range across all five SPACE dimensions",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "GenAI tools accelerate task execution but mask trade-offs in effort redistribution",
    391       "evidence": "Discussion notes developers trade manual coding time for reviewing/validating AI-generated outputs; Figure 4 shows 84.3% of frequent users report no time reduction on code reviews despite automation",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "GenAI adoption shows apparent benefits while concealing deeper individual silos in teams",
    396       "evidence": "Discussion: 'sacrificing collaboration' and questioning whether tech 'drives us toward deeper individual silos.' Supported by minimal collaboration improvement and acceleration of individual output.",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Developers perceive manageable workload and higher engagement with GenAI use",
    401       "evidence": "Figure 2: 68.6% of frequent users agree workload manageable (S1), lower exhaustion reported, but 62.8% still mark neutral/worse on exhaustion",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "observational",
    407     "survey",
    408     "human_studies"
    409   ],
    410   "key_findings": "Large-scale survey of 415 software developers across 56 OSS communities reveals that frequent GenAI users perceive modest improvements in coding speed and efficiency but report limited overall productivity change across the SPACE framework's five dimensions. Notably, GenAI shows negligible impact on team communication, collaboration, or software quality metrics, suggesting benefits are primarily individual. The authors frame this as a 'productivity paradox' in which apparent task acceleration masks redistributed effort, potential team silos, and unmeasured trade-offs between speed and deeper work quality.",
    411   "red_flags": [
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "Descriptive analysis only; no p-values, CIs, or hypothesis tests despite comparative claims (frequent vs non-frequent). Unclear if reported differences are real or noise."
    415     },
    416     {
    417       "flag": "Causal claims from observational data",
    418       "detail": "Paper claims 'GenAI adoption affects productivity' and 'tools streamline tasks', but study is purely observational correlation between usage frequency and perceived change. No experimental manipulation."
    419     },
    420     {
    421       "flag": "No limitations section",
    422       "detail": "Limitations scattered through discussion but never systematized. No dedicated threats-to-validity section or comprehensive limitations statement."
    423     },
    424     {
    425       "flag": "Self-report bias not discussed",
    426       "detail": "All data is developer self-reported perception of change. Social desirability bias, recall error, and measurement validity not addressed. Objective productivity data absent."
    427     },
    428     {
    429       "flag": "Highly skewed sample demographics",
    430       "detail": "90.6% male, 82% with >5 years of experience, 58% from large/extra-large orgs, all from OSS—limits generalizability to broader developer population (women, junior devs, small companies, closed-source work)."
    431     },
    432     {
    433       "flag": "GenAI tools not specified",
    434       "detail": "Paper mentions GitHub Copilot and ChatGPT in intro but never specifies which tools respondents actually used or their versions. Grouping heterogeneous tools as 'GenAI' without controlling for tool type."
    435     },
    436     {
    437       "flag": "Large unexplained attrition",
    438       "detail": "39.7% of responses (273/688) removed as 'invalid' with no documentation of criteria or analysis of whether removed respondents differed systematically."
    439     },
    440     {
    441       "flag": "No demographic subgroup analysis",
    442       "detail": "Despite noting 90.6% male sample, no separate reporting of results by gender, experience level, role, or organization type. Potential confounding masked."
    443     },
    444     {
    445       "flag": "Generalization claims not bounded",
    446       "detail": "Title and conclusions frame findings broadly ('Developer Productivity') without explicitly stating results apply only to OSS contributors, experienced developers, large organizations."
    447     }
    448   ],
    449   "cited_papers": [
    450     {
    451       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    452       "relevance": "Foundational framework (SPACE) that structures the entire study's measurement approach across 5 productivity dimensions."
    453     },
    454     {
    455       "title": "How much does AI impact development speed? An enterprise RCT",
    456       "relevance": "Paradis et al. 2025—RCT evidence from Google showing ~21% time reduction; contrasts with this survey's null findings and demonstrates the importance of experimental over observational study designs."
    457     },
    458     {
    459       "title": "AI slows down some experienced software developers, study finds",
    460       "relevance": "Tong 2025—reports 19% slower task completion with AI, directly contradicts hype; similar skeptical tone to this paper's 'productivity paradox' framing."
    461     },
    462     {
    463       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
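    464       "relevance": "Peng et al. 2023—controlled experiment in which developers using GitHub Copilot completed a programming task about 56% faster than a control group; represents an objective, task-level alternative to this paper's self-report perception approach."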
    465     },
    466     {
    467       "title": "\"Will I be replaced?\" Assessing ChatGPT's effect on software development and programmer perceptions of AI tools",
    468       "relevance": "Kuhail et al. 2024—explores negative effects of AI (skill erosion, reduced peer interaction) that resonate with this paper's finding of isolated individual gains."
    469     },
    470     {
    471       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    472       "relevance": "Vaithilingam et al. 2022—identifies debugging and validation effort as hidden costs, supporting this paper's theory of redistributed rather than eliminated effort."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 1,
    478       "justification": "Offers framework-level insights about GenAI productivity but no actionable techniques or tools practitioners can directly apply."
    479     },
    480     "surprise_contrarian": {
    481       "score": 2,
    482       "justification": "The 'productivity paradox' finding that GenAI hasn't meaningfully improved developer productivity contradicts the dominant industry narrative of massive AI-driven gains."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No safety, security, or risk angle is addressed in the paper."
    487     },
    488     "drama_conflict": {
    489       "score": 1,
    490       "justification": "Mildly questions the AI productivity hype promoted by GitHub/Microsoft but doesn't directly challenge specific company claims with strong evidence."
    491     },
    492     "demo_ability": {
    493       "score": 0,
    494       "justification": "Survey-based study with no code, tool, or demo to interact with."
    495     },
    496     "brand_recognition": {
    497       "score": 1,
    498       "justification": "Authors are from recognized universities (Oregon State, Colorado State, NAU) but not famous AI labs; the topic touches well-known tools like Copilot but only tangentially."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "45845800",
    505         "title": "From Memorization to Reasoning in the Spectrum of Loss Curvature",
    506         "points": 65,
    507         "comments": 14,
    508         "url": "https://news.ycombinator.com/item?id=45845800",
    509         "created_at": "2025-11-07T12:43:49Z"
    510       }
    511     ],
    512     "top_points": 65,
    513     "total_points": 65,
    514     "total_comments": 14
    515   }
    516 }

Impressum · Datenschutz