scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24202B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Developer Productivity with GenAI",
      6     "authors": [
      7       "Sadia Afroz",
      8       "Zixuan Feng",
      9       "Katie Kimura",
     10       "Bianca Trinkenreich",
     11       "Igor Steinmacher",
     12       "Anita Sarma"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2510.24265",
     17     "doi": "10.48550/arXiv.2510.24265"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims of limited productivity change and productivity paradox are directly supported by survey data showing medians in neutral range and coding throughput gains without quality/learning improvements.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "Paper frames research question as causal ('how does GenAI adoption affect...') but uses observational survey comparing self-selected frequent vs non-frequent users without randomization or control for selection bias.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Sample is 90.6% male, 82% >5 years experience, 58% large organizations, recruited from 56 specific OSS/corporate communities. Conclusions extrapolate broadly to 'AI-mediated development' without acknowledging narrow demographics.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Paper does not discuss alternative explanations for why frequent users report slightly higher scores (selection bias from optimistic adopters, confounding by user type or tool quality perceptions).",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Study measures self-reported perceptions of productivity, not actual metrics (code quality, velocity, maintainability). Paper acknowledges perceptions 'may not fully align' but inadequately addresses this fundamental distinction in claims.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No dedicated limitations section. Threats to validity scattered briefly in discussion (e.g., single sentence on self-reported data).",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Specific threats (social desirability bias, selection bias, recall bias, confounding from self-selection into frequent/non-frequent groups) are not articulated.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Paper does not explicitly state scope boundaries: what results do NOT show (no objective metrics, no code quality analysis, no team-level impacts beyond perception).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source mentioned anywhere in paper. If unfunded, should be stated explicitly.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Authors from Oregon State, Colorado State, Northern Arizona universities clearly listed. Evaluating third-party tools, no affiliation conflicts.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funder identified.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement provided.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Productivity operationalized via SPACE but not precisely defined; GenAI tools mentioned broadly (Copilot, ChatGPT) without version/date specification; frequent/non-frequent threshold not specified.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Explicitly states goal: examine GenAI impact on developer productivity across SPACE dimensions to fill gap left by fragmented prior studies focused on narrow metrics.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Engages substantively with prior frameworks (DevEx, DORA, SPACE) and empirical work (Copilot productivity claims, ChatGPT studies), positioning this as more comprehensive multi-dimensional analysis.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": false,
    124           "answer": false,
    125           "justification": "No analysis code released. Survey instrument referenced in supplementary material but not provided with paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": false,
    130           "answer": false,
    131           "justification": "Raw survey data not released (justified by privacy protection), which prevents independent verification of results.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": false,
    136           "answer": false,
    137           "justification": "Survey study, not computational. No software environment or analysis tools specified.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Recruitment details provided but insufficient for replication. Invalid entry exclusion criteria not defined. Survey instrument not fully reproduced in paper.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No confidence intervals reported. Violin plots show distributions but no CIs presented. Percentages lack uncertainty bounds.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests reported for comparisons between frequent and non-frequent users. All differences lack p-values.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "Percentages reported but no standardized effect sizes (Cohen's d) or between-group effect metrics.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "415 respondents retained but no power analysis or justification for sample size adequacy provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Violin plots show distributions, but standard deviations and confidence intervals are not reported in tables. Only aggregated medians/means are given.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Compares frequent vs non-frequent AI user groups as baseline.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Both groups from 2025 survey period, contemporary comparison.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "Survey study, not system evaluation. Ablation not applicable.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "21 survey items across 5 SPACE dimensions provide multiple metrics.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Self-reported human evaluations of developer experience across productivity dimensions.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Not a prediction task.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Detailed breakdowns by SPACE dimension, item-level within dimensions, and frequent vs non-frequent usage.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Reports null findings (no change in communication, test success, learning) but does not discuss specific failure patterns or when GenAI was unhelpful.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Reports 70%+ no change in communication, majority no improvement in test success, main finding is limited overall impact.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Mentions 'GitHub Copilot and ChatGPT' without versions, training dates, or model snapshots. Participants could use any version.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Survey instrument promised in supplementary materials but not reproduced in main paper. Cannot verify exact questions asked.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": false,
    250           "answer": false,
    251           "justification": "Not applicable for survey.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "Evaluates natural GenAI usage, not controlled scaffolding. Not applicable.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": false,
    263           "justification": "Paper removes 273/688 responses (39.7%) as 'invalid entries' but exclusion criteria not defined. No documentation of filtering decisions.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": false,
    270           "answer": false,
    271           "justification": "Raw survey data not released (privacy protected). Prevents independent verification.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Describes recruitment from 56 OSS/corporate communities, email invitations, 5-8 minute survey, $50 raffle, two-week period.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Explicitly lists 56 OSS communities (Apache, PyTorch, etc.) and organizational repos (IBM, Oracle, Google, Adobe, JetBrains) as recruitment sources.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "Pipeline partially documented: survey adapted from prior work, piloted (n=7), 688 responses collected, invalid entries removed. But the criteria for classifying entries as invalid are undefined.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "Not applicable; not evaluating model capabilities on benchmarks.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration mentioned (OSF, ClinicalTrials.gov, etc.).",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": true,
    323           "justification": "Explicitly states 'The protocol was approved by our university's IRB.'",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Reports gender (90.6% male), role distribution (full-stack 36.87%, backend 16.87%, data/ML 15.42%), experience (82.17% >5 years), organization size (57.83% large).",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "Recruited from specific communities but explicit inclusion/exclusion criteria not stated. Who could participate undefined.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": false,
    341           "justification": "Not randomized. Participants self-selected into frequent vs non-frequent groups. No random assignment.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "Not feasible for survey; participants knew they were surveyed about GenAI. No blinding.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "688→415 retention (60.3%) but invalid entry criteria not defined. Cannot assess attrition bias.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": false,
    360           "answer": false,
    361           "justification": "Not applicable; evaluates developer use of existing commercial tools, not inference cost.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": false,
    366           "answer": false,
    367           "justification": "Not applicable for survey study.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "GenAI adoption does not produce substantial positive or negative shifts in perceived productivity across SPACE dimensions",
    376       "evidence": "Figure 1 shows all median aggregated scores remain in neutral (no-change) range; Observation 1 explicitly states GenAI integration has not yet produced substantial shifts.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Frequent GenAI users report slightly higher perceived improvements in Efficiency and Satisfaction than non-frequent users",
    381       "evidence": "Figure 2 shows 68.6% frequent users vs 62.8% non-frequent report manageable workload; Figure 6 shows less time spent per work item for frequent users.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "GenAI increases coding output volume but does not improve test success or learning pace",
    386       "evidence": "Figure 3 shows 72.7% of frequent users report more LOC per day, but majority report no change in test pass rate and API methods learned; Observation 3 directly states this.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "GenAI tools streamline routine coding tasks but have limited impact on evaluative tasks like code review",
    391       "evidence": "Figure 4 shows more commits and test cases for frequent users (Activity increased), but 84.3% report no reduction in code review time; Observation 4 explicitly makes this distinction.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "GenAI has not reshaped team communication or collaboration practices",
    396       "evidence": "Figure 5 shows 70%+ of both groups report no change across all communication items; Observation 5 states impact remains largely individual not collective.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "The 'productivity paradox' exists: developers become faster but do not necessarily create better software",
    401       "evidence": "Discussion section and abstract explicitly frame this paradox; supported by Figure 3 showing output volume increases without quality improvements.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "observational",
    407     "qualitative"
    408   ],
    409   "key_findings": "Survey of 415 developers using the SPACE framework reveals GenAI adoption has minimal perceived impact on developer productivity across five dimensions. While frequent GenAI users report slightly higher satisfaction and efficiency, they show no meaningful improvements in performance quality, overall activity levels, or team collaboration. The paper identifies a 'productivity paradox' where developers work faster at coding tasks but do not achieve better software quality, stronger teamwork, or deeper engagement—suggesting GenAI benefits are primarily individual and potentially shallow.",
    410   "red_flags": [
    411     {
    412       "flag": "Causal claims without experimental design",
    413       "detail": "Paper frames RQ causally ('how does GenAI adoption affect...') using observational survey comparing self-selected frequent/non-frequent users with no randomization. Selection bias uncontrolled—users may differ in optimism, adoption propensity, or tool perception, so group differences may reflect these traits rather than actual tool impact."
    414     },
    415     {
    416       "flag": "Perceptions measured, outcomes claimed",
    417       "detail": "Study measures self-reported perceptions of productivity, not actual metrics (code quality, velocity, maintainability, bug rates). Paper acknowledges perceptions 'may not fully align with objective outcomes' but inadequately addresses this gap between measurement and claims."
    418     },
    419     {
    420       "flag": "No statistical significance testing",
    421       "detail": "All between-group comparisons lack p-values, CIs, or significance tests. Cannot determine if observed differences (e.g., 68.6% vs 62.8%) are meaningful or noise."
    422     },
    423     {
    424       "flag": "Narrow, unrepresentative sample",
    425       "detail": "Sample heavily skewed: 90.6% male, 82% with >5 years experience, 58% in large orgs, recruited from specific 56 OSS/corporate communities. Generalization to broader developer population unjustified."
    426     },
    427     {
    428       "flag": "Invalid entry removal criteria undefined",
    429       "detail": "39.7% of responses (273/688) removed as 'invalid' but exclusion criteria not specified. Could introduce systematic bias. Attrition rate not transparently reported."
    430     },
    431     {
    432       "flag": "No study pre-registration",
    433       "detail": "Study not pre-registered (OSF, ClinicalTrials.gov). Increases risk of HARKing and flexibility in analysis decisions post-hoc."
    434     },
    435     {
    436       "flag": "Model versions not specified",
    437       "detail": "Paper mentions 'GitHub Copilot and ChatGPT' without versions, training dates, or snapshots. Participants could use any version; comparisons not replicable."
    438     },
    439     {
    440       "flag": "Confounding variables not controlled",
    441       "detail": "Self-selection into frequent/non-frequent use not accounted for. Differences could reflect user optimism, adoption propensity, or perceived tool quality rather than actual GenAI impact."
    442     },
    443     {
    444       "flag": "No limitations section",
    445       "detail": "Threats to validity scattered and brief. Lacks dedicated section addressing specific validity concerns (social desirability bias, recall bias, selection bias)."
    446     }
    447   ],
    448   "cited_papers": [
    449     {
    450       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    451       "relevance": "Prior empirical work showing faster task completion with Copilot; key baseline for comparison."
    452     },
    453     {
    454       "title": "Sea change in software development: Economic and productivity analysis of the ai-powered developer lifecycle",
    455       "relevance": "Claims 55.8% speedup from Copilot; represents optimistic prior work that this paper tempers."
    456     },
    457     {
    458       "title": "How much does AI impact development speed? An enterprise RCT",
    459       "relevance": "Experimental evidence from Google RCT showing 21% speedup; most rigorous prior work, contrasts with this survey approach."
    460     },
    461     {
    462       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    463       "relevance": "Shows AI-generated code requires rework and debugging; supports productivity paradox hypothesis."
    464     },
    465     {
    466       "title": "The SPACE of Developer Productivity: There's more to it than you think",
    467       "relevance": "Foundational framework (SPACE: Satisfaction, Performance, Activity, Communication, Efficiency) organizing this paper's analysis."
    468     },
    469     {
    470       "title": "Software developers' perceptions of productivity",
    471       "relevance": "Prior survey methodology on developer perceptions; methods adapted for GenAI context in this paper."
    472     },
    473     {
    474       "title": "Will I be replaced? Assessing ChatGPT's effect on software development and programmer perceptions of AI tools",
    475       "relevance": "Related work on ChatGPT impact showing over-reliance erodes coding skills; supports finding that GenAI may not improve learning pace."
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 2,
    481       "justification": "Directly addresses practitioner question about GenAI ROI, but findings are mixed/limited (no productivity gains), reducing actionable value for adoption decisions."
    482     },
    483     "surprise_contrarian": {
    484       "score": 2,
    485       "justification": "Somewhat contrarian to industry hype; suggests productivity paradox and limited real benefits. Challenges narrative of GenAI as productivity silver bullet."
    486     },
    487     "fear_safety": {
    488       "score": 0,
    489       "justification": "Study focuses purely on productivity perceptions, not AI safety, misuse, or risk concerns. No fear/safety angle."
    490     },
    491     "drama_conflict": {
    492       "score": 1,
    493       "justification": "Mild tension between industry optimism and measured reality, but not dramatic or controversial. Raises questions rather than making strong claims."
    494     },
    495     "demo_ability": {
    496       "score": 0,
    497       "justification": "Survey-based study with no interactive demo, tool, or system. Results are observational/statistical, not demonstrable."
    498     },
    499     "brand_recognition": {
    500       "score": 2,
    501       "justification": "Authors from solid regional universities (Oregon State, Colorado State, Northern Arizona) but not top-tier research labs. Mid-level prestige."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "45845800",
    508         "title": "From Memorization to Reasoning in the Spectrum of Loss Curvature",
    509         "points": 65,
    510         "comments": 14,
    511         "url": "https://news.ycombinator.com/item?id=45845800",
    512         "created_at": "2025-11-07T12:43:49Z"
    513       }
    514     ],
    515     "top_points": 65,
    516     "total_points": 65,
    517     "total_comments": 14
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs