scan-v4.json (33788B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 6 "authors": [ 7 "Joel Becker", 8 "Nate Rush", 9 "Beth Barnes", 10 "David Rein" 11 ], 12 "year": 2025, 13 "venue": "arXiv", 14 "arxiv_id": "2507.09089", 15 "doi": null 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract claims (19% slowdown, developer forecast of 24% speedup, post-hoc estimate of 20% speedup, expert predictions of 38-39% speedup) are all supported by the results in the paper (Figure 1, Section 3.1, Table 6).", 23 "source": "opus" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper makes causal claims ('AI tooling slowed developers down') and uses an RCT design with randomized treatment assignment, which is the gold standard for causal inference. Balance checks confirm successful randomization (Table 4). Issues are defined before randomization to prevent confounding.", 29 "source": "opus" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper explicitly bounds generalization in Section 4.1 ('Key Caveats'), Table 2 ('What the evidence does not show'), and throughout the discussion. 
It states results do not imply AI is unhelpful in other settings, with future models, or with better elicitation strategies.", 35 "source": "opus" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": true, 40 "justification": "The paper systematically investigates 21 alternative explanations for the slowdown in Section 3.3 and Appendix C, categorized into direct productivity loss, experimental artifacts, factors raising developer performance, and factors limiting AI performance.", 41 "source": "opus" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper explicitly discusses the proxy-outcome distinction. Section 1 critiques prior work for using non-fixed outcome measures (lines of code, PRs) that may not correspond to productivity. It uses task completion time as its measure, notes this is 'a fixed outcome measure,' and discusses scope creep (Section C.2.3) as a potential gap between time and productivity.", 47 "source": "opus" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 4.1 'Key Caveats' serves as a dedicated limitations section, discussing setting-specific factors, AI-specific factors, and agent capabilities. Table 2 explicitly lists what the evidence does not show.", 55 "source": "opus" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper discusses 21 specific threats grouped into four categories (Table 1, Appendix C). 
These are highly specific: the small sample ('Our sample of 16 developers') is acknowledged, as are experimentally driven overuse, sampling bias in recruitment, an unfamiliar development environment, and more.", 61 "source": "opus" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "Table 2 is a gold-standard scope boundary statement, listing six specific things the evidence does NOT show (e.g., 'AI systems do not currently speed up many or most software developers'). Section 4.1 adds specific caveats about setting-specific factors and AI-specific factors.", 67 "source": "opus" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding source is disclosed. METR (Model Evaluation & Threat Research) is the authors' organization, but no grants, sponsors, or funding agencies are mentioned.", 75 "source": "opus" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Authors are identified as being from METR (Model Evaluation & Threat Research). METR is not a developer of the AI tools being evaluated (Cursor, Claude, GPT-4o).", 81 "source": "opus" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "Funding source is not disclosed, so independence cannot be assessed. 
METR's organizational mission involves AI safety and capability evaluation, which could create incentives in either direction regarding AI productivity results.", 87 "source": "opus" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement or financial interest disclosure is present in the paper.", 93 "source": "opus" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Productivity is operationalized as task completion time with the speedup formula S defined mathematically in Section 2.3; 'AI-allowed/disallowed' conditions are precisely specified; 'speedup' and 'slowdown' are given quantitative definitions with noted abuse of language flagged.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 1 explicitly lists five ways this study complements existing literature: frontier models, live OSS tasks, fixed outcome measure, experienced engineers with repository familiarity, and rich data collection.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 1.1 reviews relevant literature across five subsections; Table 3 systematically compares this study to six prior works on four key methodological dimensions, explaining how design choices account for divergent findings.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": false, 123 "justification": "No repository URL, code archive, or analysis scripts are provided in the paper. 
The paper describes detailed data collection and regression analyses but does not release code.", 124 "source": "opus" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": false, 129 "justification": "No dataset download link is provided. The paper describes collecting screen recordings, developer forecasts, and implementation times, but does not release the underlying data.", 130 "source": "opus" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "No environment specifications, requirements files, or dependency information is provided for reproducing the analyses.", 136 "source": "opus" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "No step-by-step reproduction instructions are provided. The regression specification is described in Appendix D, but there are no scripts or instructions to replicate the analysis.", 142 "source": "opus" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": true, 149 "justification": "95% confidence intervals are reported throughout, using HC3 standard errors (Section D.2, Figure 15). The paper also reports CIs from alternative uncertainty estimation procedures including clustered standard errors and hierarchical bootstrap.", 150 "source": "opus" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": true, 155 "justification": "The paper uses regression-based inference with HC3 standard errors and reports p-values (e.g., Table 4 balance checks with Welch t-tests, Table 5 chi-square test). Figure 15 shows alternative uncertainty procedures.", 156 "source": "opus" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "The primary result is reported as a 19% increase in completion time (from the log-linear regression coefficient transformed via exp(β)-1). 
Context is provided with baseline completion times (Figure 4) and the effect is reported with confidence intervals.", 162 "source": "opus" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "No power analysis or sample size justification is provided. The study has 16 developers and 246 tasks. The paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting our data' (Section 3.3) but does not justify why 16 developers was the chosen sample size.", 168 "source": "opus" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": true, 173 "justification": "Standard deviations are reported in Table 4 for forecasted times. Multiple uncertainty estimation procedures are compared in Figure 15. The paper reports variance across developers (Figure 17) and across subsets.", 174 "source": "opus" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "The study design inherently includes a baseline: the AI-disallowed condition serves as the control/baseline against which AI-allowed performance is compared. The paper also compares results against prior literature (Table 3).", 182 "source": "opus" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "The comparison is between AI-allowed and AI-disallowed conditions within the same study, using contemporary AI tools (Claude 3.5/3.7 Sonnet, GPT-4o, Gemini 2.5 Pro). Prior work comparisons in Table 3 include contemporary studies.", 188 "source": "opus" 189 }, 190 "ablation_study": { 191 "applies": false, 192 "answer": false, 193 "justification": "This is an RCT measuring a single treatment (AI allowed vs. not allowed), not a multi-component system. 
There is no system to ablate.", 194 "source": "opus" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper reports multiple outcome measures: self-reported implementation time, screen recording time, pre-review and post-review time, lines of code per hour, and activity time breakdowns (Figure 6, Figure 22). Alternative estimators are also compared (Figure 13).", 200 "source": "opus" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": true, 205 "justification": "The study includes extensive human evaluation: 143 hours of screen recordings were manually labeled with fine-grained activity labels (Section 2.4), exit interviews and surveys were conducted (Section G.5), and PR quality was assessed through the repositories' standard review processes.", 206 "source": "opus" 207 }, 208 "held_out_test_set": { 209 "applies": false, 210 "answer": false, 211 "justification": "This is an RCT, not a prediction task. There is no train/test split concept applicable here.", 212 "source": "opus" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Extensive breakdowns are provided: per-developer speedup (Figure 17), by prior task exposure and external resource needs (Figure 7), by AI experience (Figure 10), by scope creep (Figure 9), by month (Figure 23), and by activity type (Figures 6, 19-21).", 218 "source": "opus" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "The entire paper is essentially a discussion of a failure case (AI slowing developers down). Section C.1.4 discusses low AI reliability and developers' experiences with failed AI generations. 
Qualitative examples of AI failures are provided throughout.", 224 "source": "opus" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "The core finding is a negative result: AI tools slow down experienced developers by 19%, contradicting expectations. The paper also reports which hypothesized contributing factors had evidence against them (Table 1, Section C.3).", 230 "source": "opus" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "The paper specifies models used: Claude 3.7 Sonnet (thinking mode), Claude 3.7 Sonnet, Claude 3.5 Sonnet, GPT-4o, Gemini 2.5 Pro, and o1, with usage percentages (Section C.3.6). These are marketing names without snapshot dates, but for an RCT studying tool usage in the wild, this represents what developers actually used.", 238 "source": "opus" 239 }, 240 "prompts_provided": { 241 "applies": false, 242 "answer": false, 243 "justification": "This is an RCT where developers use AI tools naturally. There are no researcher-designed prompts — developers prompt AI tools as they see fit. The study measures the effect of allowing AI usage, not of specific prompts.", 244 "source": "opus" 245 }, 246 "hyperparameters_reported": { 247 "applies": false, 248 "answer": false, 249 "justification": "This is an RCT where developers use commercial AI tools (Cursor Pro) with default settings. The researchers do not control hyperparameters — they are studying the tools as used in practice.", 250 "source": "opus" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "The study evaluates Cursor Pro as a third-party tool used as-is by developers. The authors cannot describe Cursor's internal scaffolding. 
Section F.2.2 provides a primer on Cursor's features (chat, agent mode, autocomplete) as understood from the user perspective.", 256 "source": "opus" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "The paper documents data preprocessing: imputation of post-review times for unreviewed issues (Section C.3.4), filtering criteria for screen recordings (>10% broken time, >20% discrepancy with self-reported time, cheating removal), resulting in 74 of 128 recordings (Section 2.4). Issue dropout is documented (Section C.3.3).", 262 "source": "opus" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": false, 269 "justification": "Raw data (screen recordings, implementation times, forecasts, survey responses) is not publicly available. Only aggregated results and regression outputs are presented.", 270 "source": "opus" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Data collection is described in detail: Section 2.2.2 covers screen recordings, Cursor analytics, implementation notes. Section 2.4 covers developer forecasts, expert forecasts, exit interviews, and fine-grained activity labels. Appendix G provides full developer instructions.", 276 "source": "opus" 277 }, 278 "recruitment_methods_described": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section G describes recruitment: outreach via professional networks, Reddit communities, and GitHub profiles; filtering for 500+ star repos, 5+ recent commits; 51 initial respondents filtered to ~20 then 16. Selection criteria are explicit.", 282 "source": "opus" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "The pipeline is documented: 51 interested developers → ~20 meeting criteria → 16 final participants (3 dropped, Section G.6). 246 tasks completed (136 AI-allowed, 110 AI-disallowed). 
Screen recording filtering: 128 recordings → 74 valid after quality filters (Section 2.4). Imputation methods for missing post-review times are described (Section C.3.4).", 288 "source": "opus" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": false, 294 "answer": false, 295 "justification": "This is an RCT measuring developer productivity with AI tools, not an evaluation of a pre-trained model's capability on a benchmark.", 296 "source": "opus" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": false, 300 "answer": false, 301 "justification": "This is an RCT measuring developer productivity, not a benchmark evaluation where train/test overlap is relevant.", 302 "source": "opus" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": false, 306 "answer": false, 307 "justification": "This is an RCT, not a benchmark evaluation.", 308 "source": "opus" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": true, 314 "answer": false, 315 "justification": "No mention of pre-registration (OSF, AsPredicted, AEA registry, or similar) anywhere in the paper.", 316 "source": "opus" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": true, 320 "answer": false, 321 "justification": "No mention of IRB or ethics board approval. The study involves human participants (developers) completing tasks and being recorded, but no ethics review is mentioned.", 322 "source": "opus" 323 }, 324 "demographics_reported": { 325 "applies": true, 326 "answer": true, 327 "justification": "Developer demographics are reported: typically over a decade of software experience, 5 years average on their repository, 1,500 commits on average, 59% of repository lifetime. AI experience levels: 93% used LLMs, 44% used Cursor (Section 2.1). 
Table 7 provides per-developer statistics.", 328 "source": "opus" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": true, 332 "answer": true, 333 "justification": "Section G.2.1 lists explicit eligibility criteria: 1+ year professional experience, 6+ months as active maintainer, repository must be open source with 500+ stars and 3000+ LOC, must have a list of issues. Section G describes further filtering steps.", 334 "source": "opus" 335 }, 336 "randomization_described": { 337 "applies": true, 338 "answer": true, 339 "justification": "Randomization is described: issues randomized via 'simulated fair coin flip' (Section 2.2). 25 early issues used block randomization before switching (Section E.3). Balance checks confirm successful randomization (Table 4, Table 5).", 340 "source": "opus" 341 }, 342 "blinding_described": { 343 "applies": true, 344 "answer": true, 345 "justification": "The paper explicitly states 'this study is not blinded' (Section C.2.1) — developers know which condition they are in. This is acknowledged as a limitation and investigated as a potential confound (experimentally driven overuse, Section C.2.1).", 346 "source": "opus" 347 }, 348 "attrition_reported": { 349 "applies": true, 350 "answer": true, 351 "justification": "Section G.6 reports 3 developers were dropped (2 due to repository pausing development, 1 due to cheating). Section C.3.3 reports issue dropout: 7 AI-disallowed and 6 AI-allowed issues not completed, plus 24 intentionally dropped issues with reasons. Robustness checks on dropout subsets are provided.", 352 "source": "opus" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "No AI inference costs or token consumption is reported. 
The paper mentions providing Cursor Pro subscriptions and paying developers $150/hour but does not report API costs or token usage.", 360 "source": "opus" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "Total study cost is partially disclosed ($150/hour for developers, with ~40 hour cap per developer) but total computational budget, API costs, and infrastructure costs are not stated.", 366 "source": "opus" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "AI tools increased task completion time by 19% for experienced OSS developers using Cursor Pro with Claude 3.5/3.7 Sonnet", 374 "evidence": "Log-linear regression on 246 tasks (136 AI-allowed, 110 AI-disallowed) with 95% CIs using HC3 standard errors; robust across five alternative estimators including simple ratio, alternative regression specifications, and screen-recording-based times", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Developers predicted 24% speedup before the study and estimated 20% speedup after, despite experiencing 19% slowdown", 379 "evidence": "Pre-task developer forecasts collected for all issues before randomization; post-study exit survey with weighted mean methodology (Equations 14-15); both directions confirmed with confidence intervals in Figure 1", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Economics experts predicted 39% speedup and ML experts predicted 38% speedup, dramatically overestimating AI productivity benefits", 384 "evidence": "Incentivized forecasts from 34 economics experts and 54 ML experts; Table 6 shows full distribution statistics including min, quartiles, max", 385 "supported": "strong" 386 }, 387 { 388 "claim": "High developer familiarity with repositories contributes to the slowdown effect", 389 "evidence": "Moderate evidence from subgroup analyses (Figure 7): higher prior task exposure and lower external resource needs correlate with greater slowdown; qualitative developer reports 
corroborate; analysis is underpowered for statistical significance", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "AI code generation acceptance rate below 44% and high cleanup burden indicate low AI reliability in this setting", 394 "evidence": "Cursor analytics from 13 of 16 developers show <44% acceptance rate; 56% of developers report needing major changes to AI code; 100% report needing to modify AI-generated code; 9% of time spent reviewing/cleaning outputs", 395 "supported": "strong" 396 }, 397 { 398 "claim": "75% of individual developers experienced slowdown when AI usage was allowed", 399 "evidence": "Per-developer speedup estimates in Figure 17 using heterogeneous treatment effects methodology (Equations 8-13); 12 of 16 developers show point estimates indicating slowdown", 400 "supported": "strong" 401 }, 402 { 403 "claim": "Fully autonomous agents can implement core issue functionality but fail on quality requirements (documentation, linting, tests)", 404 "evidence": "Described as 'preliminary evidence (forthcoming)' in Section 4.1 for Claude 3.7 Sonnet on study repositories; not yet published and not directly supported by this paper's data", 405 "supported": "weak" 406 } 407 ], 408 "methodology_tags": [ 409 "rct" 410 ], 411 "key_findings": "An RCT with 16 experienced open-source developers (5-year average repository tenure) completing 246 real tasks found that frontier AI tools (Cursor Pro with Claude 3.5/3.7 Sonnet) increased task completion time by 19%, directly contradicting developer predictions of 24% speedup and expert forecasts of 38-39% speedup from 88 economics and ML researchers. Developers maintained their mistaken perception of AI benefit even after completing the study, with post-hoc estimates of 20% speedup. 
Contributing factors identified include high developer familiarity making AI less useful, large/complex repository contexts where AI reliability was low (<44% code acceptance rate), and over-optimism leading to continued AI use despite diminishing returns. Results are robust across multiple estimators, outcome measures, and analysis subsets, though 16-developer sample size limits subgroup power.", 412 "red_flags": [ 413 { 414 "flag": "No pre-registration", 415 "detail": "The 21-factor analysis in Appendix C is described as 'a priori' but no pre-registration on OSF, AEA, or similar registry is cited; the boundary between genuinely pre-specified and post-hoc hypotheses cannot be verified." 416 }, 417 { 418 "flag": "Small sample (n=16 developers)", 419 "detail": "Only 16 developers participate; the paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting our data,' yet multiple subgroup conclusions are drawn with wide confidence intervals." 420 }, 421 { 422 "flag": "Non-blinded design", 423 "detail": "Developers know their condition assignment; Section C.2.1 finds mixed evidence about experimentally-driven AI overuse but cannot rule it out; screen-recording labelers are also not blinded to condition." 424 }, 425 { 426 "flag": "Self-reported primary outcome", 427 "detail": "Completion times are self-reported per issue; while validated against screen recording durations on a subset (25% vs 24% slowdown), systematic bias from time-tracking burden in the study context cannot be eliminated." 428 }, 429 { 430 "flag": "No IRB or ethics disclosure", 431 "detail": "Human subjects research involving paid participants, screen recordings of work, and exit interviews contains no mention of institutional ethics review or informed consent procedures." 
432 }, 433 { 434 "flag": "No data or code release", 435 "detail": "Neither raw data nor analysis code is made publicly available, preventing independent verification of the primary statistical claims despite detailed methodological appendices." 436 }, 437 { 438 "flag": "No funding disclosure", 439 "detail": "METR's funding sources are not disclosed despite the organization evaluating tools from Anthropic (Claude) and Anysphere (Cursor); potential funder relationships to evaluated companies cannot be assessed." 440 } 441 ], 442 "cited_papers": [ 443 { 444 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 445 "relevance": "Primary methodological comparison point; found 56% speedup on synthetic tasks using non-frontier models, contrasting with this study's 19% slowdown on real tasks" 446 }, 447 { 448 "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial", 449 "relevance": "Another RCT finding 21% speedup using synthetic tasks; used as comparison for study design differences (synthetic vs. real tasks, non-fixed vs. fixed outcomes)" 450 }, 451 { 452 "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers", 453 "relevance": "Three field experiments finding 26% output increase using non-fixed outcome measures (PRs); key methodological contrast for fixed vs. 
non-fixed outcome measure debate" 454 }, 455 { 456 "title": "Generative AI at Work", 457 "relevance": "Customer service AI study finding benefits concentrated among less experienced workers; cited for heterogeneous effects framework motivating focus on expert developers" 458 }, 459 { 460 "title": "Experimental Evidence on the Productivity Effects of Generative Artificial Intelligence", 461 "relevance": "Non-software domain (writing) RCT finding productivity gains; provides cross-domain comparison for AI productivity effects" 462 }, 463 { 464 "title": "Generative AI and Labour Productivity: A Field Experiment on Coding", 465 "relevance": "BIS field experiment finding 55% output increase measured in PRs; cited as example of non-fixed outcome measure limitations" 466 }, 467 { 468 "title": "Artificial Intelligence: The Ambiguous Labor Market Impact of Automating Prediction", 469 "relevance": "Agrawal et al. theoretical framework treating AI as prediction cost reduction with distributional consequences; motivates heterogeneous effects by experience level" 470 }, 471 { 472 "title": "Measuring AI Ability to Complete Long Tasks", 473 "relevance": "METR's related work on autonomous task completion benchmarks; provides context on AI capability measurement and benchmark vs. real-world performance gap" 474 } 475 ], 476 "engagement_factors": { 477 "practical_relevance": { 478 "score": 3, 479 "justification": "Directly measures impact of currently available tools (Cursor Pro, Claude 3.5/3.7 Sonnet) on real developer workflows, immediately actionable for practitioners and managers evaluating AI tool adoption." 480 }, 481 "surprise_contrarian": { 482 "score": 3, 483 "justification": "Finding is dramatically contrary to near-universal expert and practitioner consensus; 19% slowdown vs. 38-39% predicted speedup from 88 experts represents a complete reversal of expectations with rigorous methodology." 
484 }, 485 "fear_safety": { 486 "score": 1, 487 "justification": "Indirectly relevant to AI safety by demonstrating that benchmark performance substantially overestimates real-world AI capabilities, suggesting capability assessments for AI risk may be miscalibrated." 488 }, 489 "drama_conflict": { 490 "score": 2, 491 "justification": "Results directly contradict multiple prior published studies and expert consensus, creating methodological controversy about which study designs are appropriate for measuring AI productivity impact." 492 }, 493 "demo_ability": { 494 "score": 1, 495 "justification": "Cannot replicate the RCT; tools (Cursor Pro, Claude) are publicly available for personal experimentation but no interactive demo or replication dataset is released." 496 }, 497 "brand_recognition": { 498 "score": 2, 499 "justification": "METR is a recognized AI safety evaluation organization; study evaluates Claude (Anthropic) and Cursor Pro (Anysphere), lending credibility through well-known tool and lab associations." 
500 } 501 }, 502 "hn_data": { 503 "threads": [ 504 { 505 "hn_id": "36781015", 506 "title": "How is ChatGPT's behavior changing over time?", 507 "points": 289, 508 "comments": 178, 509 "url": "https://news.ycombinator.com/item?id=36781015", 510 "created_at": "2023-07-19T01:06:12Z" 511 }, 512 { 513 "hn_id": "41215631", 514 "title": "Ask HN: Has degradation in the quality of ChatGPT and Claude been proven?", 515 "points": 42, 516 "comments": 40, 517 "url": "https://news.ycombinator.com/item?id=41215631", 518 "created_at": "2024-08-11T12:12:39Z" 519 }, 520 { 521 "hn_id": "42764969", 522 "title": "Evolving Deeper LLM Thinking", 523 "points": 12, 524 "comments": 0, 525 "url": "https://news.ycombinator.com/item?id=42764969", 526 "created_at": "2025-01-20T04:24:10Z" 527 }, 528 { 529 "hn_id": "45661775", 530 "title": "Measuring the Impact of Early-2025 AI on Experienced Developer Productivity", 531 "points": 4, 532 "comments": 2, 533 "url": "https://news.ycombinator.com/item?id=45661775", 534 "created_at": "2025-10-21T21:12:22Z" 535 }, 536 { 537 "hn_id": "37265952", 538 "title": "The AI Reproducibility Crisis", 539 "points": 4, 540 "comments": 3, 541 "url": "https://news.ycombinator.com/item?id=37265952", 542 "created_at": "2023-08-25T19:15:29Z" 543 }, 544 { 545 "hn_id": "45497568", 546 "title": "Fine-Tuning Small Language Models with Low-Rank Adapters to Mimic User Behaviors", 547 "points": 3, 548 "comments": 0, 549 "url": "https://news.ycombinator.com/item?id=45497568", 550 "created_at": "2025-10-06T23:40:50Z" 551 }, 552 { 553 "hn_id": "45249175", 554 "title": "What do the fundamental constants of physics tell us about life?", 555 "points": 3, 556 "comments": 0, 557 "url": "https://news.ycombinator.com/item?id=45249175", 558 "created_at": "2025-09-15T13:02:16Z" 559 }, 560 { 561 "hn_id": "44593569", 562 "title": "Measuring the Impact of Early-2025 AI on Experienced Developer Productivity", 563 "points": 2, 564 "comments": 0, 565 "url": 
"https://news.ycombinator.com/item?id=44593569", 566 "created_at": "2025-07-17T14:03:27Z" 567 }, 568 { 569 "hn_id": "44783441", 570 "title": "Measuring the Impact of AI on Experienced Open-Source Developer Productivity", 571 "points": 1, 572 "comments": 1, 573 "url": "https://news.ycombinator.com/item?id=44783441", 574 "created_at": "2025-08-04T09:03:06Z" 575 }, 576 { 577 "hn_id": "46254932", 578 "title": "Measuring Impact of Early-2025 AI on Experienced Open-Source Dev Productivity", 579 "points": 1, 580 "comments": 0, 581 "url": "https://news.ycombinator.com/item?id=46254932", 582 "created_at": "2025-12-13T14:54:29Z" 583 } 584 ], 585 "top_points": 289, 586 "total_points": 361, 587 "total_comments": 224 588 } 589 }