scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31260B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
      6     "authors": [
      7       "Elise Paradis",
      8       "Kate Grey",
      9       "Quinn Madison",
     10       "Daye Nam",
     11       "Andrew Macvean",
     12       "Vahid Meimand",
     13       "Nan Zhang",
     14       "Ben Ferrari-Church",
     15       "Satish Chandra"
     16     ],
     17     "year": 2024,
     18     "venue": "2025 IEEE/ACM 47th International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP)",
     19     "arxiv_id": "2410.12944",
     20     "doi": "10.1109/ICSE-SEIP66354.2025.00060"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Abstract claims of ~21% speed gain and large CI are supported by regression results (β1=-0.24, 95%CI=[-0.51,0.03]); the 'interesting effect' for coding-hours is appropriately hedged without claiming significance.",
     28         "source": "haiku"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The RCT design with random assignment to AI/no-AI conditions directly supports causal inference; the paper explicitly notes RCTs are 'the empirical standard for establishing causal links.'",
     34         "source": "haiku"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper explicitly states 'we cannot assume that the effect size obtained in our lab study will necessarily apply more broadly' and limits findings to Google developers using proprietary internal tools in summer 2024.",
     40         "source": "haiku"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The discussion explicitly addresses why the 21% estimate differs from Peng et al.'s 56%—attributing it to population differences (Upwork vs Googlers) and tool differences—and offers four alternative explanations for the null interaction with AI usage frequency.",
     46         "source": "haiku"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper carefully distinguishes 'time spent on task' from broader 'developer productivity,' references the SPACE framework to acknowledge multidimensionality, and explicitly states time on task is one dimension—not a complete proxy for productivity.",
     52         "source": "haiku"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section V 'LIMITATIONS' is a dedicated full-column section with multiple distinct limitations discussed.",
     60         "source": "haiku"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Specific threats include: sample size adequate for main effect but underpowered for interaction effects; all participants are Googlers limiting generalizability; 3-hour task cap limits applicability to longer real-world tasks; and inability to eliminate self-evaluation bias when researchers evaluate their own tools.",
     66         "source": "haiku"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Explicit scope boundaries: findings apply to internal Google tools in summer 2024 specifically, may not translate across tools over time, and do not cover code quality (explicitly excluded: 'questions about the impact of AI on code quality were not explored').",
     72         "source": "haiku"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No formal funding disclosure section is present; the Google affiliation is evident from author emails but there is no explicit statement that Google funded the research.",
     80         "source": "haiku"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All nine authors list @google.com email addresses and 'Google' as their affiliation; this is clearly and consistently disclosed.",
     86         "source": "haiku"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Google employees are evaluating Google's own production AI coding tools; the paper itself acknowledges 'we cannot fully eliminate the risk associated with us evaluating our own team's developer tools.'",
     92         "source": "haiku"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing-interests statement or declaration of patents, equity, or consulting arrangements is included anywhere in the paper.",
     98         "source": "haiku"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Time on task is precisely defined (time in Cider V study workspace from creation to last interaction); the three AI features are described with screenshots; 'enterprise-grade task' is operationalized through 10 files, 474 lines of code, and multi-component requirements.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper explicitly states it contributes 'an estimate of the impact of three AI features on the time developers spent on a complex, enterprise-grade task' and positions this as filling a gap in enterprise-context studies.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper directly compares to Peng et al. (56% RCT estimate) and Cui et al. (26% field experiment), explains the methodological differences driving divergent estimates, and situates itself within the SPACE framework for developer productivity.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "No code released; analysis scripts for t-tests and regressions are not shared. Task instructions are 'available upon request' (footnote 1) but not published.",
    129           "source": "haiku"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Raw participant telemetry and survey data are not released; this is enterprise human-subjects data from Google's proprietary systems.",
    135           "source": "haiku"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The study environment is Google's proprietary Cider V IDE on the Piper monorepo, which is inaccessible to external researchers; no reproducible environment specification is possible or provided.",
    141           "source": "haiku"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No step-by-step instructions for reproducing the analysis are provided; statistical methods are described but code is not shared.",
    147           "source": "haiku"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "95% CI is reported for the main AI effect: β1=-0.24, 95%CI=[-0.51, 0.03]; standard errors are reported for regression coefficients in Table III and for group means in Table II.",
    155           "source": "haiku"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Two-tailed Student t-test (t(83.6)=2.11, p=.038) and regression p-values are reported; Welch t-tests for covariate balance across conditions are also included.",
    161           "source": "haiku"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Effect sizes are calculated and explicitly reported in percentage terms using the formula (1−exp(β))×100, yielding 21% for the main AI effect and 32% for coding hours; the derivation is shown.",
    167           "source": "haiku"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper asserts the study was 'adequately powered to test our main effect' but provides no formal power analysis calculation (effect size assumed, alpha, power level) to support this claim.",
    173           "source": "haiku"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Standard deviations are reported for all variables in Table I, by condition in Table II, and standard errors for regression coefficients throughout Table III.",
    179           "source": "haiku"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The no-AI condition (N=46) is a clean control baseline; the 48/48 split and randomization make this a well-defined comparison.",
    187           "source": "haiku"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The control condition reflects the same enterprise task without AI features, conducted in summer 2024; this is a contemporary and appropriate baseline for the evaluation.",
    193           "source": "haiku"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "Three AI features (AI Code Completion, Smart Paste, Natural Language to Code) are bundled into a single experimental condition; no ablation separates individual feature contributions.",
    199           "source": "haiku"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Time on task is the sole outcome metric; code quality, developer satisfaction, and error rates are not measured as outcomes (questionnaire responses serve as covariates only, not outcomes).",
    205           "source": "haiku"
    206         },
    207         "human_evaluation": {
    208           "applies": false,
    209           "answer": false,
    210           "justification": "The study measures behavioral outcomes (task completion time) rather than having judges rate AI output quality; human evaluation of system outputs is not part of the design.",
    211           "source": "haiku"
    212         },
    213         "held_out_test_set": {
    214           "applies": false,
    215           "answer": false,
    216           "justification": "Not a prediction or machine learning task; the RCT design measures human behavioral outcomes, not model generalization.",
    217           "source": "haiku"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by developer subgroups through regression interaction models: seniority level, daily coding hours, and AI usage frequency each receive separate interaction analysis.",
    223           "source": "haiku"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The discussion addresses cases where AI may fail to help: junior developers who lack skills to leverage generated code, developers coding fewer hours daily, and the null results for H4 and H5 are discussed as informative failures.",
    229           "source": "haiku"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Hypotheses H4 and H5 are explicitly rejected (no significant interaction with seniority p=0.706 or AI usage frequency p=0.235), and H2 is 'partially supported' since the effect lost significance when controlling for covariates.",
    235           "source": "haiku"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The AI features are named and described with blog post references, but no specific underlying model versions, checkpoint dates, or model identifiers are provided for any of the three features.",
    243           "source": "haiku"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No system prompts or prompt templates for any of the three AI features are provided; task instructions are only available upon request from the authors.",
    249           "source": "haiku"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "No hyperparameters (temperature, top-p, context window, etc.) are reported for any of the three AI coding features.",
    255           "source": "haiku"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "The study evaluates production black-box AI features integrated into a commercial IDE; no custom agentic scaffolding was built for this study.",
    261           "source": "haiku"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "Log transformation of time on task is documented; variable coding decisions are explicit (e.g., dichotomization of AvgProgHrsDay at ≥5 hours, ordinal coding of DataLogExp 0-3, binary NbrHighFreqAreas indicator per area).",
    267           "source": "haiku"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "Raw telemetry and survey data are not released; this is proprietary Google enterprise data with human subjects constraints.",
    275           "source": "haiku"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Telemetry data collection is described: time starts when participant creates study-specific workspace in Cider V and ends at last IDE interaction; adjacent surfaces (code search, debugging tools) are included.",
    281           "source": "haiku"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Recruitment is described in detail: email invitation by a team independent of the research team, with explicit inclusion criteria (≥1 year at Google, C++ proficiency, Piper submissions, Cider V as main IDE, some task domain experience).",
    287           "source": "haiku"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The pipeline is described: telemetry collection → randomization after pre-task questionnaire → task completion → post-task questionnaire → t-tests and regressions; variable coding decisions are documented with formulas.",
    293           "source": "haiku"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "This is an RCT measuring developer behavior, not an evaluation of model capabilities on benchmarks; training data contamination is not relevant.",
    301           "source": "haiku"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable; the study uses a custom enterprise coding task and measures human behavioral outcomes, not model benchmark performance.",
    307           "source": "haiku"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "The study does not evaluate models on standard benchmarks; the task is a custom proprietary enterprise task administered in 2024.",
    313           "source": "haiku"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "No pre-registration on any registry (OSF, ClinicalTrials.gov, AEA) is mentioned despite this being a human subjects RCT.",
    321           "source": "haiku"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": true,
    325           "answer": false,
    326           "justification": "No IRB or ethics board approval is mentioned anywhere in the paper for this human subjects experiment with 96 employees.",
    327           "source": "haiku"
    328         },
    329         "demographics_reported": {
    330           "applies": true,
    331           "answer": false,
    332           "justification": "Professional covariates (seniority level, coding hours) are reported but standard participant demographics (age, gender, years of professional experience) are not reported.",
    333           "source": "haiku"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": true,
    337           "answer": true,
    338           "justification": "Explicit inclusion criteria are stated: ≥1 year at Google, C++ proficiency, submission of code to Piper, Cider V as main IDE, and some experience with the task domain (data logging infrastructure).",
    339           "source": "haiku"
    340         },
    341         "randomization_described": {
    342           "applies": true,
    343           "answer": true,
    344           "justification": "Randomization is described: participants assigned to AI or no-AI condition after completing pre-task questionnaire and tool training; covariate balance verification via Welch t-tests confirms successful randomization.",
    345           "source": "haiku"
    346         },
    347         "blinding_described": {
    348           "applies": true,
    349           "answer": false,
    350           "justification": "No blinding was implemented or described; participants explicitly enabled or disabled AI features as part of the study protocol, making blinding to condition impossible.",
    351           "source": "haiku"
    352         },
    353         "attrition_reported": {
    354           "applies": true,
    355           "answer": true,
    356           "justification": "Attrition is explicitly reported: '93 participants (96.9%) who started the task also completed it,' meaning 3 participants did not complete the task.",
    357           "source": "haiku"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": false,
    363           "answer": false,
    364           "justification": "This is a human behavioral study; inference cost of AI features is not the focus and would not be meaningful to report in this context.",
    365           "source": "haiku"
    366         },
    367         "compute_budget_stated": {
    368           "applies": false,
    369           "answer": false,
    370           "justification": "Not applicable; this is an RCT study, not a model training or large-scale compute experiment.",
    371           "source": "haiku"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "Developers who used AI features completed the enterprise task approximately 21% faster than those who did not, controlling for covariates.",
    379       "evidence": "Regression Model 2: β1=-0.24 (ExpCon), effect=(1-exp(-0.24))×100=21%, model p=0.011; t-test also significant (t(83.6)=2.11, p=.038).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The AI effect on speed is statistically significant in unadjusted analysis but loses significance when controlling for developer-level covariates.",
    384       "evidence": "H1 supported (t-test p=0.038); H2 'partially supported'—effect β=-0.24 but 95%CI=[-0.51, 0.03], p=0.086 in best-fit Model 2.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Developers who code more hours per day may benefit more from AI tools than those who code fewer hours.",
    389       "evidence": "Interaction term EC:APHD β=-0.29, Model 4 p=0.018 (model level), but interaction coefficient itself not significant; described as 'large and negative but not statistically significant.'",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "The 21% estimate is smaller than Peng et al.'s 56% estimate, likely due to population differences (Googlers vs Upwork freelancers) rather than tool differences.",
    394       "evidence": "Both studies are RCTs; the paper argues population difference (Upwork sample ~50% earning <$10k/year vs full-time Googlers) is the more likely explanation, though tool differences cannot be excluded.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Prior frequency of AI tool usage does not significantly predict speed gains when using AI in this task.",
    399       "evidence": "H5 rejected: interaction effect between experimental condition and NbrHighFreqAreas was negative but not significant (p=0.235).",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "Seniority level and daily coding hours are independent predictors of task speed, with senior developers 15% faster and high-coding-hours developers 32% faster.",
    404       "evidence": "Model 2: Level β=-0.16 (p<0.05), AvgProgHrsDay β=-0.38 (p<0.05).",
    405       "supported": "strong"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "rct"
    410   ],
    411   "key_findings": "This Google-internal RCT (n=96) found that three bundled AI coding features (AI Code Completion, Smart Paste, Natural Language to Code) reduced time on an enterprise-grade coding task by approximately 21%, with the main effect significant in unadjusted analysis (p=0.038) but losing statistical significance when controlling for developer-level covariates (p=0.086, 95%CI=[-0.51, 0.03]). The 21% estimate is substantially smaller than the often-cited 56% from Peng et al.'s GitHub Copilot study, with the authors attributing the difference to population composition (Googlers vs. Upwork freelancers). Developers who code more hours daily and more senior developers showed larger absolute speed advantages, though interaction effects with AI use were not statistically significant, suggesting the study was underpowered for subgroup analysis. No significant moderation by prior AI tool usage was found, and code quality was not assessed.",
    412   "red_flags": [
    413     {
    414       "flag": "Self-evaluation conflict of interest",
    415       "detail": "All nine authors are Google employees evaluating Google's own production AI tools; the paper acknowledges this but notes it was unavoidable due to IP constraints—the tools are only accessible to Google employees."
    416     },
    417     {
    418       "flag": "No pre-registration",
    419       "detail": "This is a human subjects RCT but no pre-registration is mentioned, leaving the hypothesis testing framework susceptible to post-hoc adjustment (though hypotheses are stated before results)."
    420     },
    421     {
    422       "flag": "No IRB or ethics disclosure",
    423       "detail": "96 Google employees participated as human subjects with no mention of IRB approval, ethics committee review, or informed consent procedures."
    424     },
    425     {
    426       "flag": "Bundled intervention",
    427       "detail": "Three AI features are combined into a single experimental condition; the contribution of each individual feature cannot be assessed, limiting understanding of which tools drive the effect."
    428     },
    429     {
    430       "flag": "Main claim loses significance when adjusting for covariates",
    431       "detail": "H2—the more rigorous test—is only 'partially supported': the AI effect is β=-0.24 with 95%CI=[-0.51, 0.03] and p=0.086, yet the abstract and conclusion foreground the significant unadjusted result."
    432     },
    433     {
    434       "flag": "No code quality measurement",
    435       "detail": "Task completion time is the sole outcome; code quality, correctness beyond 'tests pass,' and maintainability are not assessed despite being central to developer productivity concerns."
    436     },
    437     {
    438       "flag": "No power analysis calculation",
    439       "detail": "The paper asserts the study was 'adequately powered' for the main effect but provides no formal power analysis with assumed effect size, alpha, and power level to substantiate this claim."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    445       "relevance": "Direct RCT comparison study; found 56% speed gain vs this paper's 21%—central to the discussion of effect size differences."
    446     },
    447     {
    448       "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers",
    449       "relevance": "Large pooled enterprise field experiment (n=4,867) finding 26% throughput increase with Copilot; the closest comparable enterprise-scale estimate."
    450     },
    451     {
    452       "title": "Productivity assessment of neural code completion",
    453       "relevance": "Telemetry-based study of GitHub Copilot suggestion acceptance rates and perceived productivity—key prior work on measuring AI coding tool benefit."
    454     },
    455     {
    456       "title": "Using an LLM to help with code understanding",
    457       "relevance": "Prior work from overlapping authors (Nam, Macvean) on LLM-assisted code comprehension tasks; cited for findings on coding experience moderating AI benefit."
    458     },
    459     {
    460       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    461       "relevance": "Proposes modeling user behavior with AI coding tools; cited as a promising path for understanding expertise-speed interaction mechanisms."
    462     },
    463     {
    464       "title": "The SPACE of developer productivity: There's more to it than you think",
    465       "relevance": "Framework paper that motivates multi-dimensional view of developer productivity beyond time on task; used to situate the study's single-metric scope."
    466     },
    467     {
    468       "title": "What predicts software developers' productivity?",
    469       "relevance": "Empirical study of diverse factors affecting developer productivity; informed the theoretical framework of control variables."
    470     },
    471     {
    472       "title": "Randomized controlled trial for Microsoft Security Copilot",
    473       "relevance": "Comparable RCT design for AI coding tools in enterprise setting; cited for evidence that beginners may benefit more than experienced developers."
    474     },
    475     {
    476       "title": "Coding on Copilot: 2023 data suggests downward pressure on code quality",
    477       "relevance": "Industry report raising concerns that AI tools reduce code quality at ecosystem level; cited in conclusion as motivation for quality-focused future research."
    478     },
    479     {
    480       "title": "Significant productivity gains through programming with large language models",
    481       "relevance": "HCI study finding significant productivity gains; cited as recent evidence from the growing literature on LLM productivity impact."
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 3,
    487       "justification": "Directly measures enterprise developer speed gain from production AI tools using a real enterprise task—actionable for product teams and engineering managers."
    488     },
    489     "surprise_contrarian": {
    490       "score": 2,
    491       "justification": "The 21% estimate challenges the widely cited 56% figure; the suggestion that more experienced/senior developers benefit more (the interaction effects were not statistically significant) contradicts the popular narrative that AI helps beginners most."
    492     },
    493     "fear_safety": {
    494       "score": 1,
    495       "justification": "Briefly raises deskilling concerns and equity questions (who benefits from AI), but these are peripheral to the main results."
    496     },
    497     "drama_conflict": {
    498       "score": 2,
    499       "justification": "Google researchers openly acknowledge evaluating their own tools with a conflict of interest; the smaller-than-expected effect size and the only partially supported covariate-adjusted hypothesis (H2) create tension with Google's bullish stance on AI coding tools."
    500     },
    501     "demo_ability": {
    502       "score": 1,
    503       "justification": "The tools are Google-internal (Cider V, Piper) and inaccessible externally; external developers cannot replicate or directly try the evaluated tools."
    504     },
    505     "brand_recognition": {
    506       "score": 3,
    507       "justification": "All authors are from Google; the study evaluates Google's own internal AI coding infrastructure, giving it immediate brand recognition in the developer tools space."
    508     }
    509   },
    510   "hn_data": {
    511     "threads": [
    512       {
    513         "hn_id": "39164950",
    514         "title": "Lumiere: A Space-Time Diffusion Model for Video Generation",
    515         "points": 17,
    516         "comments": 1,
    517         "url": "https://news.ycombinator.com/item?id=39164950",
    518         "created_at": "2024-01-28T12:14:51Z"
    519       },
    520       {
    521         "hn_id": "42054102",
    522         "title": "TextLap: Customizing Language Models for Text-to-Layout Planning",
    523         "points": 8,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=42054102",
    526         "created_at": "2024-11-05T18:41:30Z"
    527       },
    528       {
    529         "hn_id": "39390245",
    530         "title": "Lumiere: A Space-Time Diffusion Model for Video Generation",
    531         "points": 5,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=39390245",
    534         "created_at": "2024-02-15T22:40:42Z"
    535       },
    536       {
    537         "hn_id": "38036218",
    538         "title": "Zephyr 7B",
    539         "points": 4,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=38036218",
    542         "created_at": "2023-10-27T09:06:34Z"
    543       },
    544       {
    545         "hn_id": "45255604",
    546         "title": "High-Dimensional Statistics",
    547         "points": 3,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=45255604",
    550         "created_at": "2025-09-15T22:13:10Z"
    551       },
    552       {
    553         "hn_id": "42306347",
    554         "title": "Auto-RAG",
    555         "points": 3,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=42306347",
    558         "created_at": "2024-12-03T14:25:36Z"
    559       },
    560       {
    561         "hn_id": "39158651",
    562         "title": "Lumiere: A Space-Time Diffusion Model for Video Generation",
    563         "points": 3,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=39158651",
    566         "created_at": "2024-01-27T18:59:57Z"
    567       },
    568       {
    569         "hn_id": "39128778",
    570         "title": "Meta Prompting by OpenAI and Mirac Suzgun Stanford",
    571         "points": 3,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=39128778",
    574         "created_at": "2024-01-25T12:23:06Z"
    575       },
    576       {
    577         "hn_id": "44299612",
    578         "title": "Developing RAG Based LLM Systems from PDFs: An Experience Report (2024)",
    579         "points": 2,
    580         "comments": 1,
    581         "url": "https://news.ycombinator.com/item?id=44299612",
    582         "created_at": "2025-06-17T14:22:29Z"
    583       },
    584       {
    585         "hn_id": "41890489",
    586         "title": "How much does AI impact development speed?",
    587         "points": 1,
    588         "comments": 0,
    589         "url": "https://news.ycombinator.com/item?id=41890489",
    590         "created_at": "2024-10-19T20:22:43Z"
    591       }
    592     ],
    593     "top_points": 17,
    594     "total_points": 49,
    595     "total_comments": 2
    596   }
    597 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs