scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (27517B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evolving with AI: A Longitudinal Analysis of Developer Logs",
      6     "authors": [
      7       "Agnia Sergeyuk",
      8       "Eric Huang",
      9       "Dariia Karaeva",
     10       "Anastasiia Serova",
     11       "Yaroslav Golubev",
     12       "Iftekhar Ahmed"
     13     ],
     14     "year": 2026,
     15     "venue": "ICSE 2026 / arXiv.org",
     16     "arxiv_id": "2601.10258",
     17     "doi": "10.1145/3744916.3787811"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims that 'AI users produce substantially more code but also delete significantly more' and 'survey respondents report productivity gains and perceive minimal changes in other dimensions' are both directly supported by Section 4 results.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper's framing implies AI causes workflow changes (title: 'Evolving with AI', RQs ask 'How did the introduction of AI tools influence...'), but the study is observational with self-selected groups. Section 6 acknowledges 'These interpretations cannot be fully disentangled without experimental assignment to conditions,' but the overall framing still implies causation from an observational design.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 6 (External validity) explicitly states findings may not generalize beyond JetBrains IDEs and the specific AI assistant. They also acknowledge the AI user group may represent early adopters whose behaviors don't fully generalize.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 6 (Internal validity) substantively discusses self-selection bias: 'It is likely that they differ not only in AI usage but also in baseline motivation, experience, or task profiles.' They also discuss how the higher activity of AI users could reflect pre-existing characteristics rather than AI influence.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly frames each metric as a 'proxy' (e.g., 'As a proxy for productivity, we counted the number of typed characters') and acknowledges limitations: 'it captures only one specific facet of productivity — namely, code authoring — and does not account for other important activities such as debugging, design, or collaboration.' Section 3.2.1.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 'Threats to Validity' provides substantial discussion organized by construct validity, internal validity, and external validity.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Threats are specific to this study: developers using other AI tools may be misclassified as non-users, AI users may differ in baseline motivation/experience, telemetry from a single IDE vendor, and the proxy metrics capture only specific facets of each construct.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states what results do not show: 'our findings may not generalize to all development environments or interface paradigms,' they cannot 'isolate causal effects,' and the distinction between groups is 'descriptive rather than explanatory.'",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding acknowledgments section is present. Three of six authors are affiliated with JetBrains (the company whose product is studied), but no explicit funding disclosure is made.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations with JetBrains and JetBrains Research are clearly listed in the header. The collaboration with JetBrains is stated in Section 1.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The study was conducted 'in collaboration with JetBrains,' three authors are JetBrains employees, and the study evaluates JetBrains AI Assistant. JetBrains has a commercial interest in demonstrating the value of their AI tools.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "AI users and AI non-users are precisely defined (consistent monthly JetBrains AI Assistant interaction from April–October 2024 vs. no interaction over two years), and each proxy metric is operationally defined with specific telemetry actions.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly states two contributions: empirical characterization of evolving AI-assisted workflows and a reframing of AI's impact on effort distribution rather than reduction.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 extensively reviews prior telemetry studies (Minelli, Amann, Damevski) and prior AI-impact studies (Peng, Vaithilingam, Mozannar), explicitly positioning the work as addressing the gap in longitudinal, behavioral evidence.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "Supplementary materials on Zenodo [35] include survey questionnaire, anonymized responses, interview script, and statistical analysis outputs, but no analysis source code or scripts are mentioned as released.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Anonymized survey responses and complete statistical analysis outputs are publicly available via Zenodo [35]. Raw telemetry cannot be released due to confidentiality agreements, but aggregated data is provided.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, dependency files, or software version details are provided for reproducing the statistical analysis.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided. The statistical methodology is described but there are no runnable scripts or README with commands.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "Figures 1-5 show shaded regions representing ±1 standard deviation from the monthly mean for all telemetry metrics.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Mixed-effects linear models are used with p-values and a threshold of p < 0.05. Kolmogorov-Smirnov and Bartlett's tests are used to validate model assumptions. Multiple results report statistical significance (e.g., p < 0.001 for debugging, p = 0.03 for pastes).",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Effect magnitudes are reported as regression coefficients with context, e.g., typed characters: +587/month for AI users vs +75 for non-users; deletions: +102 vs +7.6 per month; IDE window activations: +6.4 vs -7.6 per month.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No power analysis or justification for why 400 devices per group or 62 survey respondents were chosen. The sample sizes appear to be convenience-based.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Standard deviation bands are shown in all telemetry figures (±1 SD shaded regions), and the mixed-effects model accounts for inter-device variability via random intercepts.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "400 AI non-users serve as a comparison group against 400 AI users, with trends analyzed for both groups over the same two-year period.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Both groups are drawn from the same time period (Oct 2022 - Oct 2024) and the same IDE platforms, making the comparison contemporary.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is an observational study of developer behavior, not a system with components to ablate.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Five distinct telemetry metrics are used: typed characters, debugging instances, deletions, external pastes, and IDE window activations, plus seven Likert-scale survey questions.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "The study includes a 62-person survey capturing developer perceptions across all five dimensions, plus five semi-structured interviews providing qualitative context.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "This is an observational study analyzing real-world behavior, not a prediction task requiring train/test splits.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down across five dimensions (productivity, code quality, code editing, code reuse, context switching), with separate analysis for AI users vs non-users in each.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Negative experiences are discussed, including P33's testimony about time wasted crafting prompts, P39's preference for taking responsibility over AI-generated code, and the finding that context switching increased for AI users.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Several negative findings are reported: AI users delete significantly more code, context switching increases for AI users while decreasing for non-users, and a perception-behavior gap is documented where developers don't notice workflow changes.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": false,
    238           "answer": false,
    239           "justification": "This study does not evaluate any AI model's capability — it observes developer behavior. The AI assistant version is not specified, but the paper is studying user behavior, not model performance.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": false,
    244           "answer": false,
    245           "justification": "The paper does not use prompting as a research methodology.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "The mixed-effects linear model formula is explicitly provided (count_action ~ group × n_month + (1|id_device)), with p-value threshold of 0.05 and full statistical outputs available in supplementary materials.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used in this study.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3.2.2 describes the data processing pipeline: monthly aggregation of timestamped action records per device, zero-filling for missing actions, and the sample selection criteria (activity in Oct 2022 and Oct 2024). 'No extensive filtering was applied to the data.'",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw IDE telemetry logs cannot be released due to confidentiality agreements. Anonymized survey responses and aggregated statistical outputs are available, but the primary dataset (151M events) cannot be independently verified.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.2.1 describes telemetry collection (device IDs, event names, timestamps, metadata from IntelliJ IDEA, PyCharm, PhpStorm, WebStorm). Section 3.1.1 describes survey distribution (1,231 invitation emails to consented participant panel).",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Survey recruitment is described: participants from an internal JetBrains panel who previously consented to research contact and self-identified as AI tool users. Interview participants selected for diversity across experience, geography, satisfaction, and usage patterns.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline is documented: telemetry logs → monthly aggregation per device → zero-filling → mixed-effects regression. Survey: 1,231 emails → 76 clicks → 67 responses → 62 after excluding non-AI users. Section 3.2.2.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "This study does not evaluate a pre-trained model's capability on any benchmark. It is an observational study of developer behavior.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "No model benchmark evaluation is performed.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "No model benchmark evaluation is performed.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the survey or the overall study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "The paper states the study was 'conducted in line with our institution's ethical standards, adhering to the values and guidelines outlined in the ICC/ESOMAR International Code' — a marketing research ethics code, not IRB/ethics board approval.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Section 3.1.1 reports roles (56 developers, 16 team leads, 10 architects, 10 DevOps), experience levels (1-2 years to 16+ years), AI tools used, and usage duration.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Survey: self-identified AI tool users from JetBrains participant panel who consented to recontact. Telemetry: devices with activity in Oct 2022 and Oct 2024; AI users required consistent monthly JetBrains AI Assistant use from April-October 2024.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "This is an observational study, not an experimental study with random assignment to conditions.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "This is an observational study; blinding is not applicable.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": true,
    353           "justification": "Survey attrition is reported: 1,231 emails sent → 76 clicks → 67 complete responses → 62 retained after 5 respondents were excluded for not having used AI assistance.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": false,
    360           "answer": false,
    361           "justification": "This is an observational study of developer behavior, not a system or method with inference costs.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": false,
    366           "answer": false,
    367           "justification": "This is an observational study; computational budget is not a relevant concern.",
    368           "source": "opus"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "AI users type significantly more code over time than non-users (+587 vs +75 characters/month).",
    376       "evidence": "Mixed-effects linear model on 151M logged events across 800 devices over 24 months.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "AI users delete code at a significantly faster rate than non-users (+102 vs +7.6 deletions/month).",
    381       "evidence": "Same telemetry analysis; statistically significant interaction effect between group and time.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "82.3% of surveyed developers perceive productivity gains from AI tools, while perceptions of changes in code quality, editing, reuse, and context switching are largely neutral.",
    386       "evidence": "Survey of 62 professional developers with Likert-scale responses; descriptive statistics reported.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "AI users show increased context switching (IDE window activations) while non-users show decreased context switching (+6.4 vs -7.6 activations/month).",
    391       "evidence": "Telemetry trend analysis with mixed-effects model; statistically significant interaction effect.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "There is a systematic disconnect between what developers perceive about their workflow and what behavioral telemetry records.",
    396       "evidence": "Comparison of survey responses (half reporting no change in editing, quality, reuse, context switching) against telemetry showing significant increases in all these dimensions.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "AI redistributes developer effort rather than reducing it, with increased code production accompanied by increased deletion.",
    401       "evidence": "Simultaneous statistically significant increase in both typing and deletions for AI users, with no corresponding increase in debugging sessions.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "observational",
    407     "qualitative"
    408   ],
    409   "key_findings": "Over a two-year longitudinal study of telemetry from 800 devices running JetBrains IDEs (400 AI users and 400 non-users), AI users wrote and deleted substantially more code than non-users (typed characters: +587 vs +75/month; deletions: +102 vs +7.6/month), and showed increased context switching contrary to the expected benefit of in-IDE AI support. A parallel survey of 62 professionals revealed strong perceived productivity gains but largely neutral perceptions of changes in code quality, editing habits, reuse, and context switching — a systematic disconnect with what behavioral telemetry records. The findings suggest AI redistributes and restructures development effort rather than reducing it, reshaping workflow in ways that often elude developers' conscious awareness.",
    410   "red_flags": [
    411     {
    412       "flag": "Funder conflict of interest",
    413       "detail": "JetBrains employees evaluate the JetBrains AI Assistant using JetBrains-provided proprietary data with no external oversight or independent replication possible."
    414     },
    415     {
    416       "flag": "Self-selection confound not resolved",
    417       "detail": "AI users self-selected into AI tool adoption; the paper acknowledges early adopters may be inherently more active developers, but cannot disentangle this from AI's effect — cross-group comparisons are treated descriptively but framed causally throughout."
    418     },
    419     {
    420       "flag": "Raw data unavailable",
    421       "detail": "Core telemetry data (151M events) cannot be released due to confidentiality agreements, making independent replication impossible despite the study's empirical claims."
    422     },
    423     {
    424       "flag": "Very small survey n",
    425       "detail": "Only 62 survey respondents, recruited from JetBrains' own internal panel of participants who had previously consented to research contact — a non-representative, potentially self-selected sample of AI-positive developers."
    426     },
    427     {
    428       "flag": "Narrow productivity proxy",
    429       "detail": "'Typed characters' as a proxy for productivity explicitly excludes debugging, design, collaboration, and review — yet the productivity framing in the abstract and results is broad."
    430     },
    431     {
    432       "flag": "No pre-registration",
    433       "detail": "A five-RQ mixed-methods study with JetBrains data was not pre-registered, leaving hypotheses open to post-hoc selection."
    434     },
    435     {
    436       "flag": "AI tool version unspecified",
    437       "detail": "The underlying LLM powering JetBrains AI Assistant during the study period is not identified, making it impossible to assess what capabilities developers were actually using."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    443       "relevance": "Key prior RCT on AI coding assistant productivity used as direct comparison point"
    444     },
    445     {
    446       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    447       "relevance": "Prior work finding developers spend >50% of time evaluating AI output — directly cited to corroborate rework findings"
    448     },
    449     {
    450       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    451       "relevance": "Shows Copilot did not consistently reduce task time — cited as contrasting evidence for the mixed findings on productivity"
    452     },
    453     {
    454       "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges",
    455       "relevance": "Large-scale survey on AI coding tool use; cited multiple times for developer perception data"
    456     },
    457     {
    458       "title": "Productivity assessment of neural code completion",
    459       "relevance": "Documents disconnect between perceived and actual productivity gains — central to the paper's main thesis"
    460     },
    461     {
    462       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    463       "relevance": "Finds AI assistants actually increase task completion time by 19% — directly contradicts perceived productivity gains"
    464     },
    465     {
    466       "title": "The SPACE of developer productivity: There's more to it than you think",
    467       "relevance": "Framework for understanding multidimensional developer productivity — cited to contextualize typed characters as a proxy"
    468     },
    469     {
    470       "title": "Are large language models a threat to digital public goods? Evidence from activity on Stack Overflow",
    471       "relevance": "Provides ecosystem-level evidence of behavioral change (StackOverflow decline) attributed to AI adoption"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Directly addresses how AI coding assistants change real professional workflows using two years of actual IDE telemetry — immediately actionable for tool designers and development managers."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "The finding that AI increases context switching (not reduces it as promised) and that developers systematically misperceive their own workflow changes challenges conventional AI productivity narratives."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns raised; the study focuses on workflow efficiency and perception gaps."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Mild tension between AI productivity promises and the finding that effort is redistributed rather than reduced, but framed constructively rather than polemically."
    490     },
    491     "demo_ability": {
    492       "score": 1,
    493       "justification": "Readers can observe their own AI usage patterns but cannot access or reproduce the telemetry study itself."
    494     },
    495     "brand_recognition": {
    496       "score": 2,
    497       "justification": "JetBrains is a well-known developer tools company; ICSE is the top software engineering conference, lending credibility and recognition."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "46676395",
    504         "title": "Too Helpful to Be Safe: User-Mediated Attacks on Planning and Web-Use Agents",
    505         "points": 4,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=46676395",
    508         "created_at": "2026-01-19T08:39:39Z"
    509       }
    510     ],
    511     "top_points": 4,
    512     "total_points": 4,
    513     "total_comments": 0
    514   }
    515 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs