scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23712B)
      1 {
      2   "paper": {
      3     "title": "Evolving with AI: A Longitudinal Analysis of Developer Logs",
      4     "authors": [
      5       "Agnia Sergeyuk",
      6       "Eric Huang",
      7       "Dariia Karaeva",
      8       "Anastasiia Serova",
      9       "Yaroslav Golubev",
     10       "Iftekhar Ahmed"
     11     ],
     12     "year": 2026,
     13     "venue": "ICSE '26",
     14     "arxiv_id": "2601.10258",
     15     "doi": "10.1145/3744916.3787811"
     16   },
     17   "scan_version": 2,
     18   "active_modules": [],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Supplementary materials on Zenodo [35] include survey questionnaire, anonymized responses, interview script, and statistical analysis outputs, but no analysis source code or scripts are mentioned as released."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Anonymized survey responses and complete statistical analysis outputs are publicly available via Zenodo [35]. Raw telemetry cannot be released due to confidentiality agreements, but aggregated data is provided."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, dependency files, or software version details are provided for reproducing the statistical analysis."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. The statistical methodology is described but there are no runnable scripts or README with commands."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Figures 1-5 show shaded regions representing ±1 standard deviation from the monthly mean for all telemetry metrics."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Mixed-effects linear models are used with p-values and a threshold of p < 0.05. Kolmogorov-Smirnov and Bartlett's tests are used to validate model assumptions. Multiple results report statistical significance (e.g., p < 0.001 for debugging, p = 0.03 for pastes)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Effect magnitudes are reported as regression coefficients with context: e.g., AI users +587 typed characters/month vs AI non-users +75, +102 deletions/month vs +7.6, +6.4 IDE activations vs -7.6."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or justification for why 400 devices per group or 62 survey respondents were chosen. The sample sizes appear to be convenience-based."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Standard deviation bands are shown in all telemetry figures (±1 SD shaded regions), and the mixed-effects model accounts for inter-device variability via random intercepts."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "400 AI non-users serve as a comparison group against 400 AI users, with trends analyzed for both groups over the same two-year period."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both groups are drawn from the same time period (Oct 2022 - Oct 2024) and the same IDE platforms, making the comparison contemporary."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an observational study of developer behavior, not a system with components to ablate."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Five distinct telemetry metrics are used: typed characters, debugging instances, deletions, external pastes, and IDE window activations, plus seven Likert-scale survey questions."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The study includes a 62-person survey capturing developer perceptions across all five dimensions, plus five semi-structured interviews providing qualitative context."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "This is an observational study analyzing real-world behavior, not a prediction task requiring train/test splits."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down across five dimensions (productivity, code quality, code editing, code reuse, context switching), with separate analysis for AI users vs non-users in each."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative experiences are discussed, including P33's testimony about time wasted crafting prompts, P39's preference for taking responsibility over AI-generated code, and the finding that context switching increased for AI users."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Several negative findings are reported: AI users delete significantly more code, context switching increases for AI users while decreasing for non-users, and a perception-behavior gap is documented where developers don't notice workflow changes."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims that 'AI users produce substantially more code but also delete significantly more' and 'survey respondents report productivity gains and perceive minimal changes in other dimensions' are both directly supported by Section 4 results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper's framing implies AI causes workflow changes (title: 'Evolving with AI', RQs ask 'How did the introduction of AI tools influence...'), but the study is observational with self-selected groups. Section 6 acknowledges 'These interpretations cannot be fully disentangled without experimental assignment to conditions,' but the overall framing still implies causation from an observational design."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 (External validity) explicitly states findings may not generalize beyond JetBrains IDEs and the specific AI assistant. They also acknowledge the AI user group may represent early adopters whose behaviors don't fully generalize."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 6 (Internal validity) substantively discusses self-selection bias: 'It is likely that they differ not only in AI usage but also in baseline motivation, experience, or task profiles.' They also discuss how the higher activity of AI users could reflect pre-existing characteristics rather than AI influence."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper explicitly frames each metric as a 'proxy' (e.g., 'As a proxy for productivity, we counted the number of typed characters') and acknowledges limitations: 'it captures only one specific facet of productivity — namely, code authoring — and does not account for other important activities such as debugging, design, or collaboration.' Section 3.2.1."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This study does not evaluate any AI model's capability — it observes developer behavior. The AI assistant version is not specified, but the paper is studying user behavior, not model performance."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper does not use prompting as a research methodology."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The mixed-effects linear model formula is explicitly provided (count_action ~ group × n_month + (1|id_device)), with p-value threshold of 0.05 and full statistical outputs available in supplementary materials."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used in this study."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 3.2.2 describes the data processing pipeline: monthly aggregation of timestamped action records per device, zero-filling for missing actions, and the sample selection criteria (activity in Oct 2022 and Oct 2024). 'No extensive filtering was applied to the data.'"
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 'Threats to Validity' provides substantial discussion organized by construct validity, internal validity, and external validity."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Threats are specific to this study: developers using other AI tools may be misclassified as non-users, AI users may differ in baseline motivation/experience, telemetry from a single IDE vendor, and the proxy metrics capture only specific facets of each construct."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper explicitly states what results do not show: 'our findings may not generalize to all development environments or interface paradigms,' they cannot 'isolate causal effects,' and the distinction between groups is 'descriptive rather than explanatory.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Raw IDE telemetry logs cannot be released due to confidentiality agreements. Anonymized survey responses and aggregated statistical outputs are available, but the primary dataset (151M events) cannot be independently verified."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.2.1 describes telemetry collection (device IDs, event names, timestamps, metadata from IntelliJ IDEA, PyCharm, PhpStorm, WebStorm). Section 3.1.1 describes survey distribution (1,231 invitation emails to consented participant panel)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Survey recruitment is described: participants from an internal JetBrains panel who previously consented to research contact and self-identified as AI tool users. Interview participants selected for diversity across experience, geography, satisfaction, and usage patterns."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Both pipelines are documented. Telemetry (Section 3.2.2): telemetry logs → monthly aggregation per device → zero-filling → mixed-effects regression. Survey (Section 3.1.1): 1,231 emails → 76 clicks → 67 responses → 62 after excluding non-AI users."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding acknowledgments section is present. Three of six authors are affiliated with JetBrains (the company whose product is studied), but no explicit funding disclosure is made."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations with JetBrains and JetBrains Research are clearly listed in the header. The collaboration with JetBrains is stated in Section 1."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "The study was conducted 'in collaboration with JetBrains,' three authors are JetBrains employees, and the study evaluates JetBrains AI Assistant. JetBrains has a commercial interest in demonstrating the value of their AI tools."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This study does not evaluate a pre-trained model's capability on any benchmark. It is an observational study of developer behavior."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No model benchmark evaluation is performed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No model benchmark evaluation is performed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No pre-registration is mentioned for the survey or the overall study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The paper states the study was 'conducted in line with our institution's ethical standards, adhering to the values and guidelines outlined in the ICC/ESOMAR International Code' — a marketing research ethics code, not IRB/ethics board approval."
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 3.1.1 reports roles (56 developers, 16 team leads, 10 architects, 10 DevOps), experience levels (1-2 years to 16+ years), AI tools used, and usage duration."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Survey: self-identified AI tool users from the JetBrains participant panel who consented to recontact. Telemetry: devices with activity in Oct 2022 and Oct 2024; AI users additionally had to show consistent monthly JetBrains AI Assistant use from April through October 2024."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "This is an observational study, not an experimental study with random assignment to conditions."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This is an observational study; blinding is not applicable."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Survey attrition is reported: 1,231 emails sent → 76 clicks → 67 complete responses → 62 after excluding non-AI users. 5 were excluded for not having used AI assistance."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is an observational study of developer behavior, not a system or method with inference costs."
    290       },
    291       "compute_budget_stated": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "This is an observational study; computational budget is not a relevant concern."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "AI users produce substantially more code, with typed characters increasing at +587 characters/month vs +75 for non-users.",
    301       "evidence": "Mixed-effects linear model on 24 months of telemetry from 800 devices. Both trends statistically significant, interaction effect significant. Figure 1b, Section 4.1.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "AI users delete significantly more code (+102 deletions/month vs +7.6 for non-users).",
    306       "evidence": "Mixed-effects model on deletion events across the same 800 devices over 24 months. Both trends significant. Figure 3b, Section 4.3.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "82.3% of survey respondents perceive productivity gains from AI tools.",
    311       "evidence": "62 survey responses on 5-point Likert scale. Figure 1a, Section 4.1.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Developers perceive minimal changes in code quality, editing, reuse, and context switching, despite measurable telemetry differences.",
    316       "evidence": "Survey results show ~50% 'no change' for code quality, editing, reuse, and context switching (Figures 2a-5a), while telemetry shows significant trends for AI users in deletions, pastes, and IDE activations. Table 1, Section 5.2.",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "Context switching increases for AI users (+6.4 IDE activations/month) while decreasing for non-users (-7.6/month).",
    321       "evidence": "Mixed-effects model on IDE window activation events. Both trends statistically significant with opposing directions. Figure 5b, Section 4.5.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "AI does not uniformly reduce developer effort; it redistributes effort across more fragmented workflows.",
    326       "evidence": "Synthesis of all five dimensions: more typing + more deletion + more context switching = redistributed, not reduced effort. Section 5.1-5.2.",
    327       "supported": "moderate"
    328     }
    329   ],
    330   "methodology_tags": [
    331     "observational",
    332     "qualitative"
    333   ],
    334   "key_findings": "A two-year longitudinal study of IDE telemetry from 800 devices combined with a 62-person survey reveals that AI tool adoption is associated with substantially increased code output (+587 chars/month vs +75 for non-users) but also significantly more deletions (+102/month vs +7.6). Developers perceive strong productivity gains (82.3%) but report minimal changes in other dimensions, despite telemetry showing increased context switching and external paste activity among AI users. The perception-behavior gap suggests AI silently restructures development workflows in ways developers don't consciously recognize.",
    335   "red_flags": [
    336     {
    337       "flag": "Company evaluating its own product",
    338       "detail": "Three of six authors are JetBrains employees, and the study evaluates the impact of JetBrains AI Assistant using JetBrains telemetry data. While the paper acknowledges this in threats to validity, no formal conflict-of-interest statement is included."
    339     },
    340     {
    341       "flag": "Self-selection bias in group comparison",
    342       "detail": "AI users and non-users are self-selected, not randomly assigned. AI users are likely more active developers to begin with — the paper acknowledges 'AI users are generally more active in the IDE' but still frames findings as AI-mediated changes. The causal framing (title, RQs) overreaches the observational design."
    343     },
    344     {
    345       "flag": "Survey recruitment bias",
    346       "detail": "Survey respondents were recruited from JetBrains' internal participant panel of users who previously self-identified as AI tool users. This is a convenience sample biased toward JetBrains users and AI enthusiasts, yielding only 62 responses from 1,231 invitations (5% response rate)."
    347     },
    348     {
    349       "flag": "Misclassification of AI non-users",
    350       "detail": "Non-users are defined as those who never used JetBrains AI Assistant. They may use ChatGPT, Copilot, or other external AI tools extensively. The paper acknowledges this but still labels them 'AI non-users.'"
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    356       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    357       "year": 2025,
    358       "arxiv_id": "2507.09089",
    359       "relevance": "RCT finding that AI assistants actually increase task completion time by 19%, directly contradicting productivity claims."
    360     },
    361     {
    362       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    363       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    364       "year": 2023,
    365       "arxiv_id": "2302.06590",
    366       "relevance": "Key Copilot productivity study claiming 50%+ improvement in task completion time."
    367     },
    368     {
    369       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    370       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    371       "year": 2024,
    372       "relevance": "Found developers spend over 50% of time evaluating/editing AI-generated output, relevant to productivity measurement."
    373     },
    374     {
    375       "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges",
    376       "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"],
    377       "year": 2024,
    378       "relevance": "Large-scale survey on AI assistant usability, finding some developers actively avoid AI tools."
    379     },
    380     {
    381       "title": "Productivity assessment of neural code completion",
    382       "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X Alice Li"],
    383       "year": 2022,
    384       "relevance": "Documents disconnect between perceived and actual productivity with code completion tools."
    385     },
    386     {
    387       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    388       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    389       "year": 2024,
    390       "arxiv_id": "2410.02091",
    391       "relevance": "Reports no significant quality change despite productivity improvements with Copilot."
    392     },
    393     {
    394       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    395       "authors": ["Agnia Sergeyuk", "Yaroslav Golubev", "Timofey Bryksin", "Iftekhar Ahmed"],
    396       "year": 2025,
    397       "relevance": "Prior survey of AI coding assistant usage by some of the same authors."
    398     },
    399     {
    400       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    401       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L Glassman"],
    402       "year": 2022,
    403       "relevance": "Found Copilot did not consistently reduce task time, documenting expectation-experience gap."
    404     },
    405     {
    406       "title": "Are large language models a threat to digital public goods? Evidence from activity on Stack Overflow",
    407       "authors": ["Maria del Rio-Chanona", "Nadzeya Laurentsyeva", "Johannes Wachs"],
    408       "year": 2023,
    409       "arxiv_id": "2307.07367",
    410       "relevance": "Documents StackOverflow activity drop post-ChatGPT, relevant to code reuse patterns."
    411     },
    412     {
    413       "title": "The SPACE of developer productivity: There's more to it than you think",
    414       "authors": ["Nicole Forsgren", "Margaret-Anne Storey", "Chandra Maddila"],
    415       "year": 2021,
    416       "relevance": "Foundational framework for developer productivity measurement, relevant to proxy-outcome distinction."
    417     }
    418   ]
    419 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs