ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25183B)


      1 {
      2   "paper": {
      3     "title": "\"Maybe We Need Some More Examples:\" Individual and Team Drivers of Developer GenAI Tool Use",
      4     "authors": ["Courtney Miller", "Rudrajit Choudhuri", "Mara Ulloa", "Sankeerti Haniyur", "Robert DeLine", "Margaret-Anne Storey", "Emerson Murphy-Hill", "Christian Bird", "Jenna L. Butler"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.21280",
      8     "doi": "10.48550/arXiv.2507.21280"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["qualitative"],
     13   "key_findings": "Through paired interviews with 54 developers (27 matched pairs from the same teams), the study identifies that frequent GenAI tool users view tools as collaborators, adopt experimental approaches, and demonstrate adaptive persistence when facing challenges, while infrequent users view tools as features, adopt conservative approaches, and quickly abandon tools after failures. The study identifies a 'Productivity Pressure Paradox' where organizational expectations for rapid productivity gains without learning support undermine the very benefits that motivate adoption. Team and organizational factors (leadership messaging, context-specific resources, social learning structures, protected learning time) actively shape individual adoption patterns.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code or analysis scripts are released. The paper mentions supplementary materials (interview guide, codebook) will be posted on Zenodo with camera ready, but no repository URL is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Interview transcripts are not released (understandable for privacy). The paper states interview guide, codebook, and codebook manual are available in supplemental materials on HotCRP, with plans to post on Zenodo, but no data download link is provided."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a qualitative interview study with no computational experiments requiring environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While the methodology section describes the interview protocol and analysis approach in reasonable detail, no step-by-step reproduction instructions are provided. The supplementary materials (interview guide, codebook) are promised but not yet available."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a qualitative study. The authors explicitly state they 'avoid discussing frequency counts or percentages in line with established guidance against quantifying qualitative data' (Section 3.4)."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Qualitative thematic analysis study; no statistical hypothesis testing is performed or claimed."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative effect sizes are relevant to this qualitative study design."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The authors describe their saturation criterion: 'two consecutive interview pairs without learning any new major insights (i.e., 4 interviews)' (Section 3.4), citing Francis et al. (2010) for operationalizing data saturation."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No quantitative measurements requiring variance reporting in this qualitative study."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a qualitative interview study, not an evaluation of a system. The paired design comparing frequent vs. infrequent users serves as the comparative structure."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Not applicable to a qualitative interview study."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Not applicable to a qualitative interview study."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Not applicable to a qualitative interview study."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — the study IS a human interview study, not a system evaluation requiring human judges."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not applicable to qualitative research."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are organized by theme categories (mindset formation, approach determination, encounters with challenges, integration and evolution) with separate analysis of frequent vs. infrequent users. Figure 3 provides a visual breakdown of factor distributions across user types."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses cases where the framework's patterns do not hold — e.g., PID35 remained infrequent despite a collaborative team environment (Section 5.3), and Section 5 shows that organizational factors affect team members differently despite shared context."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that peer influence 'operates differently even within teams' (Section 5.3) and that formal organizational messaging alone is insufficient — PID37's team received minimal guidance reinforcing limited usage. These counter-narratives complicate the framework rather than being suppressed."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about collaborator vs. feature perception, experimental vs. conservative approaches, adaptive persistence vs. quick abandonment, and the Productivity Pressure Paradox are all supported with extensive interview evidence in Sections 4 and 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly acknowledges in Section 3.6: 'we occasionally use causal language (e.g., \"leads to\") for clarity and readability. These phrasings should not be interpreted as statistical causal claims, as our qualitative design does not support formal causal inference.' This is an appropriate hedge."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 3.6 explicitly states limitations to generalizability: single large company, GitHub Copilot only, excludes Germany/Norway, may not generalize to smaller companies, open source, or companies without AI promotion. The authors also acknowledge construct validity limitations around measuring only Copilot usage."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses that infrequent users classified via Copilot telemetry may be heavy users of other AI tools (Section 3.6). Section 5 explicitly addresses how the same organizational factors produce divergent responses, referencing social construction theory (Fulk, Orlikowski) rather than assuming simple causal paths."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures GitHub Copilot usage days as a proxy for GenAI tool usage and explicitly acknowledges the gap: 'Engineers using general-purpose AI tools (e.g., internal OpenAI models) or non-compliant tools (e.g., Cursor, Windsurf) could be classified as infrequent users despite potentially heavy AI usage elsewhere' (Section 3.6)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models are used as part of the study methodology. The study is about developers using GitHub Copilot, but the researchers themselves do not use AI models for evaluation."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting of AI models is part of the study methodology."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No AI model hyperparameters are relevant to this qualitative interview study."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used in this study."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper documents the pairing algorithm (Hungarian method via scipy.optimize), filtering criteria (same primary programming language, usage gap in top third), interview scheduling process, and exclusion of 2 participants whose pair partners dropped out (Sections 3.2 and 3.4)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3.6 'Limitations' provides a dedicated, substantive discussion of study limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 3.6 discusses specific threats: self-selection bias, single-company limitation with strong pro-AI culture, construct validity issue of measuring only Copilot usage, and potential response bias from organizational pressure. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 3.6 explicitly states what the results do not show: 'This limits generalizability to engineers at smaller or larger companies without AI promotion, or to open source development contexts.' The paper also notes Copilot-only measurement does not capture other tool usage."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Interview transcripts are not available (reasonable for privacy, but limits independent verification). Only interview guide and codebook are promised as supplementary materials."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 3.2-3.4 describe data collection in detail: telemetry extraction, pairing algorithm, 8-week observation window (April 1 - May 31, 2025), video conferencing interviews, 670 invitations sent, 315 agreed, 152 from 76 pairs, 56 interviews conducted."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.2 describes recruitment: internal company chat, reaching out to individual developers from algorithmically-identified pairs, excluding Germany and Norway for regulatory reasons. Potential recruitment bias from self-selection is acknowledged in limitations."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: 670 invitations → 315 agreed → 152 from 76 complete pairs → 56 interviews conducted → 2 excluded (partner dropout) → 54 participants (27 pairs) in final analysis. The coding process is also documented: open coding → 8-interview framework review → iterative refinement → full re-coding."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No explicit funding disclosure or acknowledgment of grants. The acknowledgments mention internships at Microsoft and visiting researcher status but do not disclose formal funding sources."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: 4 authors at Microsoft (the company whose tool, GitHub Copilot, is studied), plus affiliations at CMU, Oregon State, Northwestern, and University of Victoria. Microsoft affiliation is prominent."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Microsoft employees are studying Microsoft's own product (GitHub Copilot). Microsoft has a direct financial interest in demonstrating that adoption barriers can be overcome and that the tool provides value. The funder/employer is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Several authors are Microsoft employees studying a Microsoft product (GitHub Copilot), creating an inherent financial interest that is not formally declared."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is a qualitative interview study, not a benchmark evaluation of a pre-trained model's capabilities."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — no pre-trained model is being evaluated on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — no benchmark evaluation is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No mention of pre-registration. The interview protocol evolved iteratively (Section 3.3), which is standard for qualitative research but means the study was not pre-registered."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of IRB or ethics board approval anywhere in the paper. Section 3.1 discusses ethical considerations (privacy, anonymity, non-judgmental environments) but does not mention formal ethics review."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Table 1 reports participant demographics: location (7 countries), role (Senior SWE, SWE II, SWE), and Copilot usage duration. Figure 2 shows usage distributions for frequent and infrequent groups."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Section 3.2 describes inclusion criteria: same primary programming language, usage gap in top third, same career stage/title/level/country/manager. Exclusion: Germany and Norway for regulatory reasons, pairs where one partner didn't agree."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not applicable — this is an observational/qualitative study, not an experiment with random assignment to conditions."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Not applicable — this is a qualitative interview study, not an experiment requiring blinding."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Section 3.4 reports: 670 invitations → 315 agreed → 152 from 76 pairs → 56 interviews conducted → 2 excluded when pair partners dropped out → 54 final participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a qualitative interview study with no computational method whose cost would be relevant."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No computational experiments are performed."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Frequent GenAI tool users view tools as collaborative partners while infrequent users view them as utility features, and this perception shapes subsequent usage patterns.",
    296       "evidence": "Section 4.1 with quotes from multiple participants: PID2 uses Copilot 'as a colleague', PID7 compares it to 'a stack overflow search'. Contrasting pairs (PID7/PID23) illustrate how perception differences lead to different engagement patterns within the same team.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Frequent users demonstrate adaptive persistence when facing challenges while infrequent users more often quickly abandon the tool.",
    301       "evidence": "Section 4.3 with paired comparisons: PID27 breaks tasks into smaller components when facing hallucination; PID24 gives up after compile errors and reverts to manual methods. PID24/PID26 pair directly contrasts responses to similar prompt engineering challenges. Figure 3 shows distribution differences.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "A 'Productivity Pressure Paradox' exists where organizational expectations for rapid productivity gains without learning support undermine the skill development needed to achieve those gains.",
    306       "evidence": "Section 5.4 with quotes from multiple participants: PID20 notes rising expectations, PID46 describes the 'chicken or egg problem' of investing time in prompt crafting, PID6 describes pressure-induced reversion to familiar methods, PID44 describes being overburdened. Figure 4 illustrates the cycle.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Team-specific demonstrations and context-specific resources are more valuable than generic AI guidance for promoting adoption.",
    311       "evidence": "Section 5.2: PID48 calls for 'a comprehensive guide of how to use it for specific projects'; PID6 describes a colleague's shared-screen demo of using agent mode for a common team task as 'very helpful'. PID33 describes being overwhelmed by 10+ generic AI meetings per week.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "red_flags": [
    316     {
    317       "flag": "Company evaluating its own product",
    318       "detail": "Four of nine authors are Microsoft employees. The study examines GitHub Copilot adoption at Microsoft. While the findings are largely about adoption barriers (not product promotion), the framing positions challenges as organizational rather than product-related, potentially deflecting from product quality issues. No formal competing interests statement is provided."
    319     },
    320     {
    321       "flag": "No IRB/ethics approval mentioned",
    322       "detail": "The study interviews 54 human participants about sensitive workplace topics (tool usage patterns, performance pressure, job security fears) at their employer, yet no IRB or ethics board approval is mentioned despite a detailed ethical considerations section."
    323     },
    324     {
    325       "flag": "Potential social desirability bias",
    326       "detail": "Section 3.6 acknowledges the company's 'strong pro-AI culture may have influenced participants' responses.' Infrequent users at a company 'vocally committed to AI adoption' may understate resistance or overstate willingness to adopt. The study frames this as a limitation but does not attempt to measure or mitigate it beyond neutral question wording."
    327     },
    328     {
    329       "flag": "No quantification of qualitative patterns",
    330       "detail": "The paper explicitly avoids frequency counts, citing guidance against quantifying qualitative data. While methodologically defensible, this means claims like 'many frequent-AI users' vs 'many infrequent-AI users' cannot be verified for how dominant these patterns actually were. Figure 3 shows proportional distributions but without counts."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace",
    336       "authors": ["Jenna Butler", "Jina Suh", "Sankeerti Haniyur", "Constance Hadley"],
    337       "year": 2025,
    338       "relevance": "RCT of GenAI coding tools in the workplace, finding mixed outcomes including decreased efficiency for some developers."
    339     },
    340     {
    341       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    342       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    343       "year": 2025,
    344       "arxiv_id": "2507.09089",
    345       "relevance": "Empirical measurement of AI impact on developer productivity showing mixed results."
    346     },
    347     {
    348       "title": "Navigating the complexity of generative ai adoption in software engineering",
    349       "authors": ["Daniel Russo"],
    350       "year": 2024,
    351       "relevance": "Study of GenAI adoption factors in SE finding compatibility outweighs traditional UTAUT factors."
    352     },
    353     {
    354       "title": "Exploring Individual Factors in the Adoption of LLMs for Specific Software Engineering Tasks",
    355       "authors": ["Stefano Lambiase", "Gemma Catolino", "Fabio Palomba", "Filomena Ferrucci", "Daniel Russo"],
    356       "year": 2025,
    357       "arxiv_id": "2504.02553",
    358       "relevance": "Examines how adoption drivers vary by task type, finding peer opinions can backfire by creating unrealistic expectations."
    359     },
    360     {
    361       "title": "A large-scale survey on the usability of ai programming assistants: Successes and challenges",
    362       "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"],
    363       "year": 2024,
    364       "relevance": "Large-scale survey documenting usability challenges developers face with AI programming assistants."
    365     },
    366     {
    367       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    368       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    369       "year": 2023,
    370       "arxiv_id": "2302.06590",
    371       "relevance": "Key productivity study estimating Copilot's impact on developer task completion speed."
    372     },
    373     {
    374       "title": "Navigating the jagged technological frontier: Field experimental evidence of the effects of AI on knowledge worker productivity and quality",
    375       "authors": ["Fabrizio Dell'Acqua", "Edward McFowland III", "Ethan R Mollick"],
    376       "year": 2023,
    377       "relevance": "Influential field experiment showing AI improves some tasks but harms performance on others (jagged frontier concept)."
    378     },
    379     {
    380       "title": "What Guides Our Choices? Modeling Developers' Trust and Behavioral Intentions Towards GenAI",
    381       "authors": ["Rudrajit Choudhuri", "Bianca Trinkenreich", "Rahul Pandita"],
    382       "year": 2024,
    383       "arxiv_id": "2409.04099",
    384       "relevance": "Models trust factors in developer GenAI adoption including system quality, functional value, and goal alignment."
    385     },
    386     {
    387       "title": "Copiloting the future: How generative AI transforms Software Engineering",
    388       "authors": ["Leonardo Banh", "Florian Holldack", "Gero Strobel"],
    389       "year": 2025,
    390       "relevance": "Survey on how GenAI transforms SE, documenting enablers, barriers, and ethical concerns."
    391     },
    392     {
    393       "title": "Human-AI Experience in Integrated Development Environments: A Systematic Literature Review",
    394       "authors": ["Agnia Sergeyuk", "Ilya Zakharov", "Ekaterina Koshchenko", "Maliheh Izadi"],
    395       "year": 2025,
    396       "arxiv_id": "2503.06195",
    397       "relevance": "Systematic review of human-AI interaction in IDEs, relevant to understanding developer experience with AI tools."
    398     },
    399     {
    400       "title": "\"It would work for me too\": How online communities shape software developers' trust in AI-powered code generation tools",
    401       "authors": ["Ruijia Cheng", "Ruotong Wang", "Thomas Zimmermann", "Denae Ford"],
    402       "year": 2024,
    403       "relevance": "Studies how peer influence in online communities shapes developer trust in AI code generation tools."
    404     },
    405     {
    406       "title": "AI tool use and adoption in software development by individuals and organizations: a grounded theory study",
    407       "authors": ["Ze Shi Li", "Nowshin Nawar Arony", "Ahmed Musa Awon", "Daniela Damian", "Bowen Xu"],
    408       "year": 2024,
    409       "arxiv_id": "2406.17325",
    410       "relevance": "Grounded theory study of AI tool adoption in software development at individual and organizational levels."
    411     }
    412   ]
    413 }

Impressum · Datenschutz