scan.json (24545B)
1 { 2 "paper": { 3 "title": "In-IDE Human-AI Experience in the Era of Large Language Models; A Literature Review", 4 "authors": [ 5 "Agnia Sergeyuk", 6 "Sergey Titov", 7 "Maliheh Izadi" 8 ], 9 "year": 2024, 10 "venue": "ICSE 2024", 11 "arxiv_id": "2401.10739", 12 "doi": "10.1145/3643796.3648463" 13 }, 14 "scan_version": 3, 15 "active_modules": ["survey_methodology"], 16 "methodology_tags": ["meta-analysis"], 17 "key_findings": "This literature review of 36 papers (2020–2024) identifies three research branches in in-IDE Human-AI Experience: Design of Interaction (14 papers on UI principles), Impact of Interaction (13 papers on workflow and productivity effects), and Quality of Interaction (9 papers on correctness, comprehensibility, and security). The authors find that AI tools increase productivity but may trade off code quality, that security risks vary by model (up to 40% vulnerability rate for C code), and that over-reliance on AI remains a challenge especially for novice developers. The paper proposes three future research directions: task-specific UI design, trust-building mechanisms, and code readability as an alignment criterion.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No analysis code or scripts are released. The paper provides a dataset on Zenodo but no code to reproduce the analysis or categorization." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The authors provide their curated dataset on Zenodo (https://doi.org/10.5281/zenodo.10290921), referenced in Section 4.2 and footnote 2." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment or dependency specifications are provided. As a qualitative survey, no computational environment was needed, but no tooling for data extraction or analysis is specified." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided. While the search string and databases are listed, there are no instructions for how to replicate the filtering and categorization process." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": false, 44 "answer": false, 45 "justification": "This is a qualitative literature review with no statistical experiments or quantitative aggregation." 46 }, 47 "significance_tests": { 48 "applies": false, 49 "answer": false, 50 "justification": "No comparative statistical claims are made; the paper categorizes and summarizes literature qualitatively." 51 }, 52 "effect_sizes_reported": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experiments are conducted; effect sizes are not applicable to this qualitative literature survey." 56 }, 57 "sample_size_justified": { 58 "applies": false, 59 "answer": false, 60 "justification": "Survey sample size is determined by the search strategy, not experimental power analysis. Not applicable for this paper type." 61 }, 62 "variance_reported": { 63 "applies": false, 64 "answer": false, 65 "justification": "No experimental runs are conducted; variance reporting is not applicable to a qualitative literature review." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": false, 72 "justification": "No prior surveys are compared against. The authors claim 'the absence of a literature survey in this domain' but do not systematically compare with related reviews or surveys in adjacent fields." 73 }, 74 "baselines_contemporary": { 75 "applies": false, 76 "answer": false, 77 "justification": "No baselines are included, so their contemporariness cannot be assessed." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "No system with components to ablate; this is a literature survey." 83 }, 84 "multiple_metrics": { 85 "applies": false, 86 "answer": false, 87 "justification": "No experiments are conducted that would require evaluation metrics." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "No system outputs to evaluate; this is a survey paper." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "No experiments with train/test splits are conducted." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by the three identified categories: Design (14 papers), Impact (13 papers), and Quality (9 papers), each discussed in dedicated subsections of Section 3." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses challenges and failure modes identified in reviewed work: over-reliance on AI, security vulnerabilities reaching 40% for C programs, code quality trade-offs, and compatibility issues with AI tools." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports negative findings from the reviewed literature including productivity-quality trade-offs, security vulnerabilities, over-reliance concerns for novices, and limitations of AI assistants for non-trivial tasks." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims three research branches were identified from 36 papers, which matches the analysis in Section 3. The proposed future directions (task-specific UI, trust, readability) are discussed in Section 4.1." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper relays causal claims from reviewed work as conclusions (e.g., 'Using AI tools increases productivity' in Section 3, 'in-IDE Human-AI Interaction significantly affects and changes the developers' workflow') without assessing whether the underlying study designs actually support causal inference." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The conclusion states 'it is evident that the field has three branches' as if this is the definitive structure of the field, generalizing from only 36 papers (22 from ArXiv). The title claims to cover 'the Era of Large Language Models' broadly. While temporal bias is acknowledged, the generalizations from 36 papers to the entire field are not bounded." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The threats to validity section (4.2) discusses methodological limitations (sampling bias, temporal bias, source reliability) but does not consider alternative categorizations of the field or alternative interpretations of the patterns found." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper analyzes 36 papers but frames findings as describing 'the field' of in-IDE HAX. The gap between a 36-paper sample (61% non-peer-reviewed ArXiv papers) and claims about the entire field's structure is not acknowledged." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": false, 145 "answer": false, 146 "justification": "No AI models are used in the paper's own methodology; this is a manual literature review." 147 }, 148 "prompts_provided": { 149 "applies": false, 150 "answer": false, 151 "justification": "No prompting is used; this is a manual literature review." 152 }, 153 "hyperparameters_reported": { 154 "applies": false, 155 "answer": false, 156 "justification": "No AI models or computational experiments are used in the paper's methodology." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used; this is a manual literature survey." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The search string and inclusion/exclusion criteria are stated (Section 2), but the filtering from 211 to 36 papers is presented as a single step with no intermediate stage counts, no screening agreement rates, and no documentation of the categorization process." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 4.2 'Threats to Validity' provides a dedicated discussion of potential validity concerns across four dimensions." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The threats section discusses study-specific concerns: sampling bias from the chosen databases, temporal bias from the 2020–2024 timeframe, reliability concerns about ArXiv preprints, and interpretation bias from categorizing the large corpus." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what it does NOT cover. Limitations are framed as potential biases rather than clear scope exclusions. No specific statements about excluded topics, populations, or settings." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The Zenodo dataset (DOI: 10.5281/zenodo.10290921) is provided, containing the full set of extracted data from the 36 reviewed papers." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 2 describes the data collection: four databases (ACM, DBLP, IEEE, ArXiv), the full search string, inclusion/exclusion criteria, and the resulting 211→36 paper counts." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper search strategy is documented: databases selected, search string provided, and filtering criteria described. Table 1 breaks down source venues." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The pipeline jumps from 211 initial papers to 36 final papers with no intermediate stage counts. The categorization of papers into Design/Impact/Quality is not documented as a process — no inter-rater agreement, no coding procedure described." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source is disclosed anywhere in the paper, despite two of three authors being affiliated with JetBrains Research, a commercial entity with products in the reviewed space." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are listed: JetBrains Research (Sergeyuk, Titov) and Delft University of Technology (Izadi). However, the paper does not acknowledge the conflict that JetBrains produces IDEs and an AI assistant plugin." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Two authors are employed by JetBrains Research, and JetBrains has a direct commercial interest in positive portrayals of in-IDE AI tools (JetBrains produces IntelliJ IDEA with an AI Assistant plugin, mentioned in footnote 1). The funder is not independent of the outcome." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement is present. JetBrains' commercial interest in the reviewed product space is not acknowledged." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "This is a literature survey that does not evaluate any pre-trained model on a benchmark." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "This is a literature survey that does not evaluate any pre-trained model on a benchmark." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "This is a literature survey that does not evaluate any pre-trained model on a benchmark." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this literature survey." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this literature survey." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this literature survey." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this literature survey." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this literature survey." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this literature survey." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this literature survey." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "Survey paper with no computational method to cost." 289 }, 290 "compute_budget_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "Survey paper with no computational experiments." 294 } 295 }, 296 "survey_methodology": { 297 "prisma_or_structured_protocol": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper provides a search string and databases but lacks a PRISMA flow diagram, protocol registration, intermediate-stage filtering counts, or screening agreement rates. The jump from 211 to 36 papers is presented as one step with no staged breakdown." 301 }, 302 "quality_assessment_of_sources": { 303 "applies": true, 304 "answer": false, 305 "justification": "No quality scoring, risk-of-bias assessment, or structured evaluation of the 36 included papers is performed. ArXiv preprints (22/36, 61%) are treated identically to peer-reviewed publications. The survey launders varying source quality without assessment." 306 }, 307 "publication_bias_discussed": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper mentions that ArXiv 'encourages publishing negative or null results' (Section 4.2) but does not systematically address publication bias, funnel plots, or whether the positive-result skew in the reviewed papers affects conclusions." 311 } 312 } 313 }, 314 "claims": [ 315 { 316 "claim": "In-IDE HAX research divides into three primary research branches: Design, Impact, and Quality of Interaction.", 317 "evidence": "Categorization of 36 papers: 14 on Design, 13 on Impact, 9 on Quality (Section 3). Each category is discussed in a dedicated subsection.", 318 "supported": "moderate" 319 }, 320 { 321 "claim": "Using AI tools increases productivity but may involve a trade-off in code quality.", 322 "evidence": "Cited from reviewed studies [10, 13, 17, 35, 36, 39, 43] in Section 3 (Impact). No quality assessment of these studies or quantified meta-analysis is provided.", 323 "supported": "weak" 324 }, 325 { 326 "claim": "The user interface of in-IDE AI assistance affects the usefulness of the tool and should be built thoughtfully.", 327 "evidence": "Synthesized from Design category studies [16, 20, 34, 38] in Section 3. Design principles are listed but no quantified evidence is presented.", 328 "supported": "weak" 329 }, 330 { 331 "claim": "Security risks depend on the model used, with vulnerability rates reaching 40% for generated C programs.", 332 "evidence": "Cited from reviewed studies [13, 25, 30] in Section 3 (Quality). The 40% figure is attributed to specific C-language evaluation in the reviewed work.", 333 "supported": "moderate" 334 }, 335 { 336 "claim": "AI assistance for novices positively influences programming education but introduces challenges like over-reliance.", 337 "evidence": "Cited from studies [2, 15, 26] in Section 3 (Impact). No quality assessment or meta-analysis of these findings is conducted.", 338 "supported": "weak" 339 } 340 ], 341 "red_flags": [ 342 { 343 "flag": "Undisclosed conflict of interest", 344 "detail": "Two of three authors are from JetBrains Research. JetBrains produces IntelliJ IDEA with an AI Assistant plugin (footnote 1 explicitly references it). The paper reviews in-IDE AI research without disclosing this commercial conflict or discussing how it might influence the review's framing." 345 }, 346 { 347 "flag": "Heavy reliance on non-peer-reviewed sources without quality assessment", 348 "detail": "22 of 36 papers (61%) are ArXiv preprints without peer review. No quality assessment, risk-of-bias scoring, or distinction is made between peer-reviewed and non-peer-reviewed sources. This risks laundering weak or unvalidated findings as established knowledge." 349 }, 350 { 351 "flag": "Opaque filtering pipeline", 352 "detail": "211 papers were reduced to 36 in a single described step with no intermediate counts, no screening agreement rates, and no documentation of how categorization decisions were made. The subjectivity of the 'thematically irrelevant' exclusion criterion is not addressed." 353 }, 354 { 355 "flag": "No structured quality assessment of reviewed papers", 356 "detail": "All 36 papers are treated as equally valid evidence regardless of their methodology, sample size, or rigor. The survey synthesizes conclusions from studies of varying quality without weighting or assessment, potentially amplifying weak findings." 357 } 358 ], 359 "cited_papers": [ 360 { 361 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 362 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 363 "year": 2021, 364 "arxiv_id": "2108.09293", 365 "relevance": "Security evaluation of LLM code generation — directly relevant to AI code assistant safety." 366 }, 367 { 368 "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models", 369 "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"], 370 "year": 2023, 371 "relevance": "Empirical study of programmer-AI interaction patterns with code generation, core to understanding LLM programming workflows." 372 }, 373 { 374 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 375 "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh", "Michel C. Desmarais"], 376 "year": 2023, 377 "arxiv_id": "2206.15331", 378 "relevance": "Evaluation of Copilot code correctness and comprehensibility — directly assesses AI code generation quality." 379 }, 380 { 381 "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models", 382 "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"], 383 "year": 2022, 384 "relevance": "Usability evaluation of LLM-powered code generation tools — key evidence on developer experience with AI assistants." 385 }, 386 { 387 "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges", 388 "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"], 389 "year": 2023, 390 "arxiv_id": "2303.17125", 391 "relevance": "Large-scale survey of AI programming assistant usability, directly relevant to understanding developer productivity claims." 392 }, 393 { 394 "title": "Productivity Assessment of Neural Code Completion", 395 "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X. Alice Li", "Andrew Rice"], 396 "year": 2022, 397 "relevance": "Productivity measurement of neural code completion — key evidence for AI coding assistant impact claims." 398 }, 399 { 400 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 401 "authors": ["Nhan Nguyen", "Sarah Nadi"], 402 "year": 2022, 403 "relevance": "Empirical evaluation of Copilot suggestion quality — benchmark for AI code generation correctness." 404 }, 405 { 406 "title": "Is GitHub's Copilot as Bad as Humans at Introducing Vulnerabilities in Code?", 407 "authors": ["Owura Asare", "Meiyappan Nagappan", "N. Asokan"], 408 "year": 2023, 409 "arxiv_id": "2204.04741", 410 "relevance": "Comparative security assessment of AI-generated vs human-written code vulnerabilities." 411 }, 412 { 413 "title": "Lost at C: A User Study on the Security Implications of Large Language Model Code Assistants", 414 "authors": ["Gustavo Sandoval", "Hammond Pearce", "Teo Nys", "Ramesh Karri", "Siddharth Garg", "Brendan Dolan-Gavitt"], 415 "year": 2023, 416 "arxiv_id": "2208.09727", 417 "relevance": "User study on security implications of LLM code assistants — combines human factors with security evaluation." 418 }, 419 { 420 "title": "Reading Between the Lines: Modeling User Behavior and Costs in AI-Assisted Programming", 421 "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"], 422 "year": 2023, 423 "arxiv_id": "2210.14306", 424 "relevance": "Models user behavior and costs during AI-assisted programming — evidence for workflow transformation claims." 425 }, 426 { 427 "title": "Is GitHub Copilot a Substitute for Human Pair-Programming? An Empirical Study", 428 "authors": ["Saki Imai"], 429 "year": 2022, 430 "relevance": "Empirical comparison of AI assistance vs human pair programming — directly tests AI productivity claims." 431 }, 432 { 433 "title": "Guidelines for Human-AI Interaction", 434 "authors": ["Saleema Amershi", "Dan Weld", "Mihaela Vorvoreanu", "Adam Fourney"], 435 "year": 2019, 436 "relevance": "Foundational HAX design guidelines that inform the paper's framework for evaluating in-IDE AI interactions." 437 } 438 ], 439 "engagement_factors": { 440 "practical_relevance": { 441 "score": 1, 442 "justification": "Provides a categorized overview of the field and future directions, useful for researchers orienting in the space, but nothing directly usable by practitioners." 443 }, 444 "surprise_contrarian": { 445 "score": 0, 446 "justification": "Confirms expected findings about AI tools increasing productivity with quality trade-offs; no contrarian insights." 447 }, 448 "fear_safety": { 449 "score": 1, 450 "justification": "Mentions security vulnerabilities in AI-generated code (up to 40% for C) and over-reliance risks, but these are synthesized from existing work, not novel findings." 451 }, 452 "drama_conflict": { 453 "score": 0, 454 "justification": "No controversy, no challenges to established positions, straightforward literature categorization." 455 }, 456 "demo_ability": { 457 "score": 0, 458 "justification": "No code, tool, or demo. The Zenodo dataset is the only artifact." 459 }, 460 "brand_recognition": { 461 "score": 2, 462 "justification": "JetBrains (IntelliJ makers) is a recognizable brand; the paper discusses GitHub Copilot prominently. Published at ICSE." 463 } 464 } 465 }