scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19721B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "The Impact of LLM-Assistants on Software Developer Productivity: A Systematic Literature Review",
      6     "authors": [
      7       "Amr Mohamed",
      8       "Maram Assi",
      9       "Mariam Guizani"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2507.03156",
     14     "doi": "10.48550/arXiv.2507.03156"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims—92% multidimensional coverage, 14% beyond three dimensions, named benefits/risks, underexplored SPACE dimensions—are directly supported by the RQ2 and RQ3 analyses across 37 primary studies.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "The SLR synthesizes findings from primary studies rather than making independent causal claims; aggregate statements like 'LLM-assistants offer benefits' are consistently framed as what the reviewed literature reports, not original causal findings.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Conclusions are bounded to the 37 peer-reviewed studies (2014–2024); the paper explicitly acknowledges methodological diversity, lack of longitudinal studies, and exploratory nature of the field as limits on generalizability.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Conflicting cognitive load and code quality findings are explicitly attributed to 'diverse operationalizations, differences in participant expertise, task design, and LLM capabilities'; the paper consistently offers multiple explanatory factors for heterogeneous results.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly critiques acceptance rate as a proxy for productivity, noting 'blind reliance on acceptance rate can lead to superficial improvements'; it systematically distinguishes self-reported perceptions from objective performance metrics throughout.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 9 'Threats to Validity' is a dedicated limitations section covering study selection bias, SLR repeatability bias, and classification rigor for SPACE framework mapping.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats include: search strings initially retrieving technical performance papers rather than human-centered ones, exclusion of grey literature introducing selection bias, and subjective SPACE sub-dimension mapping requiring interpretive decisions during data coding.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Scope is explicitly bounded to peer-reviewed studies in English, 2014–2024, directly investigating LLM-assistant impact on developer productivity; grey literature, secondary studies, and papers where productivity is a secondary topic are explicitly excluded.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding statement or acknowledgments section is present anywhere in the paper; funding sources are undisclosed.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: Amr Mohamed and Mariam Guizani at Queen's University (Canada), Maram Assi at Université du Québec à Montréal.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Funding is not disclosed, making it impossible to assess funder independence.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, conflict of interest declaration, or financial disclosure is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'LLM-assistants' is explicitly defined as 'generative AI tools powered by LLMs that support software development tasks'; the SPACE framework's five dimensions are defined in Section 2; 'developer productivity' is extensively discussed as a multidimensional construct.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Four contributions are explicitly enumerated in the introduction: first SLR on LLM-assistant productivity impact, structured methodological characterization, SPACE framework analysis, and actionable recommendations with a publicly available replication package.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper situates itself against prior SLR methodology (Kitchenham & Charters), prior productivity frameworks (SPACE, DevEx), and prior work on measuring developer productivity, explicitly explaining how this review fills the gap of no synthesis on LLM-specific productivity evidence.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "survey": {
    118       "search_and_selection": {
    119         "search_strategy_reproducible": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Full search queries for all four databases are provided in Table 1, searches were conducted on December 31, 2024, and database-specific syntax adaptations (e.g., NEAR/5 operators for IEEE/WoS) are documented.",
    123           "source": "haiku"
    124         },
    125         "inclusion_exclusion_explicit": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Three inclusion criteria (IC1–IC3) and four exclusion criteria (EC1–EC4) are explicitly stated; the PRISMA diagram (Figure 2) documents exclusion counts by criterion at each stage.",
    129           "source": "haiku"
    130         },
    131         "prisma_or_structured_protocol": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The paper explicitly uses a PRISMA flow chart (Figure 2, citing Page et al. 2021) and grounds the entire methodology in Kitchenham & Charters guidelines, including pre-review mapping and iterative query refinement.",
    135           "source": "haiku"
    136         },
    137         "search_terms_provided": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Complete search strings with full Boolean syntax for all four databases (ACM, IEEE Xplore, ScienceDirect, Web of Science) are provided in Table 1.",
    141           "source": "haiku"
    142         },
    143         "databases_listed": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Four databases are explicitly named—ACM Digital Library, IEEE Xplore, ScienceDirect, Web of Science—with result counts from each (4,044 + 491 + 3,734 + 271 = 8,540 total).",
    147           "source": "haiku"
    148         },
    149         "screening_process_documented": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Figure 2 (PRISMA flowchart) documents all stages with counts: 8,540 initial → 8,209 after dedup → 204 after title/abstract screening → 32 after full-text → 37 final (32 plus 5 from snowballing), with exclusion reasons labeled at each step.",
    153           "source": "haiku"
    154         },
    155         "review_scope_justified": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "The 2014–2024 timeframe is justified by noting LLM research is recent (only 4 of 37 papers predate 2022); database selection follows cited prior SE SLRs; the rationale for scope boundaries is explained if not exhaustively defended.",
    159           "source": "haiku"
    160         }
    161       },
    162       "synthesis_quality": {
    163         "conflicting_findings_acknowledged": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Conflicting findings are prominently acknowledged: code quality is reported as both benefit and risk; cognitive load findings range from reduced effort to increased frustration; a negative productivity-quality correlation (r=−0.45) directly contradicts papers showing quality improvements.",
    167           "source": "haiku"
    168         },
    169         "quality_assessment_of_sources": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The paper classifies primary studies by research strategy and methodology but applies no formal quality scoring rubric or risk-of-bias assessment to the 37 included studies; methodological characterization is descriptive, not evaluative.",
    173           "source": "haiku"
    174         },
    175         "publication_bias_discussed": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Publication bias is not discussed in the threats section or elsewhere; the paper excludes grey literature (which could skew toward positive peer-reviewed results) but does not acknowledge or address this as a systematic bias.",
    179           "source": "haiku"
    180         },
    181         "quantitative_synthesis_present": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Vote counting is used throughout (e.g., '92% of studies adopt a multidimensional perspective,' frequency tables in Tables 3–5, intersection diagrams in Figures 5/9); no effect-size meta-analysis is conducted but quantitative summary statistics are consistently provided.",
    185           "source": "haiku"
    186         },
    187         "recommendations_supported_by_evidence": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "All five recommendations (three for practitioners, two for researchers) are explicitly tied to findings from named primary studies with citations; no recommendation appears without reference to supporting evidence from the review.",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "92% of studies (34 out of 37) adopt a multidimensional perspective by examining at least two SPACE dimensions.",
    199       "evidence": "Direct count across all 37 primary studies mapped to SPACE dimensions in Section 7, Table 8, and Figure 9.",
    200       "supported": "strong"
    201     },
    202     {
    203       "claim": "Only 14% of studies (5 out of 37) extend beyond three SPACE dimensions.",
    204       "evidence": "RQ3 analysis; Figure 9 intersection diagram confirms only 5 studies cover more than 3 dimensions.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Laboratory experiments are the most common research strategy at 41% (15 out of 37 studies).",
    209       "evidence": "Table 3 classification of all 37 studies using Stol & Fitzgerald taxonomy.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "64% of studies have exploratory/formative objectives rather than summative conclusions.",
    214       "evidence": "Classification of 33 empirical studies using Hartson et al. taxonomy in Section 5.2.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Minimizing online code search is the most frequently reported benefit of LLM-assistants.",
    219       "evidence": "Thematic analysis of RQ2; reported across 7 field studies plus multiple controlled experiments (Figure 7 radar).",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Increased productivity through LLM-assistants is negatively correlated with code quality (r = −0.45).",
    224       "evidence": "Single study [PS30]: survey of 70 large global companies using econometric analysis.",
    225       "supported": "weak"
    226     },
    227     {
    228       "claim": "73% of all included studies (27 out of 37) were published in 2024.",
    229       "evidence": "Figure 3: publication frequency per year shows 27 of 37 studies in 2024.",
    230       "supported": "strong"
    231     },
    232     {
    233       "claim": "91% of empirical primary studies use self-reported data (surveys or interviews) as a primary instrument.",
    234       "evidence": "Table 5 and Section 5.3: 30 out of 33 empirical studies leverage self-reported data.",
    235       "supported": "strong"
    236     }
    237   ],
    238   "methodology_tags": [
    239     "qualitative",
    240     "meta-analysis"
    241   ],
    242   "key_findings": "This SLR of 37 peer-reviewed studies (2014–2024) finds that LLM-assistants offer mixed productivity outcomes: the most consistently reported benefits are reduced code search effort, accelerated development, and task automation, while key risks include over-reliance, cognitive offloading, and disrupted team collaboration. Code quality is a contested area—some studies show improvements while others show degradation, with one industry study finding a negative productivity-quality correlation (r=−0.45). The field is methodologically immature: 64% of studies are exploratory, 91% rely on self-report data, and only 14% examine more than three SPACE productivity dimensions, leaving Communication and Activity systematically underexplored. The research is heavily concentrated in 2024 (73% of studies), reflecting the recency of widespread LLM tool adoption.",
    243   "red_flags": [
    244     {
    245       "flag": "No quality assessment of primary studies",
    246       "detail": "The review classifies studies by strategy and methodology but applies no formal quality scoring rubric or risk-of-bias assessment to the 37 included papers, making it impossible to weight evidence by study quality."
    247     },
    248     {
    249       "flag": "Publication bias not addressed",
    250       "detail": "The threats section does not discuss publication bias; excluding grey literature while including only peer-reviewed venues could systematically inflate apparent benefits of LLM-assistants in the synthesized findings."
    251     },
    252     {
    253       "flag": "No funding disclosure",
    254       "detail": "No funding acknowledgment or competing interests statement is present, which is unusual for an academic SLR and prevents assessment of potential industry bias."
    255     },
    256     {
    257       "flag": "91% self-report reliance not flagged as critical validity threat",
    258       "detail": "The paper notes that 91% of empirical studies rely on self-reported data but treats this as a descriptive characteristic rather than a major threat to the validity of synthesized productivity claims."
    259     },
    260     {
    261       "flag": "Single-study effect size presented without adequate qualification",
    262       "detail": "The negative productivity-quality correlation (r=−0.45) comes from one survey of 70 companies [PS30]; it is cited as a key finding without sufficient caveats about its limited generalizability."
    263     }
    264   ],
    265   "cited_papers": [
    266     {
    267       "title": "The SPACE of developer productivity: There's more to it than you think (Forsgren et al.)",
    268       "relevance": "The organizing framework for RQ3; used to map all 37 primary studies to productivity dimensions."
    269     },
    270     {
    271       "title": "Guidelines for performing systematic literature reviews in software engineering (Kitchenham & Charters)",
    272       "relevance": "The foundational methodology this SLR follows throughout, including pre-review mapping and search string construction."
    273     },
    274     {
    275       "title": "Grounded Copilot: How programmers interact with code-generating models (Barke, James, Polikarpova)",
    276       "relevance": "Describes acceleration vs. exploration interaction modes; cited to contextualize findings on development speed and flow."
    277     },
    278     {
    279       "title": "DevEx: What actually drives productivity? (Noda et al.)",
    280       "relevance": "Alternative productivity framework (feedback loops, cognitive load, flow state) discussed alongside SPACE in the background."
    281     },
    282     {
    283       "title": "Productivity assessment of neural code completion (Ziegler et al.)",
    284       "relevance": "Primary study [PS16] finding strong correlation between accepted suggestions and perceived productivity; used to justify acceptance rate as proxy despite limitations."
    285     },
    286     {
    287       "title": "Large language models for software engineering: A systematic literature review (Hou et al.)",
    288       "relevance": "Broader LLM-in-SE SLR cited to situate this focused productivity review within the larger research landscape."
    289     },
    290     {
    291       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming (Mozannar et al.)",
    292       "relevance": "Key finding that developers spend 51.5% of coding time in LLM interaction states; supports claims about flow disruption and role shift from coder to reviewer."
    293     },
    294     {
    295       "title": "The ABC of software engineering research (Stol & Fitzgerald)",
    296       "relevance": "Taxonomy used to classify all 37 primary studies by research strategy (field study, lab experiment, etc.)."
    297     }
    298   ],
    299   "engagement_factors": {
    300     "practical_relevance": {
    301       "score": 3,
    302       "justification": "Directly synthesizes evidence for practitioners deciding whether and how to adopt LLM-assistants, with specific recommendations on trust calibration, workflow adaptation, and managing cognitive offloading risks."
    303     },
    304     "surprise_contrarian": {
    305       "score": 2,
    306       "justification": "The productivity-quality trade-off (r=−0.45) and finding that developers spend 50%+ of time in LLM evaluation rather than code writing challenge common productivity-gain narratives."
    307     },
    308     "fear_safety": {
    309       "score": 1,
    310       "justification": "Raises concerns about erosion of critical thinking skills in novices and automation complacency, but frames these as research gaps rather than urgent safety issues."
    311     },
    312     "drama_conflict": {
    313       "score": 1,
    314       "justification": "Code quality appearing as both benefit and risk creates interpretive tension, but the paper presents this as a nuanced finding rather than controversy."
    315     },
    316     "demo_ability": {
    317       "score": 0,
    318       "justification": "This is a literature review paper with a replication package; there is nothing interactive to demonstrate."
    319     },
    320     "brand_recognition": {
    321       "score": 1,
    322       "justification": "Authors are from Queen's University and UQAM; no famous lab branding, though the reviewed tools (GitHub Copilot, ChatGPT) are well-known."
    323     }
    324   },
    325   "hn_data": {
    326     "threads": [
    327       {
    328         "hn_id": "40876840",
    329         "title": "LivePortrait: A fast, controllable portrait animation model",
    330         "points": 203,
    331         "comments": 25,
    332         "url": "https://news.ycombinator.com/item?id=40876840",
    333         "created_at": "2024-07-04T18:02:50Z"
    334       },
    335       {
    336         "hn_id": "43287470",
    337         "title": "Substructural Parametricity",
    338         "points": 3,
    339         "comments": 0,
    340         "url": "https://news.ycombinator.com/item?id=43287470",
    341         "created_at": "2025-03-07T04:57:16Z"
    342       },
    343       {
    344         "hn_id": "42635091",
    345         "title": "LLMs for AGI",
    346         "points": 2,
    347         "comments": 0,
    348         "url": "https://news.ycombinator.com/item?id=42635091",
    349         "created_at": "2025-01-08T15:15:35Z"
    350       }
    351     ],
    352     "top_points": 203,
    353     "total_points": 208,
    354     "total_comments": 25
    355   }
    356 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs