ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22408B)


      1 {
      2   "paper": {
      3     "title": "The Impact of LLM-Assistants on Software Developer Productivity: A Systematic Literature Review",
      4     "authors": ["Amr Mohamed", "Maram Assi", "Mariam Guizani"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.03156",
      8     "doi": "10.48550/arXiv.2507.03156"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper states 'All artifacts associated with this study are publicly available at https://zenodo.org/records/15788502' and reference [20] is the replication package."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The Zenodo replication package [20] contains 'all study data, selection decisions, and exclusion rationales' per the contributions section."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, dependency files, or software versions are mentioned. The study is a manual SLR, but the Zenodo package does not describe any tool environment."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While data is released, no step-by-step reproduction instructions are described. The paper describes methodology but does not provide a guide for reproducing the analysis from the replication package."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a systematic literature review that synthesizes qualitative findings via thematic analysis; it does not run experiments or produce quantitative results requiring CIs."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No statistical comparisons are made; the paper counts and categorizes studies but does not perform statistical tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No experiments are conducted; findings are qualitative synthesis of primary studies."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The final set is 37 studies but the paper does not justify whether this is sufficient for the claims made or discuss saturation."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experimental runs; the SLR reports counts and thematic categories, not repeated measurements."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not compare its findings against prior SLRs or reviews on similar topics. It claims to be 'the first systematic literature review focused on the impact of LLM-assistants on software developer productivity' but does not position itself against related reviews."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "No baselines are included, so contemporariness is not applicable."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "An SLR has no system components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "Not an experimental study; evaluation metrics are not applicable."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation of system outputs is not relevant; this is a survey paper."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No test sets in an SLR."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper provides extensive breakdowns: by research strategy (Table 3), by procedure (Table 4), by SPACE dimension (Table 8, Figures 8-9), by benefit/risk theme (Figure 7, Tables 6-7), and by venue (Table 2)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses gaps and risks extensively in RQ2 (Section 6.2), including cases where LLM-assistants fail to improve or actually degrade productivity, code quality, and collaboration."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Mixed and negative findings are reported throughout: e.g., no significant difference in task completion time (PS2), increased frustration (PS10), negative correlation between throughput and quality (PS30, r=-0.45)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about benefits (minimized code search, accelerated development, automation), risks (cognitive offloading, reduced collaboration, inconsistent quality), and SPACE coverage (92% multi-dimensional, 14% beyond three) are all supported by data in Sections 4-7."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper synthesizes findings from primary studies and does not make its own causal claims. It reports what other studies found without asserting causation."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper bounds its scope to 37 peer-reviewed studies from 2014-2024, explicitly states inclusion/exclusion criteria, and the threats to validity section acknowledges potential omissions from the selection criteria."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 9 (Threats to Validity) discusses study selection bias, SLR bias and repeatability, and classification rigor. The paper also notes throughout that inconsistent findings across studies may stem from 'diverse operationalizations of cognitive load, differences in participants' expertise, task design, and the capabilities of LLM-assistants.'"
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly addresses the proxy-outcome gap: Section 2 discusses how traditional metrics (LOC, velocity) fail to capture broader productivity, and adopts the SPACE framework precisely because 'a single metric cannot meaningfully capture productivity.' The paper frames its analysis around this multi-dimensional understanding."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "No AI models are used in this SLR's methodology."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting is used in this SLR."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No AI models or hyperparameters are involved in the SLR methodology."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The paper documents the full filtering pipeline: 8,540 initial records → 331 duplicates removed → 8,209 screened → 8,005 excluded by title/abstract → 204 full-text → 172 excluded (with per-criterion counts: EC1=15, EC2=124, EC3=18, EC4=10) → 32 + 5 snowballing = 37. Inclusion/exclusion criteria are explicitly stated in Section 3.1.1."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 9 'Threats to Validity' provides a dedicated, substantive discussion of study selection bias, SLR bias and repeatability, and classification rigor."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The threats are specific: initial search strings including 'performance' and 'efficiency' yielded false positives focused on technical benchmarks; mapping findings to SPACE required interpretive decisions since SPACE was not designed for human-LLM collaboration; screening was led by one author with validation by others."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states scope: peer-reviewed studies only (excluding grey literature, theses, workshops), English only, 2014-2024, focused on LLM-assistants' impact on developer productivity. Exclusion criteria EC1-EC4 are clearly enumerated."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The replication package on Zenodo [20] contains 'all study data, selection decisions, and exclusion rationales' enabling independent verification of the selection process."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3 describes the full SLR methodology: four databases searched (ACM, IEEE Xplore, ScienceDirect, Web of Science), exact search strings provided in Table 1, search date (December 31, 2024), and the PRISMA flow chart in Figure 2."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants in this SLR. The paper reviews other studies; data source is database searches of published literature."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is fully documented with counts at each stage (Figure 2 PRISMA diagram): 8,540 → 8,209 after dedup → 204 after title/abstract screening → 32 after full-text → 37 after snowballing. Exclusion counts per criterion are provided."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly stated: two at Queen's University (Canada) and one at Université du Québec à Montréal (Canada). No conflict with evaluated products."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Funding is not disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This is an SLR that does not evaluate any pre-trained model on a benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation; contamination is not applicable to this SLR."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation in this SLR."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This SLR has no human participants. It reviews published studies."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this SLR."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this SLR."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The paper has inclusion/exclusion criteria for studies, which is covered under data_preprocessing_documented."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants or experimental conditions in this SLR."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants or experimental conditions."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper; no inference costs are relevant."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "This is a survey paper; no computational budget is relevant."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The paper follows PRISMA (Figure 2 is a PRISMA flow chart citing [49] Page et al. 2020) and explicitly follows Kitchenham and Charters [43] guidelines for SLRs in software engineering. Search strings are reproducible (Table 1) across four databases."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper does not assess the methodological quality of its 37 primary studies. It classifies them by strategy, procedure, and instruments (Section 5), but does not apply a quality scoring rubric or risk-of-bias assessment. All 37 studies are treated equally regardless of their rigor."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper does not discuss publication bias. There is no funnel plot, no discussion of whether published studies skew toward positive findings about LLM-assistants, and no acknowledgment that unpublished negative results may be missing."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "92% of studies adopt a multi-dimensional perspective on productivity by examining at least two SPACE dimensions, but only 14% extend beyond three dimensions.",
    311       "evidence": "Section 7 and Figure 9: 34 out of 37 studies examine 2+ dimensions; only 5 out of 37 examine more than 3.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Satisfaction (78%), Performance (65%), and Efficiency (59%) are the most frequently investigated SPACE dimensions, while Activity (27%) and Communication (35%) are underexplored.",
    316       "evidence": "Table 8 and Figure 8 provide per-dimension counts across all 37 studies.",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "The majority of studies are exploratory: 64% have formative objectives and laboratory experiments are the most common strategy (41%).",
    321       "evidence": "Section 5.1 (Table 3) and Section 5.2 classify all 33 empirical studies by strategy and objective.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Code quality is the only theme reported as both a benefit and a risk of LLM-assistants.",
    326       "evidence": "Section 6.1.7 (improve code quality) and Section 6.2.1 (limit code quality) with references to multiple primary studies on each side (Figure 7).",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "LLM-assistants' most frequently reported benefits are minimizing online code search, accelerating development, and automating trivial/repetitive tasks.",
    331       "evidence": "Figure 7 radar plot and Table 6 synthesize benefit frequency across primary studies.",
    332       "supported": "moderate"
    333     },
    334     {
    335       "claim": "Cognitive load findings are mixed: some studies report reduced mental effort, others neutral effects, and one reports increased frustration.",
    336       "evidence": "Section 5.3.3 reviews 6 studies using NASA-TLX with divergent results: [PS11, PS26, PS36] report improvements, [PS3, PS15] neutral, [PS10] increased frustration.",
    337       "supported": "strong"
    338     }
    339   ],
    340   "methodology_tags": ["meta-analysis"],
    341   "key_findings": "This SLR of 37 peer-reviewed studies (2014-2024) finds that LLM-assistants offer benefits including reduced code search, faster development, and task automation, but also introduce risks of over-reliance, reduced team collaboration, and inconsistent code quality effects. While 92% of studies examine multiple SPACE dimensions, only 14% extend beyond three, with Communication and Activity particularly underexplored. The research landscape is predominantly exploratory (64% formative) and relies heavily on laboratory experiments (41%) and self-reported methods (91% use surveys/interviews).",
    342   "red_flags": [
    343     {
    344       "flag": "No quality assessment of included studies",
    345       "detail": "The SLR synthesizes findings from 37 primary studies without assessing their methodological quality. Studies with 10 participants are weighted equally with those involving thousands. This risks laundering weak results alongside strong ones."
    346     },
    347     {
    348       "flag": "No publication bias discussion",
    349       "detail": "The paper excludes grey literature and non-peer-reviewed work but does not discuss whether the resulting corpus skews toward positive findings about LLM-assistants. Given that many primary studies involve industry tool evaluations, positive publication bias is a significant concern."
    350     },
    351     {
    352       "flag": "Narrow search scope may miss relevant work",
    353       "detail": "The authors acknowledge removing 'performance' and 'efficiency' from search terms due to false positives, but this risks missing studies that frame developer productivity through those lenses. The threats section notes this but the mitigation (snowballing) adds only 5 papers."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "Large language models for software engineering: A systematic literature review",
    359       "authors": ["Xinyi Hou"],
    360       "year": 2024,
    361       "relevance": "Comprehensive SLR on LLMs in software engineering, broader scope than this paper."
    362     },
    363     {
    364       "title": "The SPACE of developer productivity: There's more to it than you think",
    365       "authors": ["N Forsgren"],
    366       "year": 2021,
    367       "relevance": "Foundational framework for multi-dimensional developer productivity used as the analytical lens in this paper."
    368     },
    369     {
    370       "title": "Taking Flight with Copilot: Early insights and opportunities of AI-powered pair-programming tools",
    371       "authors": ["Christian Bird"],
    372       "year": 2022,
    373       "relevance": "Early analysis of GitHub Copilot's impact on developer workflows and productivity."
    374     },
    375     {
    376       "title": "AI-assisted Code Authoring at Scale: Fine-tuning, deploying, and mixed methods evaluation",
    377       "authors": ["Vijayaraghavan Murali"],
    378       "year": 2024,
    379       "relevance": "Meta's internal LLM code completion system evaluation with acceptance rate metrics and deployment analysis."
    380     },
    381     {
    382       "title": "Productivity assessment of neural code completion",
    383       "authors": ["Albert Ziegler"],
    384       "year": 2022,
    385       "relevance": "GitHub's statistical analysis of code completion interaction metrics and their relationship to self-reported productivity."
    386     },
    387     {
    388       "title": "Beyond code generation: An observational study of chatgpt usage in software engineering practice",
    389       "authors": ["Ranim Khojah"],
    390       "year": 2024,
    391       "relevance": "Field study of ChatGPT usage patterns in professional development, finding expert consultation as dominant use case."
    392     },
    393     {
    394       "title": "Rocks coding, not development: A human-centric, experimental evaluation of LLM-supported SE tasks",
    395       "authors": ["Wei Wang"],
    396       "year": 2024,
    397       "relevance": "Controlled experiment evaluating LLM-assisted coding showing time savings but no quality improvement."
    398     },
    399     {
    400       "title": "A large-scale survey on the usability of ai programming assistants: Successes and challenges",
    401       "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"],
    402       "year": 2024,
    403       "relevance": "Large-scale survey of AI programming assistant usability covering benefits and limitations from developer perspective."
    404     },
    405     {
    406       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    407       "authors": ["Hussein Mozannar"],
    408       "year": 2024,
    409       "relevance": "CHI study modeling developer interaction states with Copilot, finding 51.5% time in LLM interaction states."
    410     },
    411     {
    412       "title": "Significant productivity gains through programming with large language models",
    413       "authors": ["Thomas Weber"],
    414       "year": 2024,
    415       "relevance": "Controlled experiment comparing LLM-assisted programming across web search, code completion, and chat conditions using SPACE framework."
    416     },
    417     {
    418       "title": "Ironies of generative AI: understanding and mitigating productivity loss in Human-AI interaction",
    419       "authors": ["Auste Simkute"],
    420       "year": 2025,
    421       "relevance": "Examines automation ironies in generative AI contexts where productivity gains are offset by evaluation overhead."
    422     },
    423     {
    424       "title": "The role of firm AI capabilities in generative AI-pair coding",
    425       "authors": ["Jacques Bughin"],
    426       "year": 2024,
    427       "relevance": "Industry study of 70 large companies finding negative correlation (r=-0.45) between LLM-assisted throughput and code quality."
    428     }
    429   ]
    430 }

Impressum · Datenschutz