ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18509B)


      1 {
      2   "paper": {
      3     "title": "On the Future of Software Reuse in the Era of AI Native Software Engineering",
      4     "authors": ["Antero Taivalsaari", "Tommi Mikkonen", "Cesare Pautasso"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.19834",
      8     "doi": "10.48550/arXiv.2508.19834"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical"],
     13   "key_findings": "This position paper frames AI-assisted code generation as a new form of 'generative reuse' analogous to cargo cult programming, where developers trust AI-generated code they don't understand. The authors review productivity studies finding contradictory results (26% more completed tasks in industry trials vs 19% slower for experienced OSS developers), identify hallucination and slopsquatting as key risks, and propose a research agenda of 16 questions about the limits of prompt engineering and AI-native development. They suggest an '80/20 rule' where AI handles 80% of requirements easily but the remaining 20% consumes 80% of development time.",
     14   "claims": [
     15     {
     16       "claim": "AI-assisted generative reuse is conceptually a new form of cargo cult development where developers trust code from an external oracle whose workings are unknown.",
     17       "evidence": "Argued by analogy throughout Sections 1, 3, and 6, drawing on Feynman's cargo cult science concept and the authors' prior work on opportunistic reuse.",
     18       "supported": "moderate"
     19     },
     20     {
     21       "claim": "Productivity studies on AI coding tools show contradictory results, ranging from 58% faster to 19% slower.",
     22       "evidence": "Section 4.3 cites Peng et al. (58% decrease in time), Cui et al. (26% increase in completed tasks), and Becker et al. (19% longer for experienced OSS developers).",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "Prompt engineering follows an '80/20 rule' where AI can generate ~80% of a system but the remaining ~20% consumes ~80% of development time.",
     27       "evidence": "Section 5 states this as an observation from the authors' experience, explicitly noting 'we have not yet performed any truly scientific empirical studies on actual percentages.'",
     28       "supported": "weak"
     29     },
     30     {
     31       "claim": "Slopsquatting opens new supply chain attack vectors by registering malicious packages under hallucinated library names.",
     32       "evidence": "Section 6 describes the mechanism and references Wang et al. [53] on deprecated library problems, but provides no empirical data on prevalence.",
     33       "supported": "weak"
     34     }
     35   ],
     36   "checklist": {
     37     "artifacts": {
     38       "code_released": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Theoretical/position paper with no software artifacts to release."
     42       },
     43       "data_released": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No data collected or analyzed; this is a discussion paper."
     47       },
     48       "environment_specified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No computational experiments performed."
     52       },
     53       "reproduction_instructions": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experiments to reproduce."
     57       }
     58     },
     59     "statistical_methodology": {
     60       "confidence_intervals_or_error_bars": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "No original experiments or statistical analyses conducted."
     64       },
     65       "significance_tests": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No comparative empirical claims from original data."
     69       },
     70       "effect_sizes_reported": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No original experiments; effect sizes cited are from other papers."
     74       },
     75       "sample_size_justified": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Theoretical paper with no sample."
     79       },
     80       "variance_reported": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No original experiments."
     84       }
     85     },
     86     "evaluation_design": {
     87       "baselines_included": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No evaluation conducted; this is a position paper proposing a research agenda."
     91       },
     92       "baselines_contemporary": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No evaluation conducted."
     96       },
     97       "ablation_study": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "No system or method to ablate."
    101       },
    102       "multiple_metrics": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "No evaluation conducted."
    106       },
    107       "human_evaluation": {
    108         "applies": false,
    109         "answer": false,
    110         "justification": "No system outputs to evaluate."
    111       },
    112       "held_out_test_set": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "No evaluation conducted."
    116       },
    117       "per_category_breakdown": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "No results to break down."
    121       },
    122       "failure_cases_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 4.2 discusses specific failure modes of AI-generated code: hallucination, slopsquatting, deprecated library references, ambiguity in prompt-based requirements, and non-repeatable outputs."
    126       },
    127       "negative_results_reported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 4.3 reports the Becker et al. finding that AI tools made experienced developers 19% slower, and the ZDNet comparison showing several tools produced outright wrong results."
    131       }
    132     },
    133     "claims_and_evidence": {
    134       "abstract_claims_supported": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The abstract claims to 'discuss the implications of AI-assisted generative software reuse, bring forth relevant questions, and define a research agenda.' The paper delivers on all three in Sections 4-5."
    138       },
    139       "causal_claims_justified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper makes no causal claims; it discusses implications and poses questions rather than asserting causal relationships."
    143       },
    144       "generalization_bounded": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper makes broad claims about AI-native software engineering as a paradigm shift based primarily on anecdotal experience ('based on our cumulative experience of over hundred years') without bounding these generalizations to specific contexts or technologies."
    148       },
    149       "alternative_explanations_discussed": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper presents one interpretation (generative reuse as cargo cult development) without seriously considering alternatives. For instance, it does not discuss whether AI-generated code might develop its own quality norms distinct from human conventions, or whether the cargo cult framing overstates the problem."
    153       },
    154       "proxy_outcome_distinction": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "Theoretical paper with no measurements of its own."
    158       }
    159     },
    160     "setup_transparency": {
    161       "model_versions_specified": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No models used in experiments."
    165       },
    166       "prompts_provided": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No prompting conducted."
    170       },
    171       "hyperparameters_reported": {
    172         "applies": false,
    173         "answer": false,
    174         "justification": "No experiments conducted."
    175       },
    176       "scaffolding_described": {
    177         "applies": false,
    178         "answer": false,
    179         "justification": "No agentic scaffolding used."
    180       },
    181       "data_preprocessing_documented": {
    182         "applies": false,
    183         "answer": false,
    184         "justification": "No data collected or processed."
    185       }
    186     },
    187     "limitations_and_scope": {
    188       "limitations_section_present": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No dedicated limitations section. The paper has Discussion (Section 6) and Conclusions (Section 7) but neither contains a substantive discussion of limitations of the paper's own analysis or arguments."
    192       },
    193       "threats_to_validity_specific": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No threats to validity discussed. The authors do not acknowledge, for instance, that their perspective may be biased by their specific experience or that the cargo cult framing may not generalize."
    197       },
    198       "scope_boundaries_stated": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper does not explicitly state what its analysis does NOT cover. It makes sweeping statements about AI-native software engineering without bounding the scope to specific types of development, specific tools, or specific domains."
    202       }
    203     },
    204     "data_integrity": {
    205       "raw_data_available": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No data collected; theoretical paper."
    209       },
    210       "data_collection_described": {
    211         "applies": false,
    212         "answer": false,
    213         "justification": "No data collection conducted."
    214       },
    215       "recruitment_methods_described": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No participants or data sources recruited."
    219       },
    220       "data_pipeline_documented": {
    221         "applies": false,
    222         "answer": false,
    223         "justification": "No data pipeline."
    224       }
    225     },
    226     "conflicts_of_interest": {
    227       "funding_disclosed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding or acknowledgments section present in the paper."
    231       },
    232       "affiliations_disclosed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Author affiliations are clearly listed: Nokia Technologies (Taivalsaari), University of Jyväskylä (Mikkonen), and USI (Pautasso)."
    236       },
    237       "funder_independent_of_outcome": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Taivalsaari is affiliated with Nokia Technologies, which has a stake in software development practices. No funding disclosure is provided to assess independence."
    241       },
    242       "financial_interests_declared": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No competing interests statement present. Taivalsaari's Nokia affiliation represents a potential interest that is not explicitly addressed."
    246       }
    247     },
    248     "contamination": {
    249       "training_cutoff_stated": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No pre-trained model evaluated on any benchmark."
    253       },
    254       "train_test_overlap_discussed": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No benchmark evaluation conducted."
    258       },
    259       "benchmark_contamination_addressed": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No benchmark evaluation conducted."
    263       }
    264     },
    265     "human_studies": {
    266       "pre_registered": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "irb_or_ethics_approval": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "demographics_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "inclusion_exclusion_criteria": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       },
    286       "randomization_described": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants."
    290       },
    291       "blinding_described": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants."
    295       },
    296       "attrition_reported": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       }
    301     },
    302     "cost_and_practicality": {
    303       "inference_cost_reported": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "Theoretical paper; no method with inference costs."
    307       },
    308       "compute_budget_stated": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "No computation performed."
    312       }
    313     }
    314   },
    315   "red_flags": [
    316     {
    317       "flag": "Claims outrun evidence",
    318       "detail": "The '80/20 rule' for prompt engineering is presented as a finding but the authors explicitly admit they have 'not yet performed any truly scientific empirical studies on actual percentages.' This anecdotal observation is given prominent placement in Section 5."
    319     },
    320     {
    321       "flag": "Selective citation of productivity studies",
    322       "detail": "Section 4.3 cites studies showing both gains and losses but does not systematically review the literature. The selection of studies appears driven by the narrative (contradictory results) rather than a structured search."
    323     },
    324     {
    325       "flag": "Appeal to authority over evidence",
    326       "detail": "The paper repeatedly appeals to the authors' 'cumulative experience of over hundred years' as a basis for claims, rather than providing systematic evidence. Section 7 opens with this credential."
    327     },
    328     {
    329       "flag": "No structured methodology for literature review",
    330       "detail": "Despite reviewing multiple studies on AI productivity and citing 55 references, the paper does not describe any systematic method for identifying or selecting the literature it discusses."
    331     },
    332     {
    333       "flag": "Cargo cult framing is rhetorical rather than analytical",
    334       "detail": "The central analogy of AI-generated code as cargo cult programming is used as a rhetorical device rather than an analytically precise framework. The paper does not define measurable criteria for what distinguishes cargo cult reuse from legitimate reuse."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    340       "authors": ["Joel Becker", "Nate Rush", "Beth Barnes", "David Rein"],
    341       "year": 2025,
    342       "arxiv_id": "2507.09089",
    343       "relevance": "RCT finding AI tools made experienced OSS developers 19% slower despite their belief they were faster."
    344     },
    345     {
    346       "title": "The Effects of Generative AI on High-Skilled Work: Evidence from Three Field Experiments with Software Developers",
    347       "authors": ["Kevin Zheyuan Cui", "Mert Demirer", "Sonia Jaffe", "Leon Musolff", "Sida Peng", "Tobias Salz"],
    348       "year": 2025,
    349       "relevance": "Three large-scale RCTs at Microsoft/Accenture showing a 26% increase in completed tasks with Copilot."
    350     },
    351     {
    352       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    353       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    354       "year": 2023,
    355       "arxiv_id": "2302.06590",
    356       "relevance": "Early Copilot productivity study reporting 58% faster task completion in lab setting."
    357     },
    358     {
    359       "title": "AI-Driven Development is Here: Should You Worry?",
    360       "authors": ["Neil A Ernst", "Gabriele Bavota"],
    361       "year": 2022,
    362       "relevance": "Early discussion of AI-driven development implications for software engineering practice."
    363     },
    364     {
    365       "title": "Software Reuse in the Generative AI Era: From Cargo Cult Towards Systematic Practices",
    366       "authors": ["Tommi Mikkonen", "Antero Taivalsaari"],
    367       "year": 2025,
    368       "relevance": "Authors' prior work on generative AI reuse practices, direct precursor to this paper."
    369     },
    370     {
    371       "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
    372       "authors": ["Emily M. Bender", "Timnit Gebru", "Angelina McMillan-Major", "Shmargaret Shmitchell"],
    373       "year": 2021,
    374       "relevance": "Foundational critique of LLMs as 'stochastic parrots' with implications for AI-generated code quality."
    375     },
    376     {
    377       "title": "Vibe Coding vs. Agentic Coding: Fundamentals and Practical Implications of Agentic AI",
    378       "authors": ["Ranjan Sapkota", "Konstantinos I. Roumeliotis", "Manoj Karkee"],
    379       "year": 2025,
    380       "arxiv_id": "2505.19443",
    381       "relevance": "Defines and contrasts vibe coding and agentic coding paradigms for AI-assisted development."
    382     },
    383     {
    384       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    385       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"],
    386       "year": 2022,
    387       "relevance": "Study finding no statistically significant task completion time improvements with LLM code generation tools."
    388     },
    389     {
    390       "title": "Generative AI for Code Generation: Software Reuse Implications",
    391       "authors": ["Georgia M Kapitsaki"],
    392       "year": 2024,
    393       "relevance": "Directly examines software reuse implications of generative AI code generation."
    394     },
    395     {
    396       "title": "Imprompter: Tricking LLM Agents into Improper Tool Use",
    397       "authors": ["Xiaohan Fu", "Shuheng Li", "Zihan Wang", "Yihao Liu"],
    398       "year": 2024,
    399       "arxiv_id": "2410.14923",
    400       "relevance": "Security research on exploiting LLM agents' tool-use capabilities for malicious purposes."
    401     },
    402     {
    403       "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study",
    404       "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu"],
    405       "year": 2024,
    406       "arxiv_id": "2305.13860",
    406       "relevance": "Empirical study of prompt-engineering jailbreaks against ChatGPT, relevant to AI code generation security."
    408     },
    409     {
    410       "title": "LLMs Meet Library Evolution: Evaluating Deprecated API Usage in LLM-based Code Completion",
    411       "authors": ["Chong Wang", "Kaifeng Huang", "Jian Zhang"],
    412       "year": 2025,
    413       "arxiv_id": "2406.09834",
    414       "relevance": "Evaluates how LLM code completion handles deprecated APIs — direct evidence of generative reuse problems."
    415     }
    416   ]
    417 }

Impressum · Datenschutz