ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (17067B)


      1 {
      2   "paper": {
      3     "title": "Ten Simple Rules for AI-Assisted Coding in Science",
      4     "authors": ["Eric W. Bridgeford", "Iain Campbell", "Zijiao Chen", "Zhicheng Lin", "Harrison Ritz", "Joachim Vandekerckhove", "Russell A. Poldrack"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.22254",
      8     "doi": "10.48550/arXiv.2510.22254"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides an interactive Jupyter Book at poldracklab.org/10sr_ai_assisted_coding with worked examples, permanently indexed at Zenodo (reference [8], DOI 10.5281/zenodo.17398109)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The Jupyter Book containing all examples is released via Zenodo. As a guidelines paper with no collected data, the accompanying examples constitute the relevant artifacts."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements files, or dependency details are mentioned in the paper for the Jupyter Book examples."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step instructions for reproducing the examples are provided in the paper itself. The Jupyter Book is referenced but no setup or execution instructions are given."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a guidelines paper with no quantitative experiments or statistical results."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No experiments or comparative claims requiring statistical tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No experiments producing effect sizes."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No data collection or sampling involved."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experimental runs to report variance over."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "This is a guidelines paper, not an evaluation paper. There is no system or method being evaluated against baselines."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "No evaluation conducted, so no baselines to assess."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No system with components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No evaluation metrics used."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No system outputs to evaluate."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No datasets or test sets involved."
     92       },
     93       "per_category_breakdown": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No results to break down by category."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses failure modes throughout, including AI generating 'paper tests' that pass without validating logic (Rule 6), context rot causing AI to lose track of details, and agent risks like breaking functionality or introducing vulnerabilities (Discussion section)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper cites negative findings: Becker & Rush (2025) RCT showing AI tools actually slowed experienced developers (Section 1), and Harding et al. (2024) showing increased copy-paste and decreased refactoring with AI assistants."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims are appropriately hedged ('have demonstrated potential', 'raises critical questions') and the paper provides the ten rules as promised. No unsupported empirical claims in the abstract."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper makes no causal claims of its own. It presents guidelines and recommendations, citing others' empirical work. Language like 'AI can be leveraged' is prescriptive, not causal."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper explicitly bounds its scope: 'intended for anyone who develops scientific software that will be used more than once' and 'Our focus is on creating maintainable, reliable software rather than one-off scripts.' The Limitations section also states 'we are operating in a rapidly evolving technological landscape.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper presents both sides of the AI productivity debate, citing positive enterprise studies alongside the Becker & Rush RCT showing negative effects, noting 'productivity effects are far from well-understood, and may vary based on developer experience, task complexity, and codebase characteristics.'"
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "This is a guidelines paper with no measurements. No proxy-outcome gap to address."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "The paper does not run experiments with AI models. It discusses tools generically (GitHub Copilot, ChatGPT, Claude) as context for guidelines."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting experiments conducted."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No experiments using hyperparameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding used in experiments."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No data collection or preprocessing involved."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The Discussion section contains a 'Limitations and Future Directions' subsection with substantive discussion of the rapidly evolving landscape and intentional focus on enduring principles."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The paper discusses specific limitations: the rules 'emerge from our collective experience' (a subjective basis), rapid technology evolution may change which practices are valuable, and the authors 'deliberately avoided prescriptive recommendations tied to specific models, as these would quickly become outdated.'"
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states its scope: 'for anyone who develops scientific software that will be used more than once' and 'Our focus is on creating maintainable, reliable software rather than one-off scripts.' It also states what it does NOT cover: 'broader ethical concerns demand serious consideration' but 'these complex issues merit a dedicated treatment beyond our scope here.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": false,
    183         "answer": false,
    184         "justification": "No data collected. This is a guidelines paper based on collective experience."
    185       },
    186       "data_collection_described": {
    187         "applies": false,
    188         "answer": false,
    189         "justification": "No data collection involved."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No participants recruited."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No data pipeline involved."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgements section states: 'This work was supported by a grant from the Sloan Foundation to RAP (G-2025-25270).'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are listed: Stanford University, Princeton University, University of Science and Technology of China, Yonsei University, University of California Irvine."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The Sloan Foundation is an independent philanthropic foundation with no financial interest in the outcomes of AI coding guidelines."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation on benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation conducted."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a guidelines paper with no method that incurs computational cost."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No computation performed beyond writing the paper."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "AI coding tools have demonstrated measurable productivity gains in some controlled studies, but evidence remains contested and situation-dependent.",
    294       "evidence": "Section 1 cites Peng et al. (2023) and Kalliamvakou et al. (2024) for positive results, and the Becker & Rush (2025) RCT, which found that AI tools slowed experienced developers despite perceived speedup.",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "AI use has led to substantial increases in copy-pasted code and decreases in refactoring.",
    299       "evidence": "Section 1 cites Harding et al. (2024) analyzing ~200 million lines of code showing these trends.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "AI tools are useful for coding tasks but require human guidance for problem framing decisions involving domain expertise.",
    304       "evidence": "Rule 2 argues this conceptually. Supported by experience rather than empirical study within this paper.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "Even when following these rules, flawless start-to-finish AI interactions are the exception rather than the norm.",
    309       "evidence": "Discussion section, stated as based on the authors' collective experience with no systematic data.",
    310       "supported": "weak"
    311     }
    312   ],
    313   "methodology_tags": ["theoretical", "qualitative"],
    314   "key_findings": "This paper proposes ten practical rules for AI-assisted coding in scientific computing, organized around preparation, context management, testing, and code quality. The rules emphasize maintaining human agency and domain expertise while leveraging AI capabilities. The paper highlights contested evidence on AI productivity (citing both positive enterprise studies and a negative RCT with experienced developers) and discusses risks including context rot, placeholder/mock implementations, and declining code quality metrics with AI adoption.",
    315   "red_flags": [
    316     {
    317       "flag": "Experience-based guidelines without systematic evidence",
    318       "detail": "The ten rules 'emerge from our collective experience' with no systematic study, user survey, or controlled evaluation of the rules' effectiveness. The paper is prescriptive without empirical validation of its prescriptions."
    319     },
    320     {
    321       "flag": "Non-systematic selection of evidence",
    322       "detail": "The paper cites evidence both for and against AI productivity gains, so its coverage is balanced, but it does not systematically review the evidence landscape. The supporting citations appear to be chosen ad hoc rather than derived from a structured review."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Evaluating large language models trained on code",
    328       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    329       "year": 2021,
    330       "arxiv_id": "2107.03374",
    331       "relevance": "Foundational paper on LLM code generation capabilities (Codex/HumanEval)."
    332     },
    333     {
    334       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    335       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    336       "year": 2023,
    337       "arxiv_id": "2302.06590",
    338       "relevance": "Key RCT on Copilot productivity impact, widely cited in AI-assisted coding research."
    339     },
    340     {
    341       "title": "Measuring GitHub Copilot's impact on productivity",
    342       "authors": ["Eirini Kalliamvakou", "Albert Ziegler", "X Alice Li"],
    343       "year": 2024,
    344       "relevance": "Enterprise study measuring Copilot productivity, published in CACM."
    345     },
    346     {
    347       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    348       "authors": ["Joel Becker", "Nate Rush"],
    349       "year": 2025,
    350       "arxiv_id": "2507.09089",
    351       "relevance": "RCT finding AI tools slowed experienced developers despite perceived speedup — contradicts productivity narrative."
    352     },
    353     {
    354       "title": "AI Copilot code quality: 2023 data suggests downward pressure on code quality",
    355       "authors": ["Bill Harding", "Matthew Kloster"],
    356       "year": 2024,
    357       "relevance": "Large-scale analysis (~200M lines) showing increased copy-paste and decreased refactoring with AI assistants."
    358     },
    359     {
    360       "title": "Context rot: How increasing input tokens impacts LLM performance",
    361       "year": 2024,
    362       "relevance": "Technical report on context degradation in LLMs, directly relevant to AI-assisted coding limitations."
    363     },
    364     {
    365       "title": "LLMCarbon: Modeling the end-to-end carbon footprint of large language models",
    366       "authors": ["Ahmad Faiz"],
    367       "year": 2024,
    368       "relevance": "Environmental cost assessment of LLMs, relevant to sustainability concerns in AI-assisted development."
    369     },
    370     {
    371       "title": "Attention is all you need",
    372       "authors": ["Ashish Vaswani", "Noam Shazeer"],
    373       "year": 2017,
    374       "relevance": "Foundational transformer architecture paper underlying all LLM-based coding tools."
    375     }
    376   ]
    377 }

Impressum · Datenschutz