ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25831B)


      1 {
      2   "paper": {
      3     "title": "AI-Generated Code Is Not Reproducible (Yet): An Empirical Study of Dependency Gaps in LLM-Based Coding Agents",
      4     "authors": ["Bhanu Prakash Vangala", "Ali Adibifar", "Ashish Gehani", "Tanu Malik"],
      5     "year": 2025,
      6     "venue": "AAAI 2026",
      7     "arxiv_id": "2512.22387"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The authors describe their experimental infrastructure (AWS EC2 instances, SciUnit) and iterative resolution protocol but do not release the code implementing these experiments."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The 300 generated projects, 100 prompts, and dependency analysis results are not released. No download link or data archive is provided. The prompt template is shown in Figure 1 but is only a template with placeholders, not the actual 100 prompts used."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'AWS EC2 instances (t2.large with 4 vCPUs and 16GB RAM running Ubuntu 22.04 LTS)' and SciUnit v0.4.post135, but does not provide a requirements.txt, Dockerfile, or detailed dependency list for reproducing the experimental infrastructure itself."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 describes the iterative resolution protocol conceptually, but there are no scripts, commands, or README explaining how to replicate the full study."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Main results (68.3% overall, 89.2% Python, etc.) are reported as point estimates without confidence intervals or error bars. The runtime multipliers in Equation 17 include ± values (e.g., '12.3 ± 4.2') but these appear to be standard deviations across projects, not confidence intervals on the main reproducibility claims."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims differences between agents (Claude 73% vs Codex 60%) and between languages (Python 89.2% vs Java 44.0%) without any statistical significance tests. No p-values, chi-squared tests, or other tests are reported for these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context: e.g., '68.3% of projects execute out-of-the-box', '13.5× average expansion from declared to actual runtime dependencies', 'Python 89.2% vs Java 44.0%'. Tables 1-6 provide raw counts and percentages enabling readers to assess magnitudes."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample of 100 prompts (40 Python, 35 JavaScript, 25 Java) is not justified by power analysis or formal reasoning. The allocation is described as 'informed by the high prevalence of these languages' but no justification is given for why 100 prompts (or 300 total projects) is sufficient to support the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The runtime multipliers include ± notation (Equation 17: '12.3 ± 4.2' for Python), but the main reproducibility results are single-run numbers with no variance or spread measures. Each prompt-agent pair was tested once; no repeated runs are described."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three LLM agents (Claude Code, OpenAI Codex, Gemini) are compared against each other as baselines. The comparison is structured across agents and languages in Tables 1-3 and the success rate matrix (Equation 15)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The three agents evaluated (Claude Code with Opus 4.1, OpenAI Codex 0.52.0, Gemini 2.5 Pro) are all 2025 state-of-the-art coding agents, representing the most contemporary options available."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed. The paper does not test which components of its methodology matter (e.g., whether prompt wording affects results or whether environment reset procedures matter)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: executable reliability (success rate), completeness gap, runtime multiplier, error type distribution, and per-language/per-agent breakdowns. Tables 1, 2, 4, 5, 6 cover different evaluation dimensions."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the generated code quality is included. The evaluation is entirely automated (does the code execute or not). Human evaluation could have assessed code readability, correctness beyond execution, or documentation quality, which are relevant to the reproducibility claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning study with train/test splits. The paper generates and tests projects; there is no held-out evaluation scenario applicable here."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Detailed per-category breakdowns are provided: by language (Table 2), by agent (Table 1), by agent-language combination (Table 3), by error type (Table 5), and by missing dependency distribution (Table 4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Failure cases are discussed extensively. Table 5 categorizes all 95 failures into Code Bugs (52.6%), Not Processed (16.8%), Other (15.8%), Dependency (10.5%), and Environment (4.2%). Specific examples are given (e.g., 'malformed Maven XML configurations', 'compressed multi-file projects into single files')."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The entire paper is essentially a negative result: AI-generated code is not reproducible. Additionally, the paper reports that the majority of failures are NOT dependency issues as initially hypothesized, but code generation errors (52.6%), which runs counter to the paper's own initial framing."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims are supported: '68.3% of projects execute out-of-the-box' (Table 1/6), 'Python 89.2%, Java 44.0%' (Table 2), '13.5× average expansion' (Table 6). All numerical claims in the abstract match the results section."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims about why languages differ ('Python's success stems from its simple flat dependency structure', 'Java's challenges arise from complex XML configuration') and about agent specializations ('Gemini likely trained heavily on data science notebooks'). These are speculative explanations not supported by the study design, which only observes outcomes without testing causal mechanisms."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper title and abstract make broad claims about 'AI-Generated Code' and 'LLM-Based Coding Agents' generally, but tests only 3 agents on 100 prompts covering specific task types. The conclusion states 'AI coding agents generate code that looks complete but isn't reproducible' as a general claim. The task distribution (web scraping, data analysis, ML pipelines, etc.) is not justified as representative of all development work."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the observed results. For example, the paper does not consider whether prompt design could explain the failure rates, whether the clean-environment assumption is realistic, or whether different prompting strategies could improve results. No threats-to-validity or alternative-explanations section exists."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Model versions are specified: 'Claude Code Agent(Opus.4.1)', 'OpenAI Codex Agent(0.52.0)', and 'Gemini Code Agent(2.5.Pro)' in the Methodology section. These include version identifiers beyond just marketing names."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Figure 1 shows only a prompt TEMPLATE with placeholders like '[Task Description]', '[Environment]', and '[Language]'. The actual 100 task descriptions used are not provided. The reader cannot reconstruct the exact prompts sent to the models."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported for the LLM API calls. Temperature, top-p, max tokens, and other sampling parameters are not mentioned. These settings significantly affect output quality and reproducibility."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper evaluates third-party coding agents (Claude Code, Codex, Gemini) as black boxes. The authors cannot be expected to describe internal scaffolding they have no access to."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data pipeline is documented: 100 prompts designed across 3 languages (40/35/25 split), each sent to 3 agents producing 300 projects, tested in standardized AWS EC2 environments with 91 baseline packages. Algorithm 1 documents the iterative resolution process. The environment reset procedure is formalized in Equations 9-10."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The Discussion section covers implications but does not acknowledge limitations of the study itself."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as: prompt design bias, single-run evaluation (no repeated trials), limited prompt diversity, or whether the clean-environment assumption matches real developer workflows."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show (e.g., that results apply only to these 3 agents, these specific task types, single-run generation without iteration, etc.). The title and conclusion generalize broadly without bounding the claims."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data (the 300 generated projects, dependency analysis outputs, error logs, SciUnit traces) is not made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described in detail: 100 prompts distributed across 3 languages, sent to 3 LLM agents, tested on standardized AWS EC2 instances. The prompt template is shown in Figure 1, the environment baseline is specified (91 packages, Ubuntu 22.04 LTS), and the iterative resolution protocol is formalized in Algorithm 1."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. The study evaluates LLM-generated code, not human subjects. Data sources are standard LLM APIs, not human-derived."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data pipeline is well documented: prompt design → code generation by 3 agents → environment standardization (Equations 9-10) → execution attempt → iterative resolution (Algorithm 1) → SciUnit runtime capture (Equations 11-13) → error classification (Table 5). Each stage is described with specific tools and counts."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: University of Missouri (Vangala, Adibifar, Malik) and SRI International (Gehani). Neither institution is a vendor of the evaluated products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Funding is not disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure is itself a concern."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper. Note that co-author Tanu Malik is associated with SciUnit, which is prominently used and promoted in the paper as a key tool, but this potential conflict is not acknowledged."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This study does not evaluate pre-trained model capability on a benchmark. It tests whether LLM-generated code is reproducible (i.e., whether it runs in a clean environment). The concern is code generation quality, not whether models memorized test answers."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable. The study generates new projects from new prompts; there is no pre-existing benchmark that could overlap with training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable. The study uses original prompts, not a published benchmark. There is no contamination concern as the tasks are novel."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study. It evaluates LLM-generated code execution, not human behavior."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved. IRB approval is not applicable."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or per-project generation costs are reported for running the 300 code generation requests across three commercial LLM agents."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The AWS EC2 instance type is mentioned (t2.large) but total compute budget, number of instance-hours, or total API spend is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Only 68.3% of AI-generated projects execute successfully out-of-the-box without any manual intervention.",
    286       "evidence": "Table 1 and Table 6 report 205 of 300 projects succeeding. The methodology section defines success as execution in a clean environment using only LLM-specified dependencies with zero manual intervention.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Python achieves 89.2% reproducibility while Java achieves only 44.0%.",
    291       "evidence": "Table 2 reports Python 107/120 success (89.2%) and Java 33/75 success (44.0%). Results are broken down by language in Equation 14.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "There is a 13.5x average expansion from declared to actual runtime dependencies.",
    296       "evidence": "Table 6 and the Conclusion report this multiplier. The SciUnit Provenance Analysis section gives a concrete example: 3 claimed dependencies → 52 runtime packages for an ML project (17x). Equation 17 gives per-language multipliers.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "52.6% of failures stem from code generation errors, not dependency issues (only 10.5%).",
    301       "evidence": "Table 5 classifies all 95 failed projects: Code Bugs 50 (52.6%), Not Processed 16 (16.8%), Other 15 (15.8%), Dependency 10 (10.5%), Environment 4 (4.2%).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Each LLM agent has hidden language specializations not advertised by vendors.",
    306       "evidence": "Table 3 and Equation 15 show Gemini achieves 100% Python but only 28% Java; Claude achieves 80% Java vs. Codex at 24%. The Discussion section speculates about training data explanations.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Each failed project requires approximately 15 minutes of manual debugging.",
    311       "evidence": "Stated in the Introduction ('we found takes 15 minutes on average') and Discussion ('Our manual processing averaged 15 minutes per failed project'). No formal timing methodology or variance is reported for this estimate.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "observational"],
    316   "key_findings": "Only 68.3% of 300 AI-generated projects (by Claude Code, Codex, and Gemini) execute out-of-the-box in clean environments, with Python at 89.2% and Java at 44.0%. The majority of failures (52.6%) stem from code generation errors rather than missing dependency declarations (10.5%). A three-layer dependency framework reveals a 13.5x average expansion from declared to runtime dependencies. Agents show distinct, unadvertised language specializations: Gemini achieves perfect Python reproducibility but struggles with Java, while Claude excels at Java where others fail.",
    317   "red_flags": [
    318     {
    319       "flag": "No limitations section",
    320       "detail": "The paper lacks any limitations, threats-to-validity, or scope-bounding section. This is a significant omission for a study making broad claims about 'AI-Generated Code' based on 100 prompts across 3 agents."
    321     },
    322     {
    323       "flag": "No statistical tests for comparative claims",
    324       "detail": "Differences between agents (Claude 73% vs Codex 60%) and languages (Python 89.2% vs Java 44.0%) are stated without any significance tests. With only 25-40 projects per agent-language cell and 100 per agent, some observed differences may not be statistically significant."
    325     },
    326     {
    327       "flag": "Single-run evaluation with no variance reporting",
    328       "detail": "Each prompt-agent pair appears to have been tested only once. LLM outputs are stochastic, so results may vary across runs. No repeated trials, temperature settings, or seed information is provided."
    329     },
    330     {
    331       "flag": "No artifacts released",
    332       "detail": "Despite studying reproducibility, the paper itself is not reproducible: no code, data, prompts, or generated projects are released. This is ironic given the paper's central thesis."
    333     },
    334     {
    335       "flag": "Unbounded generalization",
    336       "detail": "The title and conclusion generalize to 'AI-Generated Code' and 'LLM-Based Coding Agents' broadly, but the study tests only 3 specific agents on 100 prompts of specific task types. No scope boundaries are acknowledged."
    337     },
    338     {
    339       "flag": "Potential undisclosed conflict of interest with SciUnit",
    340       "detail": "Co-author Tanu Malik appears to be associated with the SciUnit tool (cited as 'Ton That, Fils, Yuan, and Malik 2017'), which is prominently featured as a key tool in the methodology. This connection is not disclosed as a potential conflict."
    341     },
    342     {
    343       "flag": "Speculative causal explanations",
    344       "detail": "The Discussion contains speculative causal claims about why agents differ ('Gemini likely trained heavily on data science notebooks', 'Claude appears optimized for enterprise patterns') without evidence. These are plausible hypotheses stated as explanations."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Evaluating Large Language Models Trained on Code",
    350       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    351       "year": 2021,
    352       "arxiv_id": "2107.03374",
    353       "relevance": "Introduces HumanEval benchmark for code generation evaluation, which this paper critiques for ignoring reproducibility."
    354     },
    355     {
    356       "title": "Program synthesis with large language models",
    357       "authors": ["J. Austin"],
    358       "year": 2021,
    359       "arxiv_id": "2108.07732",
    360       "relevance": "Introduces MBPP benchmark for code generation, another benchmark critiqued in this paper for missing reproducibility evaluation."
    361     },
    362     {
    363       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    364       "authors": ["C. E. Jimenez"],
    365       "year": 2024,
    366       "arxiv_id": "2310.06770",
    367       "relevance": "Major LLM code generation benchmark that this paper cites as recognizing environment configuration as a bottleneck."
    368     },
    369     {
    370       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    371       "authors": ["N. Jain", "K. Han", "A. Gu"],
    372       "year": 2024,
    373       "arxiv_id": "2403.07974",
    374       "relevance": "Code generation benchmark addressing contamination but still assuming reproducible environments exist."
    375     },
    376     {
    377       "title": "A reproducibility and generalizability study of large language models for query generation",
    378       "authors": ["M. Staudinger", "W. Kusa", "F. Piroi", "A. Lipani", "A. Hanbury"],
    379       "year": 2024,
    380       "relevance": "Studies LLM output consistency and reproducibility, directly related to the reproducibility crisis in LLM-generated artifacts."
    381     },
    382     {
    383       "title": "Assessing consistency and reproducibility in the outputs of large language models: Evidence across diverse finance and accounting tasks",
    384       "authors": ["J. J. Wang", "V. X. Wang"],
    385       "year": 2025,
    386       "arxiv_id": "2503.16974",
    387       "relevance": "Studies LLM output inconsistency across domains, relevant to understanding reproducibility challenges of AI systems."
    388     },
    389     {
    390       "title": "Hidden Technical Debt in Machine Learning Systems",
    391       "authors": ["D. Sculley", "G. Holt", "D. Golovin"],
    392       "year": 2015,
    393       "relevance": "Seminal paper on technical debt in ML systems, referenced in context of hidden costs of AI-generated code."
    394     },
    395     {
    396       "title": "On Reproducible AI: Towards Reproducible Research, Open Science, and Digital Scholarship in AI Publications",
    397       "authors": ["O. E. Gundersen", "Y. Gil", "D. W. Aha"],
    398       "year": 2018,
    399       "relevance": "Foundational work on AI reproducibility that this paper extends to AI-generated code."
    400     },
    401     {
    402       "title": "Improving Reproducibility in Machine Learning Research (A Report from the NeurIPS 2019 Reproducibility Program)",
    403       "authors": ["J. Pineau", "P. Vincent-Lamarre", "K. Sinha"],
    404       "year": 2021,
    405       "relevance": "Major effort to improve ML reproducibility, directly relevant to the reproducibility methodology assessment in this survey."
    406     },
    407     {
    408       "title": "An empirical comparison of dependency network evolution in seven software packaging ecosystems",
    409       "authors": ["A. Decan", "T. Mens", "P. Grosjean"],
    410       "year": 2019,
    411       "relevance": "Empirical study of software dependency ecosystems, providing context for the dependency complexity findings in this paper."
    412     }
    413   ]
    414 }

Impressum · Datenschutz