scan.json (25348B)
1 { 2 "paper": { 3 "title": "LLM Agents for Generating Microservice-based Applications: How Complex is Your Specification?", 4 "authors": ["Daniel M. Yellin"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.20119" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "LLM agents using GPT-o3-mini perform well on medium-difficulty microservice specifications (83-94% unit test pass rate) but poorly on complex ones with authentication, external APIs, and intricate business logic (10-65%). Fine-grained code generation (per-request rather than per-service) significantly improves correctness. Reflection/regeneration helps weaker models (GPT-3.5) but has limited benefit for stronger models. Key error types include incorrect package usage, superficial API understanding, datatype conversion failures, and disregarding specification details.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub repo URL: https://github.com/LLMs4code/LLMs4MSBA (footnote 2, Section 3.1) containing specifications and unit tests." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The specifications and unit tests are released, but the raw experimental results (generated code, error logs, per-run test outcomes) are not mentioned as released." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No requirements.txt, Dockerfile for the experimental harness, or detailed environment specification is provided in the paper. Docker is used for running generated code, but the full experimental environment is not specified." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "While the agentic workflow is described (Figure 2) and the GitHub repo has specs and tests, no step-by-step instructions for reproducing the experiments are provided in the paper." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Results are reported as averages (% passed) across experiments with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims GPT-o3-mini outperforms GPT-3.5 and that fine-grained generation improves results, but no statistical significance tests are performed." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Results are reported as percentage of unit tests passed with clear baselines (e.g., 'average passing percent across all 4 request types for V0 and V1 code is 48.5% and 77.8% respectively... significantly better than 23.6% and 25.8% for coarse-grained generation')." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "15-20 experiments per condition are used with no justification for why this number was chosen and no power analysis." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Only averages and highest scores are reported. No standard deviations, IQR, or other spread measures across the 15-20 runs." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Three LLMs are compared (GPT-3.5, GPT-4o-mini, GPT-o3-mini) and 0-shot vs 1-shot prompts, plus coarse-grained vs fine-grained generation." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "GPT-3.5 is dated. 
More importantly, no non-OpenAI models (Claude, Gemini, DeepSeek, open-source models) are included despite being relevant contemporary baselines for code generation." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The fine-grained experiments (Section 6) serve as an ablation, generating code per-request-type to isolate the effect of granularity on code correctness." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics are reported: % unit tests passed (V0 and V1), highest number of tests passed, % V1 improved, fraction of testable code, and per-request-type breakdowns." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "The authors manually examine error logs, generated code, and GPT-o3-mini reflection results (Section 5.7: 'we examined the run-time error-logs, the GPT-o3-mini reflection results, as well as the generated code')." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The MSBA specifications were custom-created (Section 3.1) specifically to avoid contamination. Unit tests were developed against ground truth implementations, not tuned to LLM outputs." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per microservice (Tables 1-6), per request type (Tables 8-9), per LLM, per prompt type (0-shot vs 1-shot), and per temperature (Table 10)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 5.7 provides detailed analysis of failure types: incorrect package usage, superficial API understanding, datatype conversion failures, and disregarding specification details, with specific code examples." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Several negative results: 1-shot prompts degrade GPT-o3-mini performance, reflection hurts GPT-o3-mini V1 scores, providing explicit API details did not fix Dishes test failures due to other issues (Section 5.7)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about strong LLMs doing 'fairly well on medium difficulty' and 'poorly on higher difficulty' are supported by Tables 1-6. Fine-grained improvement claim is supported by Tables 8-9." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper claims fine-grained generation 'improves' correctness but the fine-grained prompts include additional information (authentication wrapper, import statements, Mongo collection name) acknowledged in Section 6, confounding the comparison." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The abstract says 'agents using strong LLMs (like GPT-3o-mini)' generalizing from 2 applications with 8 microservices total, tested on only OpenAI models. The title frames this as about 'LLM Agents' broadly." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No discussion of alternative explanations for the results. For example, the poor restaurant performance could be due to prompt quality, test difficulty distribution, or other factors beyond specification complexity." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper uses unit test pass rate as a proxy for 'how well LLM Agents perform' without discussing limitations of this proxy. 
Not all unit tests are equal (acknowledged briefly in Section 5.3.2) but the gap between test pass rate and actual code correctness/quality is not addressed." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are referred to as 'GPT-3.5-16K', 'GPT-4o-mini', 'GPT-o3-mini' without specific version IDs or snapshot dates." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompts are provided in Appendix A (code generation, reflection, regeneration prompts in Figures 3-5), Appendix C (1-shot example in Figure 7), and Appendix D (fine-grained prompt in Figure 8)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Temperature settings are reported: 0.0, 0.3, 0.5 for GPT-3.5 and GPT-4o-mini; default for GPT-o3-mini. MaxGen=2 is stated." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The LangGraph-based agentic workflow is described in detail in Section 5.1 with Figure 2 showing the full workflow: Init Agent → Code Agent → Docker Test Agent → Evaluate Agent → Regen Agent." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 5.2 describes package installation handling (omitting stdlib packages, substituting package names). Section 5.1 describes error message truncation for context window limitations." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The conclusion (Section 8) briefly restates findings but does not discuss limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed. 
Issues like small number of applications (2), limited model selection (only OpenAI), and custom-built benchmarks are not addressed as limitations." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. No discussion of limitations in generalizing from 2 MSBAs to the broader domain." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "Raw experimental data (generated code, error logs, per-run test outcomes) is not released. Only aggregate statistics are reported in tables." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.1 describes how MSBA specifications were created (iterative process with GT implementation and testing). Section 5.1 describes the experimental methodology." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The study evaluates LLM-generated code on custom benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The full pipeline from specification creation (Section 3.1) through code generation, testing in Docker containers, error collection, reflection, and regeneration (Section 5.1, Figure 2) is documented." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information is provided anywhere in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliation email (post.runi.ac.il, Reichman University) is provided." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 
218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is provided." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training cutoff dates are stated for any of the models used (GPT-3.5, GPT-4o-mini, GPT-o3-mini)." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": true, 234 "justification": "Section 3.1: 'First we want to make sure that the LLMs have not been contaminated - have not seen these MSes before.' Custom specifications were created specifically to avoid contamination." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": true, 239 "justification": "The benchmark was custom-created (Section 3.1) specifically to avoid contamination. The specifications were novel and not publicly available before the experiments." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No API costs, tokens consumed, or wall-clock time are reported despite running hundreds of LLM experiments across three models." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total compute budget or API spend is stated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Results are averaged across 15-20 runs but no seed sensitivity analysis is reported. Temperature variation is explored (Appendix E) but this is not the same as random seed sensitivity." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Number of experiments is clearly stated: 15 for GPT-3.5 and GPT-o3-mini (Section 5.3), 15-20 for restaurant experiments (Section 5.4), 12-13 for fine-grained experiments (Tables 8-9)." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "Three temperature values are tested but no systematic hyperparameter search is described for prompts or other settings." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "All configurations and their results are reported (all temperatures, all models, all prompt types) rather than cherry-picking a best configuration." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied, even though many comparisons are made without any statistical framework." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The author built the specifications, ground truth implementations, and unit tests.
No acknowledgment that this could bias the evaluation (e.g., tests might reflect the author's implementation assumptions)." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "GPT-o3-mini is a reasoning model likely using significantly more compute than GPT-3.5, but compute costs are never compared." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether unit test pass rate on 2 custom MSBAs validly measures 'how well LLM Agents perform' on real-world microservice generation." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "The same LangGraph scaffold is used across all model comparisons (Section 5.1), so the scaffold is controlled." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "Custom-created specifications (Section 3.1) eliminate temporal leakage since benchmarks did not exist before the experiments." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup provides information not available in real usage. The 1-shot prompt and fine-grained prompts provide additional context that could be considered feature leakage in some framings." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The 8 microservices come from only 2 applications, and results within an application are not independent (shared specifications, shared dependencies). This is not discussed." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No formal leakage detection method is used. Custom benchmarks reduce contamination risk but no verification method is applied." 
358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "GPT-o3-mini performs well on medium-difficulty MSBA specifications, averaging 83.5-94.4% unit test pass rate on the library application.", 364 "evidence": "Table 3 (Section 5.3.1) shows V0 pass rates of 83.5-94.4% across library microservices.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Performance degrades significantly on the more complex restaurant application, with GPT-o3-mini achieving only 10.8-65.8% V0 pass rates.", 369 "evidence": "Table 6 (Section 5.5) shows V0 pass rates ranging from 10.8% (Ratings) to 65.8% (Authentication).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Fine-grained code generation (per-request) improves correctness over coarse-grained (per-service) generation.", 374 "evidence": "Tables 8-9 (Section 6): Dishes V0/V1 improve from 23.6%/25.8% to 48.5%/77.8%; Ratings from 10.8%/39.2% to 33.3%/63.2%.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Reflection/regeneration is effective for weaker models (GPT-3.5) but much less so for GPT-o3-mini.", 379 "evidence": "Tables 1-2 show GPT-3.5 V1 improvement (e.g., 26.4%→50.2% for Cardholders). Tables 3-4 show GPT-o3-mini V1 often degrades (e.g., 94.4%→0.0% for Cardholders).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "1-shot prompting improves GPT-3.5 performance but degrades GPT-o3-mini performance.", 384 "evidence": "Comparing Tables 1 vs 2 (GPT-3.5 improves with 1-shot) and Tables 3 vs 4 (GPT-o3-mini mostly degrades with 1-shot).", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Very small benchmark", 391 "detail": "Only 2 applications with 8 total microservices. Conclusions about specification complexity and LLM performance are drawn from an extremely limited sample." 392 }, 393 { 394 "flag": "No statistical testing", 395 "detail": "All comparisons are based on raw percentages with no significance tests, confidence intervals, or variance measures. 
With only 15-20 runs, differences could easily be due to chance." 396 }, 397 { 398 "flag": "Confounded fine-grained comparison", 399 "detail": "The fine-grained experiments provide additional information (authentication wrapper, import statements, Mongo collection name) not in the coarse-grained prompts. The paper acknowledges this ('tempered by the knowledge that the fine-grained code is easier') but still claims improvement from the approach." 400 }, 401 { 402 "flag": "No limitations section", 403 "detail": "The paper has no dedicated limitations or threats-to-validity section despite significant scope limitations (2 apps, 3 OpenAI models, custom benchmarks, single author creating specs/GT/tests)." 404 }, 405 { 406 "flag": "Only OpenAI models tested", 407 "detail": "Only OpenAI models (GPT-3.5, GPT-4o-mini, GPT-o3-mini) are evaluated. No open-source models, Claude, Gemini, or other providers are tested despite claims about 'LLM Agents' broadly." 408 }, 409 { 410 "flag": "Model naming error in abstract", 411 "detail": "The abstract refers to 'GPT-3o-mini' (likely meaning GPT-o3-mini), and the conclusion also says 'GPT-3o-mini'. This inconsistency raises questions about care in manuscript preparation." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "Evaluating large language models trained on code", 417 "authors": ["M. Chen"], 418 "year": 2021, 419 "relevance": "HumanEval benchmark paper, foundational for LLM code generation evaluation." 420 }, 421 { 422 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 423 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"], 424 "year": 2023, 425 "arxiv_id": "2310.06770", 426 "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks." 427 }, 428 { 429 "title": "Program synthesis with large language models", 430 "authors": ["J. Austin", "A. Odena", "M. I. 
Nye"], 431 "year": 2021, 432 "relevance": "MBPP benchmark for LLM code generation evaluation." 433 }, 434 { 435 "title": "What's wrong with your code generated by large language models? an extensive study", 436 "authors": ["S. Dou", "H. Jia", "S. Wu"], 437 "year": 2024, 438 "relevance": "Extensive study of errors in LLM-generated code, directly relevant to understanding code generation quality." 439 }, 440 { 441 "title": "We have a package for you! A comprehensive analysis of package hallucinations by code generating LLMs", 442 "authors": ["J. Spracklen", "R. Wijewickrama"], 443 "year": 2024, 444 "relevance": "Studies package hallucination in LLM code generation, a key error type found in this paper." 445 }, 446 { 447 "title": "Reflexion: language agents with verbal reinforcement learning", 448 "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"], 449 "year": 2023, 450 "relevance": "Reflection technique used in this paper's agentic workflow for code regeneration." 451 }, 452 { 453 "title": "OpenHands: An open platform for AI software developers as generalist agents", 454 "authors": ["X. Wang", "B. Li", "Y. Song"], 455 "year": 2025, 456 "relevance": "Open platform for AI coding agents, relevant to agentic software engineering." 457 }, 458 { 459 "title": "Agentless: Demystifying LLM-based software engineering agents", 460 "authors": ["C. S. Xia", "Y. Deng", "S. Dunn", "L. Zhang"], 461 "year": 2024, 462 "arxiv_id": "2407.01489", 463 "relevance": "Alternative non-agentic approach to LLM-based software engineering." 464 }, 465 { 466 "title": "GitChameleon: Unmasking the version-switching capabilities of code generation models", 467 "authors": ["N. Islah", "J. Gehring", "D. Misra"], 468 "year": 2024, 469 "arxiv_id": "2411.05830", 470 "relevance": "Studies LLM capability to handle different package versions, relevant to dependency management challenges." 
471 }, 472 { 473 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 474 "authors": ["DeepSeek-AI"], 475 "year": 2025, 476 "relevance": "Reasoning model referenced for few-shot sensitivity observation relevant to prompt engineering." 477 }, 478 { 479 "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models", 480 "authors": ["H. Yu", "B. Shen", "D. Ran"], 481 "year": 2024, 482 "relevance": "Defines runnable levels for code generation complexity, related to this paper's complexity metric." 483 }, 484 { 485 "title": "DI-BENCH: Benchmarking large language models on dependency inference with testable repositories at scale", 486 "authors": ["L. Zhang", "J. Wang", "S. He"], 487 "year": 2025, 488 "relevance": "Benchmarks LLM dependency inference, directly related to the dependency challenges identified in this paper." 489 } 490 ] 491 }