scan.json (31341B)
1 { 2 "paper": { 3 "title": "On Simulation-Guided LLM-based Code Generation for Safe Autonomous Driving Software", 4 "authors": [ 5 "Ali Nouri", 6 "Johan Andersson", 7 "Kailash De Jesus Hornig", 8 "Zhennan Fei", 9 "Emil Knabe", 10 "Håkan Sivencrona", 11 "Beatriz Cabrero-Daniel", 12 "Christian Berger" 13 ], 14 "year": 2025, 15 "venue": "EASE 2025 (International Conference on Evaluation & Assessment in Software Engineering)", 16 "arxiv_id": "2504.02141", 17 "doi": "10.1145/3756681.3756987" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "Zenodo links are provided for the interview protocol (doi:10.5281/zenodo.14783284) and sample generated codes/reports (doi:10.5281/zenodo.14783374), but the pipeline source code itself is not released. Only supplementary materials are available." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "Sample generated codes and reports for one iteration (t13) are on Zenodo, and a YouTube demo is available. However, the full experimental data (all 40 code versions, all test reports, simulation scenarios, interview transcripts) is not released." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions Python, esmini simulator, and specific Ollama model IDs in footnote 2 (e.g., deepseek-coder:33b ID: acec7c0b0fd9), but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided. The pipeline architecture is described conceptually but not with enough operational detail (commands, configuration files) to reproduce." 
42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Results are reported as raw counts (e.g., 6 out of 20 successful for ACC, 1 out of 20 for CAEM) and percentages with no confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper compares models on success rates (e.g., GPT-4 vs Mistral:7b vs CodeGemma:7b) without any statistical significance tests. Claims of superiority are based solely on comparing raw counts." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper reports raw percentages (e.g., '9.2% improvement' for corrections, success rate from 30% to 35% for ACC) but no formal effect sizes (Cohen's d, odds ratios, etc.). The improvement percentages are raw differences without statistical context." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "N=20 pipeline initiations and N=11 interviewees. No justification for why 20 initiations were chosen, no power analysis, and no discussion of whether these sample sizes are adequate for the claims made." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance, standard deviation, or spread measures are reported across runs. Each of the 20 initiations is shown individually in Fig 4 but no aggregate variability measures are computed." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Multiple LLM models are compared: GPT-4, Codellama:34b, DeepSeek-r1:32b, DeepSeek-Coder:33b, CodeGemma:7b, and Mistral:7b (Fig 5). Initial code generation vs correction-loop versions are also compared." 
76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "The models used (GPT-4, DeepSeek, CodeGemma, Mistral, Codellama) were contemporary at the time of the study. The paper was submitted January 2025." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The comparison between initial code generation (Specification Prompt only) vs the correction loop (with simulation feedback) effectively ablates the feedback mechanism. Results show 9.2% average improvement with the correction loop (Sec 7.1)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple evaluation criteria are used: compilability (syntax errors), number of test cases passed (TC1-TC7), expert ratings (useful/very useful), and qualitative analysis of failure modes." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "An interview study with 11 experts from 2 OEMs evaluated the prototype (Sec 7.3). Experts watched a pre-recorded demo and rated usefulness, strengths, weaknesses, and proposed improvements." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": false, 100 "justification": "The same 7 test cases (TC1-TC7) used to provide feedback to the LLM during the correction loop are also used as the final evaluation criteria. There is no separate held-out evaluation set." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down per model (Fig 5), per function (ACC vs CAEM), per iteration (Fig 4), and by outcome type (non-compilable, failed, corrected, successful)." 
106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Failure cases are discussed in detail: 3 instances of non-executable code, 3 runtime errors (Sec 7.1), regression during correction (Fig 4), complete failure of open-source models on CAEM (Sec 7.2), and token limitation effects on correction loops." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Multiple negative results: all open-source models failed for CAEM, correction loop sometimes causes regression, larger models don't necessarily outperform smaller ones (Mistral:7b > Codellama:34b), token limitations prevented open-source model corrections." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims are supported: they developed and evaluated a prototype (Sec 6-7), report experimental results for multiple models (Fig 5), and assessed with 11 experts (Sec 7.3). No overclaiming in the abstract." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The main causal claim — that the feedback loop improves code quality — is supported by a controlled comparison: same model, same prompts, with vs without correction. The 9.2% improvement and specific examples (C25→C26 in t13) demonstrate the mechanism." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Sec 7.5 (External Validity) explicitly bounds the results: 'designed with specific components such as GPT-4 for code generation, esmini for simulation, and for generating Python code for ADS.' The paper acknowledges adaptability claims are aspirational." 
133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "Sec 7.5 discusses alternatives: LLM stochastic behavior causing variability, potential data leakage for ACC vs CAEM, simulation model inaccuracy. Sec 7.4 discusses token limitations as explanation for open-source model failures in correction loops." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper measures pass/fail on 7 simulated test scenarios and frames this as 'safety evaluation' and 'safety-related KPIs.' The gap between passing 7 predefined scenarios in esmini and actual safety of ADS code is not explicitly discussed. The paper acknowledges simulation limitations but does not distinguish the proxy (scenario pass rate) from the claimed outcome (safety)." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "Open-source models have Ollama hash IDs in footnote 2 (e.g., deepseek-coder:33b ID: acec7c0b0fd9), but GPT-4 — the primary model — is specified only as 'GPT-4' without a version, snapshot date, or API version." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": false, 154 "justification": "The Correction Prompt template is shown in Fig 2 with structural placeholders. The Specification Prompt structure is described (Context + Task Description) in Sec 6.1. But the actual prompt text is not provided — only descriptions of what each section contains." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "No temperature, top-p, max tokens, or other API/inference parameters are reported for any model. These significantly affect LLM output quality and reproducibility." 
160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The pipeline architecture is well-described: LLM code generation → esmini simulation → rule-based report generator → natural language feedback → correction prompt (Fig 2). The baseline selection strategy, fresh-start mechanism, and iteration logic are documented (Fig 3, Sec 6.2)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "The data pipeline is documented: simulation generates tabular logs of positions/velocities → report generator translates to natural language → pass/fail assessment against acceptance criteria (Sec 6.2). Test case specifications are detailed in Sec 4.4." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Sec 7.5 'Threats to Validity' provides substantive discussion organized into Construction Validity, Internal Validity, and External Validity." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Threats are specific to this study: simulation model inaccuracy for ADS (construction), LLM stochastic behavior mitigated by 20 initiations (internal), CAEM chosen to minimize data leakage (internal), specific components limiting generalizability (external)." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper explicitly bounds scope: prototype is for 'concept phase and function abstraction level' (Sec 4.1), not production code. External validity (Sec 7.5) states the specific components. 
The disclaimer notes the work is 'not used in any engineering of production related projects.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "Only a sample of generated codes and reports for iteration t13 is available via Zenodo (doi:10.5281/zenodo.14783374). The full set of 40 code versions, test reports, and simulation logs is not released." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "The experiment procedure is clearly described: 20 pipeline initiations per model per function, 2 versions per initiation (initial + correction), evaluation against 7 test cases, expert interview protocol (Sec 3.3-3.4, Sec 7.1)." 199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Interview participants: '11 experts selected from 2 OEMs,' minimum 5 years ADS experience, average 12 years, roles listed (manager, product owners, V&V engineers, etc.), from 3 countries (Sec 3.4). Selection criteria are stated." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The full pipeline is documented: 20 initiations → 40 versions → 6 non-executable filtered out → 14 initial + 14 enhanced versions analyzed (Sec 7.1). Filtering at each stage is explained with counts." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Acknowledgments section: 'partially supported by Sweden's Innovation Agency (Vinnova, diarienummer: 2021-02585), and by the Wallenberg AI Autonomous Systems and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation.'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations clearly listed: 5 authors from Volvo Cars and/or Chalmers University, 2 from University of Gothenburg. Volvo Cars is identified as the case company." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Funders are Vinnova (Swedish government innovation agency) and WASP (academic foundation). Neither has a financial stake in the performance of specific LLM models or the pipeline." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is included. Authors from Volvo Cars are evaluating a prototype relevant to Volvo's business, but this potential conflict is not explicitly declared beyond affiliation listing." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff date is stated for any of the models used (GPT-4, Codellama, DeepSeek, CodeGemma, Mistral). The paper discusses leakage conceptually but not with temporal specifics." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": true, 242 "justification": "Sec 4.2 explicitly discusses this: 'AEB and ACC are mature, there is plenty of data in public databases, which increases the probability of the LLM being trained on them, posing a threat to validity.' 
They chose CAEM specifically because it is 'immature with respect to existing publicly available data.'" 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": true, 247 "justification": "Sec 2 discusses benchmark leakage risk. Sec 4.2 addresses it by selecting CAEM as a novel function to minimize leakage. Sec 7.4 interprets the ACC vs CAEM performance gap partly through this lens: 'models are specifically optimised to pass these known tasks and benchmarks rather than to solve truly novel tasks.'" 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": true, 253 "answer": false, 254 "justification": "No pre-registration is mentioned for the interview study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": true, 258 "answer": false, 259 "justification": "The paper states that 'ethical guidelines for software engineering interview studies, such as obtaining consent, anonymization, and confidentiality, as proposed by Strandberg [27] were followed,' but no IRB or ethics board approval is mentioned." 260 }, 261 "demographics_reported": { 262 "applies": true, 263 "answer": true, 264 "justification": "Participant demographics are reported: roles (manager, product owners, V&V engineers, data scientist, system architect, programmer, system safety experts), average 12 years experience, 3 countries, 2 OEMs (Sec 3.4)." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": true, 268 "answer": true, 269 "justification": "Sec 3.4: 'To be eligible, the participants had to have at least 5 years of experience in ADS development.'" 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "This is a qualitative interview study, not an experimental study with treatment/control conditions. Randomization is not applicable." 
275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "This is a qualitative interview study where all participants view the same demo. Blinding is not applicable." 280 }, 281 "attrition_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "Sec 3.4 mentions '12 experts from two OEMs' but the Participants subsection and all results refer to '11 experts.' This discrepancy (12 invited vs 11 participated?) is not explained." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": true, 291 "justification": "Sec 7.4: 'Each full execution of the pipeline took less than 3 minutes from the initiation to delivering both the initial and enhanced versions, including simulations and report generation. This means that 20 full executions of the pipeline required less than an hour.'" 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Only wall-clock time is reported. No hardware specifications (GPU, CPU, RAM), API costs, or total compute budget are stated. For open-source models running locally, the hardware used is not specified." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No random seeds are set or reported. The 20 independent pipeline runs capture LLM stochasticity but this is not framed as seed sensitivity analysis, and no aggregate variability measures are computed." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": true, 308 "justification": "Sec 7.1: 'the pipeline was initiated 20 times, resulting in 40 versions of the code for the CAEM function using GPT-4.' Sec 7.2 states 20 codes per model per function." 
309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Prompts were iteratively refined during design cycles but no systematic hyperparameter search is reported. No temperature, prompt variation, or configuration search budget is stated." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "All 40 versions for GPT-4/CAEM are shown in Fig 4. All models' results are reported in Fig 5. The baseline selection strategy is transparent (best-performing code at each step, Sec 6.2, Fig 3). No cherry-picking — all configurations are reported." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed at all, so correction for multiple comparisons does not apply. The absence of statistical tests is already captured in significance_tests." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors designed and evaluate their own pipeline. The interview study partially mitigates this with external expert evaluation, but the authors do not acknowledge author-evaluation bias for the experimental results." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Models range from 7b to 34b parameters plus GPT-4. The paper notes 'more parameters do not necessarily lead to better results' but does not compare performance at matched compute budgets or report per-model compute costs." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": true, 338 "justification": "Sec 2 and 4.2 explicitly discuss construct validity of standard benchmarks: 'benchmark leakage creates the risk of assessing the model's memory rather than its ability to handle unseen tasks.' They propose novel evaluation tasks as an alternative." 
339 }, 340 "scaffold_confound_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "All models are evaluated using the same pipeline, same prompts, same simulation environment, and same test cases. The scaffold is controlled across model comparisons. Differences in correction loop effectiveness are attributed to token limitations (Sec 7.4)." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "Sec 4.2 discusses that ACC/AEB have 'plenty of data in public databases' likely in LLM training data, while CAEM is 'immature with respect to existing publicly available data.' The choice of CAEM as the primary evaluation function directly addresses temporal leakage." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "The pipeline provides simulation feedback to the LLM during correction loops. While this is by design, the paper does not discuss whether this feedback provides information that would not be available in real deployment or whether it constitutes feature leakage." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "The 20 pipeline runs per model are treated as independent samples but there is no discussion of whether they are truly independent (same model weights, similar stochastic patterns, shared prompt structure)." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": true, 365 "justification": "The study uses a concrete prevention method: selecting CAEM as a novel function with minimal public data to avoid training data contamination, then comparing performance against ACC (known function with more public data) to detect leakage effects (Sec 4.2, 7.4)." 
366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "The simulation-based feedback loop improves generated code, with an average 9.2% improvement in passed test cases across all corrections and 37% considering only successful improvements.", 372 "evidence": "Sec 7.1: 14 initial generations and 14 enhanced versions analyzed. 5 of 14 corrections showed improvement. For ACC, success rate increased from 30% (6/20) to 35% (5/14). The gold baseline (all 7 TCs passed) was achieved through correction in iteration t13.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "GPT-4 significantly outperforms all tested open-source models, being the only model to produce successful code for both ACC (11 successful) and CAEM (2 successful).", 377 "evidence": "Fig 5 and Sec 7.2: GPT-4 produced 11 successful codes for ACC and 2 for CAEM. Open-source models produced at most 3 (Mistral for ACC) and 0 for CAEM. All GPT-4 generated code compiled, unlike open-source models with many syntax errors.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "More parameters in an LLM do not necessarily lead to better code generation: Mistral:7B and CodeGemma:7B outperform CodeLlama:34B and DeepSeek-r1:32B.", 382 "evidence": "Fig 5: Mistral:7b produced 3 successful ACC codes, CodeGemma:7b produced 1, while CodeLlama:34b and DeepSeek models produced 0. CodeLlama had the highest non-compilable count (26).", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Standard benchmarks may overstate model performance due to benchmark leakage — models perform much worse on novel tasks.", 387 "evidence": "Sec 4.2 and 7.4: ACC (mature, more public data) had much higher success rates than CAEM (novel function). Among open-source models, only Mistral:7b and CodeGemma:7b produced any successful ACC code, and none succeeded on CAEM. 
However, this is confounded with task complexity.", 388 "supported": "weak" 389 }, 390 { 391 "claim": "The prototype is useful for industrial ADS development, particularly for prototyping and concept-phase work.", 392 "evidence": "Sec 7.3: 7/11 experts rated 'very useful,' 4 rated 'useful.' Cited benefits include reduced workload (8 experts), prototyping aid (3 experts), automation (3 experts), and visualization (1 expert).", 393 "supported": "moderate" 394 } 395 ], 396 "methodology_tags": ["benchmark-eval", "case-study", "qualitative"], 397 "key_findings": "This paper develops and evaluates a simulation-guided LLM code generation pipeline for autonomous driving software (ACC and CAEM functions). GPT-4 is the only model that successfully generates code for the novel CAEM function, while open-source models (Codellama:34b, DeepSeek, CodeGemma:7b, Mistral:7b) largely fail. The simulation-based correction feedback loop provides a modest 9.2% average improvement in passed safety test cases. An interview study with 11 industry experts validates the tool's usefulness for concept-phase prototyping, while emphasizing the need for human oversight before production deployment.", 398 "red_flags": [ 399 { 400 "flag": "No statistical tests for comparative claims", 401 "detail": "The paper compares 6 models across 2 functions making claims of superiority (GPT-4 best, smaller models outperform larger ones) based solely on raw counts from N=20 runs with no significance tests, confidence intervals, or variance measures." 402 }, 403 { 404 "flag": "Small sample sizes", 405 "detail": "N=20 pipeline initiations per model per function and N=11 interviewees. With high LLM output variability and only 2 successful CAEM codes from GPT-4 (out of 40 versions), the sample is too small for robust conclusions about model capabilities." 
406 }, 407 { 408 "flag": "Confounded comparison between ACC and CAEM", 409 "detail": "The claim that benchmark leakage explains the ACC vs CAEM performance gap is confounded with task complexity — CAEM is explicitly described as more complex than ACC (multi-agent evasive maneuvers vs cruise control). The paper cannot separate leakage from complexity effects." 410 }, 411 { 412 "flag": "Potential social desirability bias in expert interviews", 413 "detail": "Authors affiliated with Volvo Cars (the case company) interview experts from two OEMs about a tool the authors themselves developed. All 11 experts rated the tool as useful or very useful, which may reflect social desirability rather than genuine assessment." 414 }, 415 { 416 "flag": "Unexplained participant count discrepancy", 417 "detail": "Sec 3.4 states '12 experts from two OEMs are interviewed' but the Participants subsection and all subsequent analysis refer to 11 experts. The discrepancy is not explained (dropout? pilot participant?)." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence", 423 "authors": ["D. Guo", "Q. Zhu", "D. Yang"], 424 "year": 2024, 425 "arxiv_id": "2401.14196", 426 "relevance": "LLM code generation capability evaluation, one of the models tested in this study." 427 }, 428 { 429 "title": "Evaluating large language models trained on code", 430 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 431 "year": 2021, 432 "relevance": "Foundational Codex/HumanEval paper on LLM code generation benchmarking and IP/security concerns." 433 }, 434 { 435 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 436 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 437 "year": 2023, 438 "arxiv_id": "2305.01210", 439 "relevance": "Rigorous evaluation of LLM code correctness, relevant to methodology quality in code generation studies." 
440 }, 441 { 442 "title": "An Empirical Study of the Code Generation of Safety-Critical Software Using LLMs", 443 "authors": ["M. Liu", "J. Wang", "T. Lin"], 444 "year": 2024, 445 "relevance": "Directly relevant prior work on LLM code generation for safety-critical automotive software with prompt engineering techniques." 446 }, 447 { 448 "title": "Fully autonomous programming with large language models", 449 "authors": ["V. Liventsev", "A. Grishina", "A. Härmä", "L. Moonen"], 450 "year": 2023, 451 "relevance": "Iterative Synthesis, Execution, and Debugging (SED) approach for LLM programming, key precursor to the correction loop concept." 452 }, 453 { 454 "title": "Don't make your LLM an evaluation benchmark cheater", 455 "authors": ["K. Zhou", "Y. Zhu", "Z. Chen"], 456 "year": 2023, 457 "arxiv_id": "2311.01964", 458 "relevance": "Benchmark leakage in LLM evaluation, directly cited to motivate the novel-task evaluation approach." 459 }, 460 { 461 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 462 "authors": ["N. Nguyen", "S. Nadi"], 463 "year": 2022, 464 "relevance": "Empirical evaluation of AI code generation quality across programming languages, provides the Python baseline (42%) cited in this paper." 465 }, 466 { 467 "title": "SWE-bench: Can language models resolve real-world github issues?", 468 "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"], 469 "year": 2024, 470 "relevance": "Major LLM code generation benchmark, cited regarding readability and comprehensiveness of LLM-generated code." 471 }, 472 { 473 "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges", 474 "authors": ["J. T. Liang", "C. Yang", "B. A. Myers"], 475 "year": 2023, 476 "arxiv_id": "2303.17125", 477 "relevance": "Large-scale survey on AI programming assistant usability, cited for LLM code generation limitations and IP concerns." 
478 }, 479 { 480 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 481 "authors": ["N. Shinn", "F. Cassano", "E. Berman"], 482 "year": 2023, 483 "arxiv_id": "2303.11366", 484 "relevance": "Key prior work on iterative LLM self-improvement through verbal feedback, foundational to the correction loop concept." 485 }, 486 { 487 "title": "Langprop: A code optimization framework using large language models applied to driving", 488 "authors": ["S. Ishida", "G. Corrado", "G. Fedoseev"], 489 "year": 2024, 490 "relevance": "LLM code optimization applied specifically to driving tasks, most closely related prior work in the autonomous driving domain." 491 }, 492 { 493 "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis", 494 "authors": ["E. Nijkamp", "B. Pang", "H. Hayashi"], 495 "year": 2023, 496 "arxiv_id": "2203.13474", 497 "relevance": "Multi-turn code generation and task decomposition approach for LLM programming." 498 } 499 ] 500 }