scan.json (28864B)
1 { 2 "paper": { 3 "title": "Test-Driven Development for Code Generation", 4 "authors": ["Noble Saji Mathews", "Meiyappan Nagappan"], 5 "year": 2024, 6 "venue": "International Conference on Automated Software Engineering", 7 "arxiv_id": "2402.13521", 8 "doi": "10.1145/3691620.3695527" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Providing test cases alongside problem statements to LLMs improves code generation correctness by 9.15–29.57 percentage points on MBPP and HumanEval and by 3.09 points on a curated CodeChef dataset, with less performant models (Llama 3) benefiting more than stronger ones (GPT-4). Remediation loops that feed test failures back to the LLM add a further 4.18–9.02 percentage points. Problem difficulty inversely correlates with TDD effectiveness, and solutions generated with test information also perform well against unseen private tests, suggesting the improvement is not mere test overfitting.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A replication package is provided at OSF (https://osf.io/e3jy6/) which includes 'code scripts for the experiments' as stated in Section 3.2." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The primary datasets (MBPP, HumanEval via EvalPlus) are publicly available. The CodeChef dataset is curated from a public platform. The replication package includes 'output and runtime details from TGen for all cases we experimented with' (Section 3.2)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions GPT-4 Turbo v1106, Llama 3 70B Instruct, temperature 0, and seed 1106, but provides no requirements.txt, Dockerfile, or detailed environment specification for reproducing the Python execution environment." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The replication package contains scripts and output data, but the paper itself includes no step-by-step reproduction instructions, README description, or 'Reproducing Results' section." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimate percentages (e.g., '80.5% of MBPP problems') with no confidence intervals or error bars anywhere in the paper." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Claims like 'including tests contributes to solving an additional 12.0%' are made by comparing raw percentages with no statistical significance tests (no p-values, t-tests, or bootstrap tests)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 and the text report percentage improvements with baseline context (e.g., MBPP: 69.67% → 82.45% → 87.71%, improvement of +12.78% and +5.26%), allowing readers to assess magnitude in context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is given for sample sizes. The 399 MBPP and 164 HumanEval problems are inherited from existing benchmarks; the 1100 CodeChef problems (100 per difficulty level) are chosen without power analysis or justification for why 100 per level is sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All experiments use a single seed (1106) and temperature 0 with no variance, standard deviation, or spread measure reported across runs." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper systematically compares: (1) baseline code generation with problem statement alone, (2) with tests added, (3) with tests plus remediation loop. Figure 4, Table 1, and Table 3 present these comparisons." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "GPT-4 Turbo v1106 and Llama 3 70B Instruct were among the most capable models at the time of publication. The baseline is the same model without TDD intervention, which is the appropriate comparison." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper progressively adds components: no tests → tests → tests + remediation, measuring each component's contribution. Section 5.2 also varies the number of tests (Figure 7), serving as an ablation of test quantity." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "The sole evaluation metric is pass rate (fraction of problems solved correctly). No secondary metrics such as code quality, code length, readability, or execution efficiency are reported." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation is entirely automated via PyTest pass/fail. While the authors manually inspect specific cases for qualitative insights (Sections 4.2–4.4), there is no systematic human evaluation of the generated code." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "EvalPlus private tests (35x for MBPP, 80x for HumanEval) are used as held-out evaluation that the LLM never sees. CodeChef uses the platform's private test suites. Results validated against these are reported in Table 1." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by dataset (MBPP, HumanEval, CodeChef), by strategy (solved without tests, needs tests, needs remediation), and by difficulty level (Figure 5, Figure 6, Table 2)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.3 extensively categorizes unsolved problems: misunderstanding core logic, data structure handling, logical errors, input/output format issues, and performance bottlenecks, with specific problem IDs cited." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports: remediation has diminishing returns beyond 3–4 iterations (Section 4.3), adding more tests can cause regressions due to 'lost in the middle' (Section 5.2), and 47.18% of CodeChef problems remain unsolved (Section 5.1)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims that 'including test cases leads to higher success in solving programming challenges' which is supported by the consistent improvements across MBPP (+12.78%), HumanEval (+9.15%), and CodeChef (+3.09%) shown in Table 1." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper claims 'providing LLMs with tests enhances code generation outcomes.' The experimental design holds the model constant and varies only the input (with/without tests), which is adequate controlled single-variable manipulation for this causal claim." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The conclusion states 'we therefore advocate for the widespread adoption of test-driven methodologies to maximize the benefits of LLMs in code generation' — a broad generalization from experiments on two models, Python-only benchmarks, and function/file-level tasks. The title 'Test-Driven Development for Code Generation' is similarly unbounded." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not consider that adding tests simply adds more input context (any additional context might help), or that tests may leak structural information about the expected solution. Section 7 discusses generic threats (prompt sensitivity, benchmark limitations) but not alternative explanations for why the improvements occur." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures pass rate on test suites and claims to measure 'correctness' of code generation. The proxy (test-passing) closely matches the claimed outcome (correctness), with no broader framing that would create a proxy gap." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions are stated: 'GPT-4 Turbo v1106' (Section 3.2), 'GPT-3.5 Turbo v1106' (Section 5.3), and 'Meta Llama 3 70B Instruct' (Section 5.3)." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text for the Coder Agent is shown in Figure 2 and the Remediation Agent configuration in Figure 3, including the multi-step reasoning instructions and dataset-specific guidelines." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.2 states: 'We use a seed value of 1106 and a temperature of 0 in our experiments.' Remediation loop parameters are also specified: 5 iterations max, 3 repeated failures." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The TGen framework is described in detail in Section 3.2 with Figure 1 showing the full pipeline: input phase, LLM engine, coder and remediation agents, verifier, and remediation loop with specific stopping criteria." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Dataset preparation is documented: MBPP uses the EvalPlus sanitized subset (399 of 427 problems), HumanEval uses 164 problems with EvalPlus enrichment, CodeChef uses 100 most popular problems per difficulty level scraped November 2023. Sample test cases were removed from problem statements for the baseline experiment." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 'Threats to Validity' provides a dedicated discussion of limitations spanning prompt engineering, benchmark limitations, non-deterministic problems, and model variability." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 7 includes study-specific threats: test assumptions may not hold for non-deterministic problems (they excluded such cases), the drop in baseline performance vs. leaderboards due to removed sample tests, and that EvalPlus test quality was used to address known benchmark weaknesses." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "While the paper notes benchmarks 'do not capture the full complexity' of real-world problems (Section 7), it does not explicitly state what the results do NOT show — e.g., no statement that results don't generalize to repository-level tasks, non-Python languages, or models beyond GPT-4/Llama 3." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The replication package at OSF includes 'output and runtime details from TGen for all cases we experimented with' (Section 3.2), providing raw experimental outputs for verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "MBPP and HumanEval datasets are described with their EvalPlus variants (Section 3.1). CodeChef collection is described: 100 most popular problems per difficulty level, popularity defined as accepted solutions count, scraped November 2023 (Section 5.1.1)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks (MBPP, HumanEval) and a curated dataset from a public platform (CodeChef) with selection criteria described." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented in Section 3.2: problem statements and tests as input → LLM generation → PyTest verification → optional remediation loop. The sequential evaluation process (solved without tests → needs tests → needs remediation → unsolved) is clearly described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors is present." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Both authors are affiliated with University of Waterloo, Canada, clearly listed in the paper header. They evaluate GPT-4 (OpenAI) and Llama 3 (Meta) with no affiliation to either company." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of funder cannot be assessed. The paper evaluates commercial products (GPT-4, Llama 3) but there is no evidence of industry funding." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosure is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "Neither GPT-4 Turbo's nor Llama 3's training data cutoff dates are stated. This is relevant because MBPP (2021) and HumanEval (2021) predate both models' training." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether MBPP or HumanEval problems appeared in GPT-4's or Llama 3's training data, despite both benchmarks being widely available online since 2021." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MBPP and HumanEval were published in 2021, well before GPT-4 and Llama 3 training. The paper uses these benchmarks without any discussion of contamination risk, which could inflate baseline performance and reduce the measured improvement from TDD." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All experiments involve automated LLM evaluation on coding benchmarks." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates LLM code generation on programming benchmarks." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, token counts, or cost per example are reported despite extensive GPT-4 API usage across hundreds of problems with multiple iterations each." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget, API spend, or wall-clock time is reported for the experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "A single seed (1106) with temperature 0 is used for all experiments. No sensitivity analysis across multiple seeds is performed." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not explicitly state the number of experimental runs per problem. The use of fixed seed and temperature 0 implies single runs but this is not explicitly stated." 
302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. Temperature (0), seed (1106), max iterations (5), and repeated failure threshold (3) appear to be chosen without reported search or justification." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The configuration choices (temperature 0, seed 1106, 5 iteration max, 3 repeated failure limit) are stated but not justified with systematic comparison. The remediation limits are partially justified by observing 'diminishing returns' but no formal analysis is provided." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own TGen framework without acknowledging potential author-evaluation bias. No independent evaluation or discussion of this bias is provided." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "TGen with remediation makes multiple LLM calls per problem (up to 10+ round trips for 5 remediation iterations) compared to a single call for the baseline, yet this compute difference is never quantified or discussed." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 3.1 discusses that 'existing work in the literature has questioned the accuracy of function-level benchmarks for evaluating LLMs' and the paper addresses this by also using CodeChef for file-level generation and noting benchmarks 'do not capture the full complexity and variability of real-world programming problems' (Section 7)." 
332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "The TGen scaffold is held constant across conditions (the variable is test inclusion, not scaffolding). When comparing GPT-4 vs Llama 3, the same TGen pipeline is used for both, controlling the scaffold confound." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "MBPP (2021) and HumanEval (2021) were published years before GPT-4 and Llama 3 were trained. No discussion of whether these benchmarks and their solutions were in the training data." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether providing test cases as input leaks structural information about the expected solution beyond what would be available in a realistic TDD scenario." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether MBPP and HumanEval problems (or solutions similar to them) appear in the training corpora of GPT-4 or Llama 3." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is employed (no canary strings, membership inference, temporal splits, or decontamination)." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Providing test cases improves LLM code generation correctness by 9.15–29.57 percentage points on MBPP and HumanEval, and by 3.09 points on CodeChef, after private test validation.", 365 "evidence": "Table 1 shows MBPP: +12.78%, HumanEval: +9.15%, CodeChef: +3.09% for GPT-4. Table 3 shows MBPP: +29.57%, HumanEval: +13.41% for Llama 3. 
All validated against private/held-out tests.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Remediation loops that feed test failures back to the LLM provide additional improvement of 4.18–9.02 percentage points (5.26–9.02 on MBPP and HumanEval; 4.18 on CodeChef).", 370 "evidence": "Table 1 shows remediation adds +5.26% (MBPP), +5.49% (HumanEval), +4.18% (CodeChef) for GPT-4. Table 3 shows +9.02% (MBPP), +8.54% (HumanEval) for Llama 3.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Less performant models benefit more from TDD — Llama 3 shows almost double the improvement of GPT-4.", 375 "evidence": "Section 5.3: Llama 3 total improvement 38.60% (MBPP) and 21.95% (HumanEval) vs GPT-4's 18.04% and 14.64% respectively (Tables 1 and 3).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Problem difficulty inversely correlates with TDD effectiveness — simpler problems benefit more from tests while harder problems remain unsolved.", 380 "evidence": "Figure 5 and Figure 6 show success rates sharply decrease with difficulty. Section 5.1.2 notes 47.18% of CodeChef problems remain unsolved, concentrated in harder difficulty levels.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Solutions generated with test information are robust against unseen private tests, not just tailored to supplied tests.", 385 "evidence": "Table 1 shows improvements persist after evaluation with EvalPlus private tests (35x–80x more tests). CodeChef solutions also validated against the platform's private test suites.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Remediation advice becomes repetitive beyond 3–4 iterations with diminishing returns.", 390 "evidence": "Section 4.3 reports most problems solved in first 1–2 remediation attempts (MBPP: 11 of 21 on first attempt; HumanEval: 7 of 9 on first attempt). 
No problems were solved at the 5th attempt despite attempts utilizing all 5 iterations.", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No contamination discussion", 397 "detail": "MBPP and HumanEval (both 2021) were almost certainly in GPT-4's training data. If the model has memorized solutions, the measured 'baseline' is inflated and the relative improvement from TDD may be distorted. This fundamental confound is never addressed." 398 }, 399 { 400 "flag": "No statistical tests or uncertainty quantification", 401 "detail": "All comparisons use raw percentage differences with no significance tests, confidence intervals, or error bars. A 3.09% improvement on CodeChef (34 problems out of 1100) could easily be noise, but no statistical test confirms it is meaningful." 402 }, 403 { 404 "flag": "Single-run experiments", 405 "detail": "Despite using temperature 0 and fixed seed for reproducibility, there is no exploration of sensitivity to seed choice. LLM outputs can still vary across API calls even with fixed parameters, and the results may not be stable." 406 }, 407 { 408 "flag": "Compute cost of TDD approach unquantified", 409 "detail": "The remediation loop makes up to 10+ LLM API calls per problem (coder + remediation agent alternating for 5 iterations), but the paper never quantifies this cost. The 5.26% improvement from remediation could require 5x the compute of the baseline, making it impractical." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation", 415 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 416 "year": 2024, 417 "relevance": "EvalPlus benchmark used as primary evaluation framework; addresses insufficiency of existing code generation benchmarks." 
418 }, 419 { 420 "title": "Evaluating large language models trained on code", 421 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 422 "year": 2021, 423 "arxiv_id": "2107.03374", 424 "relevance": "Introduces HumanEval benchmark and Codex; foundational work on LLM code generation evaluation." 425 }, 426 { 427 "title": "Program synthesis with large language models", 428 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 429 "year": 2021, 430 "arxiv_id": "2108.07732", 431 "relevance": "Introduces MBPP benchmark used as primary evaluation dataset in this study." 432 }, 433 { 434 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 435 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"], 436 "year": 2023, 437 "arxiv_id": "2310.06770", 438 "relevance": "Real-world code generation benchmark using GitHub issues; contrasts with function-level benchmarks used here." 439 }, 440 { 441 "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection", 442 "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"], 443 "year": 2023, 444 "arxiv_id": "2303.11366", 445 "relevance": "Self-improvement through iterative reflection in code agents; related to the remediation loop approach." 446 }, 447 { 448 "title": "AutoCodeRover: Autonomous Program Improvement", 449 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 450 "year": 2024, 451 "arxiv_id": "2404.05427", 452 "relevance": "Agentic system using test cases for spectrum-based fault localization in code generation." 453 }, 454 { 455 "title": "Codet: Code generation with generated tests", 456 "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen"], 457 "year": 2022, 458 "arxiv_id": "2207.10397", 459 "relevance": "Uses generated test cases to improve code generation quality; directly related methodology." 
460 }, 461 { 462 "title": "LLM-based Test-driven Interactive Code Generation: User Study and Empirical Evaluation", 463 "authors": ["Sarah Fakhoury", "Aaditya Naik", "Georgios Sakkas"], 464 "year": 2024, 465 "arxiv_id": "2404.10100", 466 "relevance": "Explores LLM-generated tests with user feedback for code generation; closely related TDD approach." 467 }, 468 { 469 "title": "MultiPL-E: a scalable and polyglot approach to benchmarking neural code generation", 470 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 471 "year": 2023, 472 "relevance": "Polyglot code generation benchmark; raises concerns about accuracy of function-level benchmarks for LLM evaluation." 473 }, 474 { 475 "title": "Quality and Trust in LLM-generated Code", 476 "authors": ["Claudio Spiess", "David Gros", "Kunal Suresh Pai"], 477 "year": 2024, 478 "arxiv_id": "2402.02047", 479 "relevance": "Addresses trust and correctness issues in LLM-generated code; motivates need for verification mechanisms." 480 }, 481 { 482 "title": "Competition-level code generation with alphacode", 483 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 484 "year": 2022, 485 "relevance": "AlphaCode system for competitive programming; CodeContests benchmark contrasted with CodeChef dataset used here." 486 }, 487 { 488 "title": "Interactive code generation via test-driven user-intent formalization", 489 "authors": ["Shuvendu K Lahiri", "Aaditya Naik", "Georgios Sakkas"], 490 "year": 2022, 491 "arxiv_id": "2208.05950", 492 "relevance": "Explores test cases for formalizing user intent in code generation; foundational TDD-for-LLM work." 493 } 494 ] 495 }