scan.json (32264B)
1 { 2 "paper": { 3 "title": "DynaCode: A Dynamic Complexity-Aware Code Benchmark for Evaluating Large Language Models in Code Generation", 4 "authors": [ 5 "Wenhao Hu", 6 "Jinhao Duan", 7 "Chunchen Wei", 8 "Li Zhang", 9 "Yue Zhang", 10 "Kaidi Xu" 11 ], 12 "year": 2025, 13 "venue": "Annual Meeting of the Association for Computational Linguistics", 14 "arxiv_id": "2503.10452", 15 "doi": "10.48550/arXiv.2503.10452" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract states 'Our benchmark and evaluation code are available at https://github.com/HWH-2000/DynaCode' — a concrete URL is provided." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The benchmark is dynamically generated from MBPP+ (a publicly available dataset via EvalPlus) and LeetCode problems. The generation code is released on GitHub, allowing reconstruction of the full benchmark." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. No library versions are listed beyond the tools used (Radon, Monkeytype)." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper does not describe how to run the benchmark or reproduce specific results." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": true, 44 "justification": "Table 2 reports ± values for all DynaCode results, e.g., 'GPT-4o: 74.4 (±1.6)' for Unit 1. These represent standard deviation across runs." 
45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper makes numerous comparative claims (e.g., 'GPT-4o consistently outperforms others') based solely on comparing point estimates. No statistical significance tests (t-test, bootstrap, etc.) are reported anywhere." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "Performance drops are reported with full context, e.g., 'GPT-4o achieves 87.6% on MBPP and 72.2% on MBPP+, but drops significantly to 55.4% on DynaCode.' Abstract states 'average performance drop of 16.8% to 45.7%.' Baseline numbers always accompany claims." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": true, 59 "justification": "Section 4.2 ('Effect of the Problem Sizes') explicitly studies the effect of problem count {25, 50, 75, 100, 125, 150} on evaluation stability (Figure 4) and concludes 'when the number of problems is greater than or equal to 75, the evaluation results meet our requirements for stability.' They chose N=100." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "Table 2 states 'all experiments were conducted three times with 5 different random seeds, and the average results are presented' with ± standard deviation reported for all DynaCode results." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Results are compared against MBPP and MBPP+ benchmarks across all 12 models (Table 2). The fine-tuning experiment (Figure 5) also includes baseline (non-fine-tuned) performance." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "MBPP+ (EvalPlus version) is a contemporary baseline. The evaluated models include recent ones like GPT-4o, DeepSeek-V3, Qwen2.5-Coder, and Meta-Llama-3.3-70B-Instruct." 
77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "The benchmark design is systematically decomposed: results are broken down by code complexity unit (1-4), call-graph level (1-4), and individual call-graph structure (G1-G16). The fine-tuning experiment ablates by training source (MBPP+ vs DynaCode vs unit functions). The problem size study varies N." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "Pass@1 is the primary metric (Table 2) and Pass@3 is reported in Table 4 (Appendix D.1). The schema examples explicitly list 'Pass@1 AND Pass@10' as qualifying for multiple metrics." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "Evaluation is entirely automated via pass/fail test case execution. No human evaluation of generated code quality, readability, or correctness is performed." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "For the main evaluation, models are evaluated off-the-shelf (temperature=0) on dynamically generated problems, so no dev/test contamination is possible. The benchmark is designed so test problems are guaranteed unseen." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Extensive breakdowns: by code complexity unit (Table 2), by call-graph level (Figure 3), by individual call graph G1-G16 (Figure 6), by error type and unit (Table 3), and per-model detailed trends (Figure 10)." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 4.3 presents detailed error analysis: 4,279 error examples from GPT-3.5-Turbo categorized into Problem Understanding, Code Pattern Generation, and Context Management (Table 3). Error distribution across units is analyzed." 
107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "Fine-tuning on DynaCode unit functions actually decreases performance (GPT-3.5 drops from 32.6% to 15.2%, Meta-Llama from 10.6% to 6.9% in Figure 5). Models like codegemma-7b-it show near-zero performance (2.9% average). These are reported without downplaying." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims 'average performance drop of 16.8% to 45.7% compared to MBPP+' — Table 2 supports this across models. '189 million unique nested code problems' is supported by Table 1 (total 189,263,141). 'Performance progressively decreasing as complexity increases' is shown in Figure 3 and Table 2." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": false, 123 "justification": "The paper claims 'DynaCode's dynamic evaluation strategy effectively mitigates data contamination' and 'LLMs struggle with parallel function dependencies.' These are causal claims. The fine-tuning experiment provides some evidence for the contamination claim but has confounds (different training set sizes, different data distributions). The complexity claims don't control for prompt length, which increases with call-graph complexity." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title claims 'Evaluating Large Language Models in Code Generation' broadly, but the benchmark tests only Python, only function-level composition from MBPP+, with a maximum of 5 nodes in call graphs. The paper does not bound its claims to these constraints. The Limitations section mentions the 5-node limit but does not address the language or task-type restrictions." 
129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper does not discuss alternative explanations for the observed performance drops. For example, longer prompts from nested problems could independently cause degradation. Performance differences across call-graph types could be due to prompt structure rather than genuine code understanding differences. No confounds are discussed." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper measures Pass@1 on code generation tasks and frames results in terms of Pass@1 scores and code generation capability. The claims largely match the granularity of the measurements — they don't inflate Pass@1 into broader 'programming ability' claims." 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "Open models are well-specified (e.g., 'Meta-Llama-3.1-8B-Instruct', 'Qwen2.5-Coder-32B-Instruct'). However, 'GPT-4o' and 'GPT-3.5-Turbo' lack snapshot dates or API versions. Per the schema, marketing names like 'GPT-4o' without a snapshot date do not count as specified versions." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix E describes the prompt construction process and Table 8 provides a full example prompt for call graph G8. The prompts are dynamically generated from a documented hard-coding strategy, and the actual text is shown." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section 4.1 states 'we set the temperature to 0 to eliminate any randomness during the generation process.' For fine-tuning experiments, epochs and total steps are reported (e.g., '5 epochs', '1890 steps')." 
156 }, 157 "scaffolding_described": { 158 "applies": false, 159 "answer": false, 160 "justification": "No agentic scaffolding is used. Models are evaluated directly via single-turn prompting with temperature=0." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 3.3 documents the full pipeline: problem collection from MBPP+ and LeetCode, cyclomatic complexity classification using Radon, call-graph construction, type-based problem combination via Monkeytype, test case generation with batch execution, and filtering of bad generations." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": true, 172 "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing the maximum node count of 5 in call graphs and future extensions to more complex structures." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "The Limitations section only mentions the 5-node call-graph constraint. It does not discuss specific threats like prompt-length confounds, heavy reliance on MBPP+ as the base problem source (supplemented only by 40 LeetCode problems), or potential biases in cyclomatic complexity as the sole complexity metric." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper does not explicitly state what the results do NOT show. It doesn't note that results are limited to Python, to function-level composition, to a specific set of base problems, or that the complexity metric captures only one dimension of real-world code difficulty." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "The benchmark generation code is released, but raw experimental outputs (model-generated code, per-problem pass/fail results) are not mentioned as being available. 
Only aggregated results are shown in tables and figures." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 3.3 and 4.1 describe data sources: MBPP+ from EvalPlus as the primary unit function set, plus 40 new problems from LeetCode (22 for Unit 3, 18 for Unit 4). Collection criteria and integration process are explained." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. Data sources are standard benchmarks (MBPP+) and public programming problems (LeetCode)." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "The pipeline is documented across Sections 3.1-3.3: MBPP+ problems → cyclomatic complexity computation via Radon → unit classification → call-graph construction → type-based combination → test case generation → bad generation filtering. Table 1 shows problem counts at each stage." 205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Author affiliations are clearly listed: University of Electronic Science and Technology of China and Drexel University. The authors are not affiliated with any of the model vendors being evaluated." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding information is disclosed, so independence cannot be verified. The authors are academic researchers not affiliated with model vendors, but without a funding disclosure, this criterion is not satisfied." 
222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests or financial interest declaration is present in the paper." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper discusses data contamination extensively but does not state the training data cutoff dates for any of the 12 evaluated models. Without cutoff dates, it's impossible to verify which benchmarks were in training data." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": true, 238 "justification": "The entire paper's motivation is addressing train/test overlap. Section 1 discusses how 'Meta-Llama-3-8B-Instruct and Phi-2 have been reported to exhibit data contamination.' The fine-tuning experiments (Section 4.3, Figure 5) directly test memorization effects." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Benchmark contamination is the core problem addressed. The paper demonstrates that MBPP/MBPP+ are vulnerable (Figure 1 shows contamination effects) and proposes dynamic generation as a mitigation. The fine-tuning experiment shows DynaCode resists memorization compared to MBPP+." 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants. This is a benchmark evaluation study using automated code execution." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants involved in the study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants involved in the study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants involved in the study." 
266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants involved in the study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants involved in the study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants involved in the study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "No inference cost, latency, or tokens consumed are reported despite evaluating 12 LLMs (including commercial APIs like GPT-4o and GPT-3.5-Turbo) across thousands of problems with multiple seeds." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "No GPU hours, total API spend, or hardware specifications are reported for either the benchmark generation process or the model evaluation." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": true, 299 "justification": "Table 2 reports mean ± std across 'three times with 5 different random seeds.' Variation is visible (e.g., GPT-4o Unit 2: ±1.4, Unit 4: ±0.3), demonstrating seed sensitivity analysis." 300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": true, 304 "justification": "Section 4.1 explicitly states 'all experiments were conducted three times with 5 different random seeds, and the average results are presented.'" 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "For the main evaluation, temperature=0 is fixed with no search. For the fine-tuning experiments, specific epoch counts and steps are chosen (5 epochs, 1890 steps for GPT-3.5) without reporting what configurations were tried or how these values were selected." 
310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": false, 314 "justification": "The fine-tuning experiments use specific epoch counts (5 for GPT-3.5, 10 for Llama) and fixed total steps without justifying why these configurations were chosen or whether alternatives were explored." 315 }, 316 "multiple_comparison_correction": { 317 "applies": false, 318 "answer": false, 319 "justification": "No formal statistical tests are performed in the paper. All comparisons are made by inspecting point estimates and standard deviations. Since no tests are run, multiple comparison correction is structurally inapplicable." 320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors both designed DynaCode and ran the model evaluation on it, acting as benchmark designers and evaluators at the same time. They do not acknowledge this potential bias or include any independent evaluation of the benchmark's validity." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": true, 328 "answer": false, 329 "justification": "Models ranging from 7B to 405B parameters (and commercial APIs) are compared without discussing compute differences. The benchmark itself requires generating and executing large numbers of test cases, but compute costs are not reported." 330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": true, 334 "justification": "The paper extensively discusses what DynaCode measures: code generation capability under varying cyclomatic complexity and call-graph structure. Section 4.3 provides error analysis (Table 3) mapping failures to specific capabilities (Problem Understanding, Code Pattern Generation, Context Management). The paper contrasts this with static benchmarks' construct validity problems." 335 }, 336 "scaffold_confound_addressed": { 337 "applies": false, 338 "answer": false, 339 "justification": "No scaffolding is involved. 
All models are evaluated via direct single-turn prompting without any agentic framework." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": true, 346 "justification": "Temporal leakage is the paper's central motivation. They demonstrate that MBPP (published 2021) and its later MBPP+ extension were publicly available before model training cutoffs, causing contamination (Figure 1). DynaCode's dynamic generation is designed to prevent temporal leakage. They also add recent LeetCode problems to further mitigate this." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "The paper does not discuss whether the evaluation setup itself leaks information. For example, the assert statements in prompts (Table 8) provide input-output examples that could serve as hints. Whether the prompt structure provides more information than a real-world coding task is not discussed." 352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "The paper does not discuss whether base problems from MBPP+ share structural similarities or dependencies. Problems within the same unit share similar complexity, and the same base problems are reused across many nested combinations, but this non-independence is not analyzed." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": true, 361 "justification": "The fine-tuning experiments (Section 4.3, Figure 5) serve as a concrete leakage detection method: by fine-tuning on the benchmark and measuring performance gains, they quantify how much memorization contributes to scores (MBPP+ gains 19pp vs DynaCode gains 3.4pp for GPT-3.5)." 
362 } 363 } 364 }, 365 "scan_version": 3, 366 "active_modules": [ 367 "experimental_rigor", 368 "data_leakage" 369 ], 370 "claims": [ 371 { 372 "claim": "DynaCode generates approximately 189 million unique nested code generation tasks across 4 units and 16 call-graph structures.", 373 "evidence": "Table 1 shows the full breakdown: Unit 1 (164.5M), Unit 2 (3.2M), Unit 3 (2.4M), Unit 4 (19.2M), totaling 189,263,141 problems.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "LLMs show an average performance drop of 16.8% to 45.7% on DynaCode compared to MBPP+.", 378 "evidence": "Table 2 shows drops for all 12 models. GPT-4o drops from 72.2% (MBPP+) to 55.4% (DynaCode, -16.8pp). Meta-Llama-3.1-8B-Instruct drops from 55.6% to 9.9% (-45.7pp).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Performance progressively decreases as code complexity (unit) and call-graph complexity (level) increase.", 383 "evidence": "Table 2 shows declining scores across units for most models. Figure 3 shows consistent degradation across levels 1-4 for GPT-4o, GPT-3.5-Turbo, WizardLM-2, and Llama-405B.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "DynaCode's dynamic evaluation strategy effectively mitigates data contamination.", 388 "evidence": "Fine-tuning experiments (Figure 5): GPT-3.5 gains 19pp on MBPP+ but only 3.4pp on DynaCode after fine-tuning. Llama-8B gains 42.5pp on MBPP+ but only 13pp on DynaCode. Fine-tuning on unit functions alone drops DynaCode performance.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "LLMs perform significantly better on sequential call graphs than complex multi-branch structures.", 393 "evidence": "Figure 6 shows 4 models scoring higher on sequential graphs {G1-G4, G8} vs multi-branch graphs {G9-G16}. 
GPT-4o maintains ~60-80% on sequential but drops to ~30-50% on complex graphs.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Problem Understanding errors increase as code complexity rises, from 64.1% in Unit 1 to 88.8% in Unit 4.", 398 "evidence": "Table 3 shows error distribution for GPT-3.5-Turbo: Problem Understanding errors increase monotonically (64.1% → 79.9% → 88.2% → 88.8%) while Code Pattern Generation and Context Management errors decrease.", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval" 404 ], 405 "key_findings": "DynaCode is a dynamic benchmark generating ~189M unique nested Python code problems by combining base problems from MBPP+ and LeetCode, grouped into 4 code-complexity units, with 16 call-graph structures spanning 4 call-graph levels. Evaluation of 12 LLMs shows 16.8-45.7 percentage point drops compared to MBPP+, with performance degrading consistently as both code and call-graph complexity increase. Fine-tuning experiments demonstrate that DynaCode resists memorization better than static benchmarks, with models gaining much less from training on DynaCode data than on MBPP+. Error analysis of GPT-3.5-Turbo failures shows that Problem Understanding errors dominate and grow with code complexity (64.1% in Unit 1 to 88.8% in Unit 4), while LLMs perform relatively well on sequential call graphs but struggle with multi-branch dependencies.", 406 "red_flags": [ 407 { 408 "flag": "Prompt length confound not addressed", 409 "detail": "Higher-complexity call graphs produce longer prompts (more functions, more instructions). The observed performance degradation could be partly attributable to increased prompt length rather than genuine complexity-handling difficulty. This confound is never discussed." 410 }, 411 { 412 "flag": "No statistical significance testing", 413 "detail": "The paper compares 12 models across 4 units, 4 levels, and 16 call graphs, making dozens of comparative claims (e.g., 'GPT-4o consistently outperforms others') without any statistical significance tests. 
Given the reported standard deviations, some claimed differences may not be statistically significant." 414 }, 415 { 416 "flag": "Self-evaluation bias", 417 "detail": "The authors design, implement, and evaluate DynaCode themselves without independent validation. They select which metrics to emphasize, which models to compare, and how to present results. No independent replication or external evaluation is included." 418 }, 419 { 420 "flag": "Limited base problem diversity", 421 "detail": "The entire benchmark is built from 378 MBPP+ problems plus 40 LeetCode problems. Despite generating 189M combinations, the underlying function implementations are drawn from a narrow set. The paper does not analyze whether this limits the diversity of actual coding challenges tested." 422 }, 423 { 424 "flag": "Cyclomatic complexity as sole complexity metric", 425 "detail": "Code complexity is classified solely by cyclomatic complexity, which measures control-flow branching. Other complexity dimensions (algorithmic complexity, data structure usage, API usage, string manipulation) are ignored. The paper acknowledges only the node-count limitation, not this metric limitation." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Evaluating large language models trained on code", 431 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 432 "year": 2021, 433 "arxiv_id": "2107.03374", 434 "relevance": "Introduces HumanEval benchmark and Pass@k metric, foundational to LLM code generation evaluation." 435 }, 436 { 437 "title": "Program synthesis with large language models", 438 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 439 "year": 2021, 440 "arxiv_id": "2108.07732", 441 "relevance": "Introduces MBPP benchmark, the base problem set used by DynaCode." 442 }, 443 { 444 "title": "Is your code generated by ChatGPT really correct? 
Rigorous evaluation of large language models for code generation", 445 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 446 "year": 2024, 447 "relevance": "Introduces EvalPlus and MBPP+, the extended benchmark used as DynaCode's unit function source." 448 }, 449 { 450 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 451 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 452 "year": 2023, 453 "arxiv_id": "2310.06770", 454 "relevance": "Real-world code generation benchmark focusing on repository-level tasks, contrasted with DynaCode's function-level approach." 455 }, 456 { 457 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 458 "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"], 459 "year": 2024, 460 "arxiv_id": "2406.15877", 461 "relevance": "Code generation benchmark with diverse function calls and complex instructions, directly comparable to DynaCode." 462 }, 463 { 464 "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM", 465 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"], 466 "year": 2024, 467 "relevance": "Dynamic benchmark evolution approach using LLMs, addressing similar data contamination concerns as DynaCode." 468 }, 469 { 470 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 471 "authors": ["Naman Jain", "King Han", "Alex Gu"], 472 "year": 2024, 473 "arxiv_id": "2403.07974", 474 "relevance": "Contamination-free code benchmark using temporal splits, addressing the same data contamination problem." 
475 }, 476 { 477 "title": "CRUXEval: A benchmark for code reasoning, understanding and execution", 478 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather"], 479 "year": 2024, 480 "arxiv_id": "2401.03065", 481 "relevance": "Code reasoning benchmark evaluating understanding and execution, complementary to code generation evaluation." 482 }, 483 { 484 "title": "A careful examination of large language model performance on grade school arithmetic", 485 "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee"], 486 "year": 2024, 487 "arxiv_id": "2405.00332", 488 "relevance": "Documents data contamination in LLMs including Meta-Llama-3-8B-Instruct, directly cited as evidence for contamination." 489 }, 490 { 491 "title": "Large language models for software engineering: A systematic literature review", 492 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 493 "year": 2024, 494 "relevance": "Comprehensive survey of LLMs for software engineering, providing broader context for code generation evaluation." 495 }, 496 { 497 "title": "DyVal: Graph-informed dynamic evaluation of large language models", 498 "authors": ["Kaijie Zhu", "Jiaao Chen", "Jindong Wang"], 499 "year": 2023, 500 "arxiv_id": "2309.17167", 501 "relevance": "Graph-based dynamic evaluation approach that inspired DynaCode's methodology for generating dynamic benchmarks." 502 }, 503 { 504 "title": "EvoCodeBench: An evolving code generation benchmark aligned with real-world code repositories", 505 "authors": ["Jia Li", "Ge Li", "Xuanming Zhang"], 506 "year": 2024, 507 "arxiv_id": "2404.00599", 508 "relevance": "Evolving code benchmark focused on real-world repository alignment, addressing benchmark staleness." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "Released benchmark tool that practitioners can use to evaluate LLMs on code generation, though limited to Python function composition." 
515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "Confirms the expected finding that LLMs struggle with increased complexity; the magnitude of performance drops (up to 45.7pp) is noteworthy but not fundamentally surprising." 519 }, 520 "fear_safety": { 521 "score": 0, 522 "justification": "No AI safety or security concerns raised; purely a benchmark evaluation paper." 523 }, 524 "drama_conflict": { 525 "score": 1, 526 "justification": "Shows popular benchmarks like MBPP are contaminated and unreliable, but data contamination in LLM benchmarks is already well-established." 527 }, 528 "demo_ability": { 529 "score": 2, 530 "justification": "GitHub repository released with benchmark generation and evaluation code; users could run it on their own models." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Evaluates well-known models (GPT-4o, DeepSeek-V3, Llama-3) but paper is from UESTC/Drexel, not a major AI lab." 535 } 536 } 537 }