scan.json (35006B)
1 { 2 "paper": { 3 "title": "Narrowing the Complexity Gap in the Evaluation of Large Language Models", 4 "authors": [ 5 "Yang Chen", 6 "Shuyang Liu", 7 "Reyhaneh Jabbarvand" 8 ], 9 "year": 2026, 10 "venue": "arXiv", 11 "arxiv_id": "2602.18928" 12 }, 13 "scan_version": 3, 14 "active_modules": ["experimental_rigor", "data_leakage"], 15 "methodology_tags": ["benchmark-eval"], 16 "key_findings": "GeneBench uses multi-objective genetic optimization to transform 4 existing benchmarks (HumanEval, ClassEval, CRUXEval, Avatar) into more complex versions, increasing relative complexity by 298% on average while maintaining readability (12% decrease). Evaluating 13 LLMs across 4 programming tasks shows a 14.9%–60.5% (avg 35.2%) performance drop, with the struggle persisting under few-shot prompting (avg 41% drop) and fine-tuning (avg 17% gap). Performance on GeneBench bug repair correlates with SWE-Bench performance, suggesting it can proxy real-world benchmarks without costly construction.", 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No repository URL, GitHub link, or archive is provided in the paper. GeneBench is described but its implementation is not released." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "The input benchmarks (HumanEval, ClassEval, CRUXEval, Avatar) are public, but GeneBench's transformed benchmark outputs are not released. No download link for the generated benchmarks is provided." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper mentions A100 GPUs, bfloat16 precision, Python AST library, and py2cfg, but provides no requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment." 
33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithm is described at a high level (Algorithms 1-2) but not with enough specificity to reproduce without the source code." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Table 5 reports point estimates for all models and tasks with no confidence intervals, error bars, or ± notation. Figures report point values without uncertainty bands." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": true, 49 "justification": "Statistical significance tests are reported: p-value = 6e-18 for overall performance drop (§5.3), p-value = 4e-8 for complexity distribution difference between Success_Success and Success_Failure groups (§5.3.4), and Spearman's rank correlation with p-value for operator frequency vs complexity (§5.2)." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "Effect sizes are reported as percentage drops with baseline context throughout: Δ = (Aft−Bef)/Bef (Table 5), e.g., 14.9%–60.5% avg=35.2% drop. Relative complexity increases (75%–650%) and readability decreases (10%–15%) are also provided in Table 4." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No power analysis or justification for sample sizes. CRUXEval is sampled at 200 of 800 programs without justification. SWE-Sub filters to 94 instances without discussing whether this is sufficient for the claims. No rationale for the 13-model selection." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "Main results (Table 5) are single-run at temperature=0 with no variance or spread measures. 
The single-operator experiment was repeated 3 times, but only qualitative summary ('almost the same outcome') is given, not quantitative variance. Three GeneBench versions (§5.4.3) show consistency but without formal spread measures." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Before-transformation performance serves as the baseline across all experiments (Bef. columns in Table 5). SWE-Bench performance is used as a real-world baseline for comparison in §5.5." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Evaluated models include recent ones: GPT-4o, o4-mini, DeepSeek-R1 (2024–2025 models). The comparison framework uses SWE-Bench, a contemporary real-world benchmark. Prior transformation techniques are discussed and compared (Table 3, §4)." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "Single-operator experiments (§5.3) test each of 22 operators independently. Figure 8 shows per-operator frequency and complexity contribution. §5.3.4 isolates the effect of complexity vs representation shift through NNMR analysis." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "Multiple metrics are used: success rate per task, relative complexity (RC), relative readability (RR), PyLint score, NNMR, CodeBLEU, cosine similarity, Jaccard similarity, and Spearman's rank correlation." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of GeneBench transformations, their naturalness, or LLM outputs. The paper relies entirely on automated metrics (test pass/fail, PyLint scores, readability metrics) to assess quality." 
92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "For fine-tuning (§5.4.2), training uses transformations NOT selected for the final benchmark, with deduplication (Jaccard > 0.8). The benchmark transformations (from the Pareto front) serve as the held-out test set. For the main experiment, inference settings are fixed (temperature=0, defaults otherwise) rather than tuned, so no tuning on test data occurs." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Table 5 provides per-model and per-task breakdowns across 5 benchmark-task combinations and 13 models. Figure 8 breaks down per-operator contributions. Figure 9 shows per-task Success_Success and Success_Failure distributions." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "§5.3 includes extensive failure analysis: §5.3.1 analyzes across tasks, §5.3.2 across models, §5.3.3 by size/training strategy, §5.3.4 investigates root causes of failure (representation shift vs complexity). Figure 10 compares complexity of success vs failure cases." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper reports that NNMR (representation shift) does NOT correlate with performance drop (§5.3.4), ruling out a plausible explanation. Few-shot learning sometimes worsens performance (green vs red in Table 5 Δ(IC)). Fine-tuning leaves substantial performance gaps (Figure 11)." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Abstract claims are supported: 14.9%–60.5% avg=35.2% performance drop (Table 5), struggle persists under few-shot (Table 5 Δ(IC)) and fine-tuning (Figure 11, 4.7%–30.3% avg=17%), SWE-Bench similarity shown in §5.5. The claim of 'consistent reproduction' is supported by three versions (§5.4.3)." 
119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper claims complexity increases cause performance drops. The study design controls for this: same programs before/after transformation, single-operator experiments confirm combination matters, NNMR analysis rules out representation shift as the sole cause, and Figure 10 shows statistically significant complexity differences between success/failure groups (p=4e-8)." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title 'Narrowing the Complexity Gap in the Evaluation of Large Language Models' is broad, but results are exclusively on Python. §7 acknowledges 'Our current implementation targets Python' but frames multi-language as 'primarily an engineering effort,' understating the potential for different results in other languages." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": true, 133 "justification": "§5.3.4 systematically investigates alternative explanations: representation shift (NNMR, Table 6), increased input length (token counts vs context windows), and data contamination (multiple GeneBench versions). They rule out representation shift as the sole driver and show complexity is the primary factor (Figure 10)." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper explicitly tests whether GeneBench performance proxies real-world performance through the SWE-Bench comparison (§5.5), showing GeneBench repair rates (45.3%/20.6%) are closer to SWE-Sub (17.6%) than original benchmarks (66.4%/32.1%). They acknowledge this validation is limited to repair, noting no real-world benchmarks exist for code reasoning." 
139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "Open-source models are identified by size (e.g., 'CodeLlama-13B-Base', 'DeepSeekCoder-6.7B-Instruct') but proprietary models GPT-4o and o4-mini lack specific API versions or snapshot dates. References cite generic URLs (e.g., [1] links to GPT-4 technical report, [5] links to o4-mini docs page) without version specifics." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": false, 150 "justification": "Figure 6 shows prompt templates with placeholders (${CODE}, ${TESTS}, ${EXAMPLE_CODE}, ${EXAMPLE CoT REASONING}). While the benchmark data (fill values for code/tests) is from public sources, the specific ICL example selections, CoT reasoning examples, and exact prompt text sent to models are not fully specified." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": false, 155 "justification": "Temperature=0 is reported for inference (§5.1.1). 'For other parameters, we used the default settings' without specifying what those defaults are. Fine-tuning uses QLoRA on 8 A100-40GB GPUs for up to 2 epochs with early stopping, but learning rate, LoRA rank, alpha, batch size, and other training hyperparameters are not reported." 156 }, 157 "scaffolding_described": { 158 "applies": false, 159 "answer": false, 160 "justification": "GeneBench is a program transformation tool, not an agentic scaffold. The Agentless tool used in §5.5 is a third-party black-box repair tool — the authors cannot be expected to describe its internal scaffolding." 
161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": true, 165 "justification": "Data preprocessing is documented: CRUXEval sampling (200 programs), SWE-Bench-Lite filtering to SWE-Sub (300 → 94 instances with criterion: >3 lines additions vs deletions), fine-tuning deduplication (Jaccard > 0.8, resulting in 106k samples), test coverage validation (90.8%–99.5% original, 93.8%–99.4% transformed)." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": true, 172 "justification": "§7 'Threats to Validity' provides a dedicated section with three subsections: External Validity, Internal Validity, and Construct Validity, each discussing specific concerns." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": true, 177 "justification": "Threats are specific to this study: Python-only implementation (external), temperature=0 and bfloat16 precision choices (internal), reliance on test execution for semantic equivalence with coverage statistics (internal), and use of Python AST and py2cfg tools (construct)." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": true, 182 "justification": "§7 explicitly states 'Our current implementation targets Python, as it dominates benchmarks like HumanEval and SWE-Bench.' The paper also notes GeneBench is for code-to-code/text tasks (footnote 2 states it is 'orthogonal to research on automated generation of program synthesis (text-to-code) problems')." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "No raw experimental data is available. Neither the generated GeneBench transformations, the LLM outputs, nor the detailed per-problem results are released. Only aggregated results in tables and figures are presented." 
190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": true, 194 "justification": "The data collection is well-described: 4 benchmarks selected with rationale (§5.1.2), 200 CRUXEval programs sampled, SWE-Sub filtered from SWE-Bench-Lite (94 of 300), operator design based on manual examination of 2,692 Python files from top 100 PyPI projects (§4)." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. Data sources are standard public benchmarks (HumanEval, ClassEval, CRUXEval, Avatar, SWE-Bench)." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": true, 204 "justification": "The full pipeline is documented: Algorithms 1-2 describe the transformation and validation process, including selection (roulette wheel + NSGA-II), transformation, readability/PyLint validation, test execution, and Pareto front selection. SWE-Sub filtering (300→94) and fine-tuning deduplication (→106k samples) include counts and criteria." 205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding acknowledgments, grants, or sponsorship information is mentioned anywhere in the paper." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "All three authors' affiliations are listed as University of Illinois Urbana-Champaign. The paper evaluates third-party models (GPT-4o, DeepSeek, CodeLlama, etc.) without affiliation to any of the model providers." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding source is disclosed, so independence cannot be assessed. University researchers typically receive grant funding, but none is mentioned." 
222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "No competing interests or financial disclosure statement is present in the paper." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper notes 'more than 90% of SWE-Bench problems are dated before the training cut-off date of most recent LLMs' (§1) but does not state the actual training cutoff dates for any of the 13 evaluated models." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": true, 238 "justification": "Data contamination is extensively discussed as a core motivation (L3, §1). The paper argues GeneBench 'overcomes L3 limitation by upgrading programs to more complex versions with several non-trivial changes.' §5.4.3 shows different GeneBench versions produce different programs (Figure 12), and Table 6 shows high NNMR (19.4%–97.65%) indicating transformed versions are embedding-distant from originals." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Contamination is a primary concern addressed throughout. The paper explicitly notes original benchmarks (HumanEval, 2021) predate most models' training data. GeneBench's entire purpose is to mitigate this by creating novel transformed versions that are 'generated through the non-deterministic application of 22 genetic operators; re-executing GeneBench results in different programs' (§1)." 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants. This is a benchmark evaluation study using automated metrics." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants. The study evaluates LLMs on transformed programming benchmarks." 
256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "No inference costs, API costs, or per-example latency are reported for the LLM evaluations. Token counts are listed in Table 6 but not translated into cost or time." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "Partial: GeneBench uses a 1-hour time budget per program (§5.1), and fine-tuning uses 8 A100-40GB GPUs for up to 2 epochs. But total GPU-hours for all experiments, API costs for GPT-4o/o4-mini, and total compute for the full evaluation pipeline are not stated." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": false, 299 "justification": "Main LLM evaluations use temperature=0 for deterministic single runs. Three GeneBench versions with different seeds show similar trends (§5.4.3, Figure 13) but no formal seed sensitivity analysis for model outputs is reported. Reasoning models that don't allow temperature changes are not analyzed for stochastic variation." 
300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": false, 304 "justification": "The main Table 5 results are implied single-run at temperature=0 but not explicitly stated as such. The single-operator experiment states 'repeated the experiment three times' (§5.3). The number of runs for the main experiments is not explicitly stated." 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "GeneBench parameters (breed size 20%, 1-hour budget) are stated but no search over these hyperparameters is described. Fine-tuning uses QLoRA with early stopping but no hyperparameter search budget is reported." 310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": true, 314 "justification": "The selection criterion is clearly defined: GeneBench selects the highest-RC individual from the Pareto front (Algorithm 1, Line 13, §3.3.1). For fine-tuning, loss-based early stopping is used. The breed size (20%) is justified by preventing population bloating (footnote 7)." 315 }, 316 "multiple_comparison_correction": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper runs comparisons across 13 models and 5 benchmark-task combinations and reports multiple p-values (6e-18, 4e-8) without any mention of Bonferroni, Holm, or other family-wise error rate corrections." 320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors designed GeneBench and evaluate it without acknowledging potential author-evaluation bias. No independent evaluation or discussion of this bias is present." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": true, 328 "answer": false, 329 "justification": "GeneBench transformations increase token counts 1.7x–6.2x (Table 6), meaning models process significantly more tokens, but this compute cost difference is not analyzed against the performance drops reported. 
The 1-hour budget is mentioned but not varied to show performance as a function of compute." 330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": true, 334 "justification": "Benchmark construct validity is a central theme of the paper. §2 systematically compares complexity metrics of existing benchmarks against SWE-Bench (Figure 1), demonstrating that existing benchmarks lack real-world complexity. §5.5 validates that GeneBench performance correlates with SWE-Bench performance." 335 }, 336 "scaffold_confound_addressed": { 337 "applies": true, 338 "answer": true, 339 "justification": "For the main experiments, LLMs are evaluated directly without scaffolding, eliminating the scaffold confound. For the SWE-Bench comparison (§5.5), all models use the same scaffold (Agentless), controlling for scaffold differences across model comparisons." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": true, 346 "justification": "Temporal leakage is a core motivation. §1 (L3) notes 'more than 90% of SWE-Bench problems are dated before the training cut-off date of most recent LLMs.' GeneBench is explicitly designed to address this by creating new transformed versions that did not exist during model training." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "§5.4.1 avoids solution leakage in bug repair by selecting a different program's transformation as the ICL example rather than transformations of the same program. §1 (L2) discusses solution leakage in SWE-Bench where 'the issue report or the comments explicitly solving the issue' leak answers." 352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether original benchmark programs share structural similarities (e.g., similar coding patterns in HumanEval). 
The fine-tuning deduplication (Jaccard > 0.8) addresses near-duplicate training examples but not structural non-independence between train and test." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": true, 361 "justification": "NNMR (Nearest Neighbor Mismatch Rate, Table 6) measures the percentage of programs whose nearest neighbor in embedding space is NOT their transformed counterpart, serving as a concrete method to verify that transformed versions are sufficiently different from potentially-contaminated originals (19.4%–97.65% mismatch)." 362 } 363 } 364 }, 365 "claims": [ 366 { 367 "claim": "GeneBench increases relative complexity by 298% on average while readability decreases only 12%", 368 "evidence": "Table 4 shows RC increases from avg 0.08 to 0.21 (ranging 75%–650% by benchmark), while RR decreases from 0.73 to 0.65. Average PyLint score improves from 9.75 to 9.87 (§5.2).", 369 "supported": "strong" 370 }, 371 { 372 "claim": "LLMs suffer 14.9%–60.5% (avg 35.2%) performance drop across all tasks under GeneBench transformations", 373 "evidence": "Table 5 reports Δ values across 13 models and 5 benchmark-task combinations, all negative, with p-value = 6e-18 (§5.3).", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Few-shot learning with GeneBench examples does not help LLMs bypass the complexity challenge, and often worsens performance", 378 "evidence": "Table 5 Δ(IC) columns show average 41% drop under few-shot prompting. 
'Except for a few cases, the performance drop is even higher than in the original experiment' (§5.4.1).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Fine-tuned models still show 4.7%–30.3% (avg 17%) performance gap on GeneBench", 383 "evidence": "Figure 11 shows CodeLlama-13-I and DeepSeekCoder-6.7-B fine-tuned on 106k GeneBench samples still underperform on all tasks compared to original benchmarks (§5.4.2).", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "LLMs' performance on GeneBench bug repair is similar to SWE-Bench, making GeneBench a proxy for real-world performance", 388 "evidence": "Table 5: GeneBench repair rates (HumanEval Aft.=45.3%, ClassEval Aft.=20.6%) are closer to SWE-Sub (17.6%) than original benchmarks (66.4%, 32.1%). §5.5 provides detailed comparison.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Complexity, not representation shift, is the primary driver of LLMs' performance drop", 393 "evidence": "§5.3.4: NNMR shows no correlation with average performance drop. CRUXEval-I has highest NNMR (97.65%) but lowest Δ. Figure 10 shows statistically significant complexity difference between Success_Failure and Success_Success groups (p=4e-8).", 394 "supported": "strong" 395 }, 396 { 397 "claim": "Different versions of GeneBench produce notably different programs but consistently challenge LLMs", 398 "evidence": "§5.4.3: Three versions show CodeBLEU 0.60–0.80, Jaccard 0.40–0.64 (Figure 12). Figure 13 shows all versions produce similar performance drop trends across models and tasks.", 399 "supported": "strong" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No code or data release", 405 "detail": "Neither GeneBench's implementation nor its transformed benchmarks are publicly available. This is a significant limitation for a paper whose core contribution is a tool — readers cannot verify the transformations, reproduce results, or use GeneBench for their own evaluations." 
406 }, 407 { 408 "flag": "No human evaluation of transformation naturalness", 409 "detail": "The paper claims transformations maintain readability comparable to real-world code, supported only by automated metrics (PyLint score, readability metrics). No developer study validates whether the transformations look natural or representative. The Figure 2 vs Figure 3 comparison is anecdotal (one example)." 410 }, 411 { 412 "flag": "Limited real-world validation", 413 "detail": "The SWE-Bench comparison (§5.5) validating GeneBench as a real-world proxy is limited to 94 instances in one task (bug repair). The paper acknowledges no real-world benchmarks exist for code reasoning or translation, leaving 3 of 4 tasks without real-world validation." 414 }, 415 { 416 "flag": "Fine-tuning limited to smallest models", 417 "detail": "Fine-tuning experiments (§5.4.2) use only CodeLlama-13-I and DeepSeekCoder-6.7-B, the smallest models in their families. The 17% average gap may not hold for larger or proprietary models. Footnote 10 acknowledges this is due to GPU constraints, not experimental design choice." 418 }, 419 { 420 "flag": "No error bars on main results", 421 "detail": "Table 5 reports single-point results at temperature=0 without any uncertainty quantification. While deterministic output eliminates stochastic variance, it doesn't account for sensitivity to other factors (prompt wording, which Pareto solution is selected, operator ordering)." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 427 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 428 "year": 2024, 429 "relevance": "Primary real-world benchmark used for complexity comparison and validating GeneBench as a proxy for real-world performance." 
430 }, 431 { 432 "title": "Evaluating large language models trained on code", 433 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 434 "year": 2021, 435 "arxiv_id": "2107.03374", 436 "relevance": "HumanEval benchmark used as one of four GeneBench evaluation targets; foundational code LLM evaluation benchmark." 437 }, 438 { 439 "title": "ClassEval: A manually-crafted benchmark for evaluating llms on class-level code generation", 440 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 441 "year": 2023, 442 "arxiv_id": "2308.01861", 443 "relevance": "Class-level code generation benchmark used as GeneBench evaluation target for program repair." 444 }, 445 { 446 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 447 "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather", "Armando Solar-Lezama", "Gabriel Synnaeve", "Sida I Wang"], 448 "year": 2024, 449 "arxiv_id": "2401.03065", 450 "relevance": "Code reasoning benchmark used for input/output prediction tasks in GeneBench evaluation." 451 }, 452 { 453 "title": "Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM", 454 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"], 455 "year": 2024, 456 "relevance": "Related work on evolving benchmarks via LLM prompting; GeneBench differs by using genetic algorithms for code-to-code transformation." 457 }, 458 { 459 "title": "SWE-Bench+: Enhanced coding benchmark for llms", 460 "authors": ["Reem Aleithan", "Haoran Xue", "Mohammad Mahdi Mohajer"], 461 "year": 2024, 462 "arxiv_id": "2410.06992", 463 "relevance": "Identified solution leakage in SWE-Bench including SWE-Bench Verified, directly motivating GeneBench's contamination-resistant design." 
464 }, 465 { 466 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 467 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 468 "year": 2024, 469 "relevance": "Repair tool used in the SWE-Bench comparison experiment to evaluate LLMs on real-world bug repair." 470 }, 471 { 472 "title": "Code llama: Open foundation models for code", 473 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 474 "year": 2023, 475 "arxiv_id": "2308.12950", 476 "relevance": "One of the evaluated model families (3 variants), showing significant performance drops under GeneBench transformations." 477 }, 478 { 479 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming–The Rise of Code Intelligence", 480 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 481 "year": 2024, 482 "arxiv_id": "2401.14196", 483 "relevance": "Evaluated model family (3 variants + R1); DeepSeekCoder-33-I showed lowest open-source performance drop, attributed to high code training ratio (87%)." 484 }, 485 { 486 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 487 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 488 "year": 2025, 489 "arxiv_id": "2501.12948", 490 "relevance": "One of two evaluated reasoning models showing the smallest performance drops under GeneBench, demonstrating reasoning abilities help with complexity." 491 }, 492 { 493 "title": "R2E: Turning any Github Repository into a Programming Agent Environment", 494 "authors": ["Naman Jain", "Manish Shetty", "Tianjun Zhang", "King Han", "Koushik Sen", "Ion Stoica"], 495 "year": 2024, 496 "relevance": "Related real-world benchmark approach that collects methods from GitHub repos; used as comparison point for GeneBench's advantages." 497 }, 498 { 499 "title": "Concerned with Data Contamination? 
Assessing Countermeasures in Code Language Model", 500 "authors": ["Jialun Cao", "Wuqi Zhang", "Shing-Chi Cheung"], 501 "year": 2024, 502 "arxiv_id": "2403.16898", 503 "relevance": "Prior work on data contamination countermeasures for code LLMs; GeneBench operators are more complex than their semantic-preserving perturbations." 504 }, 505 { 506 "title": "Repository-level compositional code translation and validation", 507 "authors": ["Ali Reza Ibrahimzada", "Kaiyao Ke", "Mrigank Pawagi"], 508 "year": 2024, 509 "arxiv_id": "2410.24117", 510 "relevance": "Related work on repository-level code translation showing real-world translation is non-trivial, motivating GeneBench's complexity-aware evaluation." 511 }, 512 { 513 "title": "SemCoder: Training Code Language Models with Comprehensive Semantics", 514 "authors": ["Yangruibo Ding", "Jinjun Peng", "Marcus J Min", "Gail Kaiser", "Junfeng Yang", "Baishakhi Ray"], 515 "year": 2024, 516 "arxiv_id": "2406.01006", 517 "relevance": "Evaluated model instruction-tuned with execution data; showed less performance drop in some tasks, suggesting execution-aware training helps with complexity." 518 } 519 ], 520 "engagement_factors": { 521 "practical_relevance": { 522 "score": 2, 523 "justification": "GeneBench could be used by researchers to augment existing benchmarks, but the tool is not released, limiting immediate practical use." 524 }, 525 "surprise_contrarian": { 526 "score": 2, 527 "justification": "Demonstrates a 35% average performance drop when adding real-world complexity to benchmarks, challenging the narrative that high benchmark scores indicate real-world capability." 528 }, 529 "fear_safety": { 530 "score": 0, 531 "justification": "No AI safety or security concerns raised; the paper is about evaluation methodology." 532 }, 533 "drama_conflict": { 534 "score": 1, 535 "justification": "The 'benchmarks are too easy' angle is mildly provocative but already well-known in the community." 
536 }, 537 "demo_ability": { 538 "score": 0, 539 "justification": "No code, demo, or tool is released — readers cannot try GeneBench themselves." 540 }, 541 "brand_recognition": { 542 "score": 1, 543 "justification": "From UIUC (well-known CS program); evaluates GPT-4o and o4-mini but the paper is not from a famous AI lab." 544 } 545 } 546 }