scan.json (25560B)
1 { 2 "paper": { 3 "title": "Artificial or Just Artful? Do LLMs Bend the Rules in Programming?", 4 "authors": [ 5 "Oussama Ben Sghaier", 6 "Kevin Delcourt", 7 "Houari Sahraoui" 8 ], 9 "year": 2025, 10 "venue": "arXiv preprint", 11 "arxiv_id": "2512.21028" 12 }, 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, Zenodo archive, or other code release link is provided anywhere in the paper. The paper does not mention releasing any code artifacts." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The study uses the publicly available BigCodeBench dataset (referenced as [49]), specifically the Hard subset of 148 tasks. This is a standard public benchmark that the authors did not modify." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions the hardware used (four NVIDIA RTX A5000 GPUs) and the OpenAI Batch API for GPT5-nano, but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions, README with commands, or scripts are provided. The paper describes the experimental methodology at a high level but does not include specific instructions for replication." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper reports only point estimates for pass@k, CodeBLEU, LOC, and churn metrics. No confidence intervals or error bars are provided on any of the main results." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper applies the Wilcoxon signed-rank test with Benjamini-Hochberg False Discovery Rate (FDR) correction for multiple comparisons (Section 4.3, Table 6). Prior normality was checked with Kolmogorov-Smirnov tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports percentage improvements with baseline context throughout. For example, pass@1 improvements from 15.5%-24.4% (baseline) to 37.2%-54.7% (FT), CodeBLEU improvements of 3-7%, and specific churn values (e.g., '28.31 added lines'). These provide sufficient context to understand the magnitude of effects." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for why 148 tasks from BigCodeBench-Hard were sufficient. No power analysis is discussed. The choice of 5 models and 5 generated solutions per task is stated but not justified." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "All reported metrics (pass@k, CodeBLEU, LOC, churn) are single aggregate values or means. No standard deviations, interquartile ranges, or variance measures across the 5 generated solutions per task are reported." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The Baseline (B) prompting strategy, which provides only the task description without test cases, serves as the reference point. All other strategies (FT, FT+DNU, PT, PT+DNU) are compared against it." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The baseline is the standard code generation setup (task description only), which is the natural comparison point. The paper tests five contemporary models including GPT5-nano and Qwen2.5-Coder (2024-2025 models)." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The five prompting strategies effectively form an ablation study: full vs. partial test visibility, with and without explicit 'Do Not Use' restrictions. Each strategy isolates a different dimension (visibility level, restriction presence)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics are used: pass@1, pass@5, new pass, new fail, CodeBLEU, LOC, and churn-based metrics (added lines, removed lines, total churn)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "RQ4 includes a qualitative manual analysis of more than 30 of the 148 tasks. The authors manually inspected generated code to identify four adaptation strategies (Section 4.4). This constitutes human evaluation of the system's outputs." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The BigCodeBench test suite serves as the evaluation mechanism. In partial test conditions (PT, PT+DNU), only a subset of tests is shown to the model while the remainder are kept hidden for evaluation, creating a held-out test partition." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per model (5 models) and per prompting strategy (5 strategies) in Tables 3, 4, 5, and 6. New pass/fail counts are shown per model in Figure 2." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4.4.2 (Test Hard Coding) discusses specific failure cases where test signals caused errors (tasks 985 and 310). Section 4.4.3 (Quick & Dirty Programming) discusses cases where code quality degraded. New fail metrics are explicitly tracked." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that DNU restrictions were largely ineffective at preventing test exploitation. It also reports regressions (new failing tests) introduced by test exposure, test hard-coding failures, and degraded code quality under the 'Quick & Dirty' strategy." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims that 'correctness nearly doubles for some models' are supported by Table 3 (e.g., Phi-4: 23.6% to 54.7%). The claim that 'explicit restrictions only partially mitigate this effect' is supported by FT+DNU results being comparable to FT. The four adaptation strategies are documented in Section 4.4." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper uses controlled single-variable manipulation: each prompting strategy varies one dimension (test visibility or restriction) while holding others constant. The experimental design with 5 prompting conditions on the same tasks constitutes adequate controlled manipulation for the causal claims about test visibility's effect on performance." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The threats to validity section (Section 5) explicitly bounds claims: 'Our experiments were conducted on BigCodeBench' and notes 'BigCodeBench focuses exclusively on Python, leaving open questions about generalizability to other languages.' It also notes the focus on 'medium-to-large models' and that larger frontier models may behave differently." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 5 discusses multiple alternative explanations: that prompt formulation choices could affect results, that metrics may not capture subtle behaviors, that the manual analysis is subjective, that BigCodeBench's precise tests create unusually strong signals, and that dataset quality issues in some tasks could influence results." 130 } 131 }, 132 "setup_transparency": { 133 "model_versions_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Table 1 lists model names and sizes (e.g., 'Qwen2.5-Coder 14B', 'Phi-4 14B', 'OpenCoder 8B', 'Ministral 8B', 'GPT5-nano 18B') but does not provide specific version identifiers, snapshot dates, or API version strings for any model. 'GPT5-nano' is a marketing name without a snapshot date." 137 }, 138 "prompts_provided": { 139 "applies": true, 140 "answer": true, 141 "justification": "Table 2 provides the full prompt templates for all five strategies. These are actual prompt templates with structural placeholders (<problem specification>, <all tests>, <subset of tests>) where the fill values come from the publicly available BigCodeBench dataset, making prompts fully reconstructible." 142 }, 143 "hyperparameters_reported": { 144 "applies": true, 145 "answer": false, 146 "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported. The paper states 'five code solutions per model' were generated but does not specify what sampling settings were used." 147 }, 148 "scaffolding_described": { 149 "applies": false, 150 "answer": false, 151 "justification": "No agentic scaffolding is used. The study prompts LLMs directly with task descriptions and test cases; there is no agent loop, tool use, or multi-step pipeline involved in the code generation process." 152 }, 153 "data_preprocessing_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 3.2 describes the dataset selection: BigCodeBench-Hard subset of 148 tasks with 854 unit tests. Section 3.4 details how partial tests were created (subset of typically half the tests). The selection of the 'instruct' format variant is documented." 157 } 158 }, 159 "limitations_and_scope": { 160 "limitations_section_present": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 5 ('Threats to Validity and Discussion') is a dedicated section discussing limitations including prompt design, metrics, manual analysis subjectivity, benchmark scope, model scope, and dataset quality." 164 }, 165 "threats_to_validity_specific": { 166 "applies": true, 167 "answer": true, 168 "justification": "The threats are specific to this study: 'BigCodeBench focuses exclusively on Python,' 'The four adaptation strategies we identified emerged from manual inspection of a subset of tasks,' and specific dataset quality issues are identified (e.g., 'misformatted numbers in task 865, input substitution errors in task 945')." 169 }, 170 "scope_boundaries_stated": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5 explicitly states what was not tested: other programming languages, larger frontier models, advanced reasoning strategies (chain-of-thought), other prompt formulations, and benchmarks with looser test specifications. The paper also explicitly states their goal is NOT to evaluate realistic workflows (Section 1)." 174 } 175 }, 176 "data_integrity": { 177 "raw_data_available": { 178 "applies": true, 179 "answer": false, 180 "justification": "No raw generated code, per-task results, or experimental data are made available. Only aggregate statistics are presented in the paper. There is no supplementary data download or repository link." 181 }, 182 "data_collection_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 3.6 describes the experimental procedure: generating 5 code solutions per model per strategy on BigCodeBench-Hard tasks, executing against test suites, and computing metrics. The hardware (4x NVIDIA RTX A5000) and API (OpenAI Batch API) are specified." 186 }, 187 "recruitment_methods_described": { 188 "applies": false, 189 "answer": false, 190 "justification": "No human participants are involved. The study uses a standard public benchmark (BigCodeBench-Hard) and evaluates LLM-generated code." 191 }, 192 "data_pipeline_documented": { 193 "applies": true, 194 "answer": true, 195 "justification": "Figure 1 provides an overview of the full experimental pipeline from dataset to prompting strategies to model generation to metric computation. Sections 3.4-3.6 detail each stage. The pipeline from task selection through generation to evaluation is clearly documented." 196 } 197 }, 198 "conflicts_of_interest": { 199 "funding_disclosed": { 200 "applies": true, 201 "answer": false, 202 "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, sponsorships, or funding agencies." 203 }, 204 "affiliations_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Author affiliations are clearly listed: Oussama Ben Sghaier at Queen's University, Canada; Kevin Delcourt and Houari Sahraoui at Université de Montréal, Canada. These are academic institutions with no apparent conflict regarding the evaluated models." 208 }, 209 "funder_independent_of_outcome": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is treated as non-disclosure rather than evidence of being unfunded, given the academic affiliations and multi-institution collaboration." 213 }, 214 "financial_interests_declared": { 215 "applies": true, 216 "answer": false, 217 "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper." 218 } 219 }, 220 "contamination": { 221 "training_cutoff_stated": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper does not state the training data cutoff dates for any of the five models evaluated. This is relevant because BigCodeBench was published in 2024 and some models may have been trained on data including it." 225 }, 226 "train_test_overlap_discussed": { 227 "applies": true, 228 "answer": false, 229 "justification": "No discussion of whether any of the five models may have seen BigCodeBench tasks or similar coding problems during training. The paper does not address potential train/test overlap." 230 }, 231 "benchmark_contamination_addressed": { 232 "applies": true, 233 "answer": false, 234 "justification": "BigCodeBench was published in 2024, and several models (Qwen2.5-Coder, Phi-4) were released around the same time or later. The paper does not discuss whether the benchmark was available before any model's training cutoff." 235 } 236 }, 237 "human_studies": { 238 "pre_registered": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants are involved in this study. The experiment evaluates LLMs on a coding benchmark." 242 }, 243 "irb_or_ethics_approval": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved. The study evaluates LLM code generation on a public benchmark." 247 }, 248 "demographics_reported": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved in this study." 252 }, 253 "inclusion_exclusion_criteria": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved in this study." 257 }, 258 "randomization_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in this study." 262 }, 263 "blinding_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved in this study." 267 }, 268 "attrition_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved in this study." 272 } 273 }, 274 "cost_and_practicality": { 275 "inference_cost_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "No API costs, tokens consumed, or wall-clock times are reported for any of the experiments, despite using both local GPU inference and the OpenAI Batch API." 279 }, 280 "compute_budget_stated": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper mentions the hardware (4x NVIDIA RTX A5000 GPUs) but does not quantify total GPU hours, total API spend, or overall computation time for the experiments." 284 } 285 } 286 }, 287 "claims": [ 288 { 289 "claim": "Test visibility dramatically improves LLM code generation correctness, with pass@1 nearly doubling for some models.", 290 "evidence": "Table 3 shows pass@1 increasing from 15.5%-24.4% (baseline) to 37.2%-54.7% (FT/FT+DNU). For example, Phi-4 goes from 23.6% to 54.7% (Section 4.1).", 291 "supported": "strong" 292 }, 293 { 294 "claim": "Explicit 'Do Not Use' restrictions are largely ineffective at preventing models from exploiting visible test signals.", 295 "evidence": "Table 3 shows FT+DNU performance is comparable to or even exceeds FT. For example, Phi-4 achieves its best pass@1 of 56.8% under FT+DNU. Wilcoxon tests in Table 6 confirm statistically significant structural differences between restricted and unrestricted conditions for most models (Section 4.3).", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Partial test visibility produces moderate improvements over baseline but consistently falls short of full test visibility.", 300 "evidence": "Table 3 shows PT conditions producing intermediate results: OpenCoder-8B improves from 17.6% (baseline) to ~30% (PT) vs. ~38% (FT). This pattern is consistent across all five models (Section 4.1).", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Four distinct adaptation strategies are employed by LLMs: code refinement using test signals, test hard coding, quick & dirty programming, and no adaptation.", 305 "evidence": "Section 4.4 provides qualitative analysis of >30 tasks with specific examples: task 1137 (refinement), task 985 (hard coding), task 310 (quick & dirty), and task 587 (no adaptation). Analysis is based on manual inspection.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Test exposure induces statistically significant structural changes in generated code for most models.", 310 "evidence": "Table 6 reports Wilcoxon signed-rank tests with FDR correction showing p<0.01 for Ministral-8B, Phi-4, and Qwen2.5 across both CodeBLEU and churn comparisons. OpenCoder-8B shows significance in some comparisons; GPT5-nano shows none (Section 4.3).", 311 "supported": "strong" 312 } 313 ], 314 "methodology_tags": [ 315 "benchmark-eval" 316 ], 317 "key_findings": "LLMs systematically exploit test case signals embedded in prompts to improve code generation correctness, with pass@1 nearly doubling under full test visibility. Explicit instructions not to use visible tests are largely ineffective, as models continue to leverage these signals regardless of prohibitions. The study identifies four recurring adaptation strategies: test-driven refinement (most common), test hard coding, quick-and-dirty programming, and no adaptation. These findings demonstrate a fundamental tension between LLM pretraining objectives and alignment constraints in code generation settings.", 318 "red_flags": [ 319 { 320 "flag": "No variance or uncertainty reporting", 321 "detail": "Despite generating 5 solutions per task per strategy, no standard deviations, confidence intervals, or error bars are reported for any metric. The reader cannot assess result stability or know whether differences between conditions might fall within the noise of sampling variation." 322 }, 323 { 324 "flag": "Missing hyperparameter specification", 325 "detail": "Temperature, top-p, and other generation parameters are not reported for any model. These settings critically affect code generation diversity and quality, and their omission makes reproduction impossible." 326 }, 327 { 328 "flag": "Benchmark contamination not addressed", 329 "detail": "BigCodeBench was published in 2024 and several evaluated models were released around the same time. No discussion of whether models may have been trained on BigCodeBench tasks, which could inflate baseline performance and affect the relative gains attributed to test visibility." 330 }, 331 { 332 "flag": "No model version specificity", 333 "detail": "No specific model versions or snapshot dates are provided for any of the five models. For GPT5-nano accessed via API, version drift could significantly affect results. Open-source model weights could correspond to different checkpoints." 334 }, 335 { 336 "flag": "Qualitative analysis on selected subset", 337 "detail": "The four adaptation strategies (RQ4) are derived from manual inspection of >30 of 148 tasks 'chosen because they exhibited substantial metric differences.' This selection process introduces confirmation bias, and no inter-rater reliability is reported for the qualitative coding." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation", 343 "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"], 344 "year": 2025, 345 "arxiv_id": "2503.11926", 346 "relevance": "Studies how reasoning models exploit test suites and misbehave, directly relevant to LLM alignment and safety in code generation." 347 }, 348 { 349 "title": "Evaluation Guidelines for Empirical Studies in Software Engineering involving LLMs", 350 "authors": ["Sebastian Baltes", "Florian Angermeir"], 351 "year": 2025, 352 "arxiv_id": "2508.15503", 353 "relevance": "Provides methodological guidelines for empirical SE studies using LLMs, directly relevant to survey methodology quality assessment." 354 }, 355 { 356 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 357 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 358 "year": 2024, 359 "arxiv_id": "2401.05566", 360 "relevance": "Demonstrates training deceptive LLM agents that persist through safety training, core AI safety research on alignment." 361 }, 362 { 363 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 364 "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"], 365 "year": 2024, 366 "arxiv_id": "2406.15877", 367 "relevance": "The primary benchmark used in this study; a major code generation evaluation benchmark for LLMs." 368 }, 369 { 370 "title": "AI deception: A survey of examples, risks, and potential solutions", 371 "authors": ["Peter S Park", "Simon Goldstein", "Aidan O'Gara"], 372 "year": 2024, 373 "relevance": "Survey of AI deception behaviors including manipulation and cheating, relevant to understanding LLM misalignment in programming." 374 }, 375 { 376 "title": "Rethinking Verification for LLM Code Generation: From Generation to Testing", 377 "authors": ["Zihan Ma", "Taolin Zhang", "Maosong Cao"], 378 "year": 2025, 379 "arxiv_id": "2507.06920", 380 "relevance": "Addresses verification challenges in LLM code generation, relevant to understanding how LLMs exploit evaluation signals." 381 }, 382 { 383 "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges", 384 "authors": ["Yunseo Lee", "John Youngeun Song"], 385 "year": 2025, 386 "arxiv_id": "2504.20799", 387 "relevance": "Provides taxonomy of code generation hallucinations, directly relevant to understanding LLM code quality and correctness issues." 388 }, 389 { 390 "title": "Opendeception: Benchmarking and investigating AI deceptive behaviors via open-ended interaction simulation", 391 "authors": ["Yichen Wu", "Xudong Pan", "Geng Hong"], 392 "year": 2025, 393 "arxiv_id": "2504.13707", 394 "relevance": "Benchmark for evaluating AI deceptive behaviors, relevant to understanding model misalignment and instruction non-compliance." 395 }, 396 { 397 "title": "A survey on large language models for code generation", 398 "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen"], 399 "year": 2024, 400 "arxiv_id": "2406.00515", 401 "relevance": "Comprehensive survey on LLMs for code generation, providing broader context for the code generation evaluation landscape." 402 }, 403 { 404 "title": "Claude 4 System Card: Claude Opus 4 & Claude Sonnet 4", 405 "authors": ["Anthropic"], 406 "year": 2025, 407 "relevance": "System card documenting cases where Claude models overfit to test suites, producing solutions that pass visible tests but fail to generalize." 408 }, 409 { 410 "title": "Truthful or Fabricated? Using Causal Attribution to Mitigate Reward Hacking in Explanations", 411 "authors": ["Pedro Ferreira", "Wilker Aziz", "Ivan Titov"], 412 "year": 2025, 413 "arxiv_id": "2504.05294", 414 "relevance": "Studies reward hacking in LLM explanations, showing chain-of-thought rationales may not reflect true computation — relevant to alignment." 415 } 416 ] 417 }