scan.json (31118B)
1 { 2 "paper": { 3 "title": "On the Reliability and Explainability of Language Models for Program Generation", 4 "authors": [ 5 "Yue Liu", 6 "Chakkrit Tantithamthavorn", 7 "Yonghui Liu", 8 "Li Li" 9 ], 10 "year": 2023, 11 "venue": "ACM Transactions on Software Engineering and Methodology", 12 "arxiv_id": "2302.09587", 13 "doi": "10.1145/3641540" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval", "observational"], 18 "key_findings": "Evaluating 8 pre-trained language models on 5 program generation benchmarks reveals that performance is substantially inflated by data duplication between training and testing sets (>20% identical test samples in several datasets). Models frequently copy input sequences rather than generating new code — up to 80% output-input duplication for some models on code review tasks. SHAP-based explainability analysis shows models prioritize identifiers and keywords but exhibit poor robustness even to removal of supposedly unimportant tokens.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states 'we publish the studied dataset and a replication package, which are publicly available on GitHub' with a concrete URL: https://github.com/yueyueL/ProgramGen-LMs-Reliability (footnote 1, Section 1)." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "All five benchmark datasets used are publicly available (CodeXGLUE, Tufano et al., CodeReview), and the replication package includes the studied datasets. Section 3.2 describes each dataset with public sources." 
30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "Section 3.4 lists hardware (AMD Ryzen 9 5950X, 64GB RAM, NVIDIA RTX 3090) and mentions PyTorch, Transformers, Captum, and Ecco libraries, but no library version numbers, requirements.txt, or Dockerfile are provided in the paper." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper provides a GitHub replication package (footnote 1) and Section 3.4 describes the experimental setup including model configurations, data splits from CodeXGLUE, and fine-tuning procedures. The replication package is explicitly intended for reproduction." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Table 2 and all performance tables report only point estimates (e.g., '15.20% accuracy') with no confidence intervals or error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper compares 8 models across 12 datasets and makes claims like 'CodeT5+ shows superior performance' based solely on comparing raw accuracy numbers without any statistical significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Performance differences are consistently reported with baseline context, e.g., Table 3 shows 'Original Accuracy: 70.60% → New Accuracy: 53.09%' when removing high-similarity instances, and 'accuracy of CodeT5+ decreases from 15.2% to 9.19%' for token removal (Section 4.3.2)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper uses standard benchmark datasets without justifying why these sample sizes are appropriate or adequate for the claims made. No power analysis is discussed." 
62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "All results appear to be from single experimental runs. No standard deviations, variance across seeds, or spread measures are reported anywhere in the paper." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Eight models are compared, including T5 (NL-only pre-training) as a baseline against code-specialized models. Section 3.1 describes the rationale for including encoder-only, decoder-only, and encoder-decoder architectures for completeness." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The baselines include CodeT5+ (2023), CodeReviewer (2022), CodeT5 (2021), and other recent models. These represent the state of the art at the time of writing." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "The paper does not propose a multi-component system — it is an empirical evaluation of existing pre-trained models. The token removal experiments (Section 4.3.2) are robustness analyses, not component ablations." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses both accuracy (exact match rate) and BLEU-4 score. Section 3.3 describes both metrics and Table 3 reports both for CodeReviewer and CodeT5+." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation of generated code quality is performed. All evaluation is automated (accuracy, BLEU, SHAP analysis). Human evaluation of code correctness and usefulness would be relevant to reliability claims." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 3.4 states 'For each dataset, we use identical training, validation, and test splits as well as fine-tuning approaches described in CodeXGLUE and original papers.' 
Separate test sets are used for evaluation." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 provides per-dataset breakdowns across all 12 subsets. Figures 2 and 3 further break down performance by similarity score ranges. Figure 5 breaks down feature importance by token type." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 4.2.3 discusses the failure mode of models copying inputs. Figure 4 shows concrete examples of test instances with duplicated sources and different targets. Section 4.3 uses XAI to analyze failure patterns." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper's core contribution is negative results: performance inflation from data duplication, models merely copying inputs (Table 4), significant performance drops from minor input changes (Figure 6), and overall low accuracy on most tasks." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims (1) 'inappropriate performance evaluation stemming from severe data duplication, causing over-optimistic results' — supported by RQ2 (Section 4.2, Tables 3-4), and (2) 'models can recognize code grammar and structural information, but they exhibit limited robustness' — supported by RQ3 (Section 4.3, Figures 5-7)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims about data duplication inflating performance, supported by a controlled experiment: removing high-similarity instances and measuring performance drop (Table 3). The token removal study (Section 4.3.2) uses controlled single-variable manipulation to establish causal relationships between tokens and output." 
126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title 'On the Reliability and Explainability of Language Models for Program Generation' and conclusions like 'Automated program generation models are not reliable' generalize beyond the 8 tested models and 5 datasets. The study primarily covers Java/C# with specific model architectures, but conclusions are framed broadly." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "Section 5.2 discusses threats related to model settings and dataset selection but does not discuss alternative explanations for the findings. For example, it does not consider whether performance drops from removing duplicates might reflect difficulty differences in the remaining test set rather than memorization." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper's central thesis IS about the gap between benchmark metrics (accuracy/BLEU) and actual model reliability. The paper explicitly argues that high benchmark accuracy is a poor proxy for practical reliability, providing extensive evidence (RQ2) that measured performance does not reflect real capability." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are identified by name (CodeT5, CodeBERT, etc.) with '12-layer base version' specified (Section 3.4), but exact HuggingFace model paths or checkpoint IDs are not provided. For CodeTrans, the paper says 'the version with the most downloads,' which is not reproducibly specific." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "The paper does not use prompting. All models are fine-tuned encoder-only, decoder-only, or encoder-decoder models that directly generate code from input sequences."
153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "Section 3.4 mentions beam search size of 1 and references CodeXGLUE and original papers for fine-tuning settings, but key hyperparameters (learning rate, batch size, epochs, optimizer) are not stated in this paper." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. Models are fine-tuned and run standard encoder-decoder inference." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.2 describes each dataset's structure, splits, and sizes in detail. Section 3.4 describes the identical data splits from CodeXGLUE and original papers. The analysis methodology (BLEU-4 similarity computation, token type classification, special token handling) is described in Sections 4.2-4.3." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5.2 'Threats to Validity' provides a substantive discussion of internal and external threats, covering model architecture choices, hyper-parameter settings, dataset selection, and generalizability." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.2 discusses specific threats: 'The primary threat to internal validity mainly lies in the model architecture and hyper-parameter setting. We use eight program generation models, which are based on the same model settings in the original papers.' It also notes specific dataset coverage (four task types, multiple programming languages, three input types)." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper does not explicitly state what it did NOT test. 
It does not mention excluding large language models (GPT-3.5/4, ChatGPT), larger model sizes, or additional programming languages. Section 5.2 implies scope limitations but does not state specific exclusions." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper publishes a replication package on GitHub (footnote 1) including the studied datasets, and all five benchmark datasets are publicly available from their original sources." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 describes each dataset's collection process by referencing original papers: Tufano et al. collected from Gerrit repositories, Bugs2Fix from GitHub bug-fixing commits, CodeTrans from CodeXGLUE, CONCODE from ~33,000 Java projects. Collection details are traced to original sources." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are standard public benchmarks." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline from datasets to analysis is documented: standard splits are used (Section 3.4), BLEU-4 similarity computation is formalized with equations (Equations 1-4 in Section 4.2), and token classification for explainability is described in Section 4.3.1." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The Acknowledgment section states: 'Chakkrit Tantithamthavorn was supported by the Australian Research Council's Discovery Early Career Researcher Award (DECRA) funding scheme (DE200100941).'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: Monash University (Australia) and Beihang University (China). 
None of the authors are affiliated with companies whose models are evaluated (all are open-source academic models)." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "The Australian Research Council is a government funding body with no financial interest in the outcome of the model evaluations." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "While the paper extensively discusses fine-tuning train/test overlap, it does not state the pre-training data cutoff dates for the base models (CodeT5, CodeBERT, etc.), which could have been pre-trained on data containing the benchmark examples." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "This is the paper's primary contribution. RQ2 (Section 4.2) provides a comprehensive analysis of train-test overlap using BLEU-4 similarity scoring, finding >20% of test samples are identical to training samples in several datasets (Tufano et al., CodeTrans-Dataset)." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": true, 245 "justification": "The entire RQ2 addresses benchmark contamination. Section 4.2.1 analyzes train-test duplication with formal metrics (Equations 1-2), Section 4.2.2 examines intra-test-set duplication, and Section 4.2.3 analyzes output-input similarity. Table 3 quantifies performance inflation from contamination." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. It is a computational evaluation of language models on benchmark datasets." 
253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, latency, or per-example time is reported despite fine-tuning and running inference on 8 models across 12 dataset configurations." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Section 3.4 lists hardware (RTX 3090, AMD Ryzen 9, 64GB RAM) but does not state total GPU hours, training time, or overall computational budget for the 96 model-dataset experiments." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single experimental runs." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper does not state how many runs produced the results. No mention of 'averaged over K runs' or similar." 
307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper states it uses original paper settings and 'the goal of our work is not to find the best setting' (Section 5.2) but does not report any search budget or confirm that no tuning was performed." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "Section 3.4 states 'we use identical training, validation, and test splits as well as fine-tuning approaches described in CodeXGLUE and original papers.' Using standard configurations from prior work is a justified selection strategy for a comparative study." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper compares 8 models across 12 datasets, making numerous comparisons, but performs no statistical tests at all — and therefore no multiple comparison correction." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement fine-tuning and evaluation pipelines for all 8 models but do not discuss potential bias from their own implementation choices affecting baseline performance." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "While the paper controls for model size by using 12-layer base versions (Section 3.4), performance is not reported as a function of compute budget, and compute differences between models are not discussed." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "This is the paper's central contribution. RQ2 directly questions whether benchmark metrics reflect real model capability, finding that data duplication, output-input copying, and test set redundancy undermine construct validity of program generation benchmarks." 
337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. Models are fine-tuned and run standard sequence-to-sequence inference." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The paper does not discuss temporal leakage — whether pre-trained models' training data temporally overlaps with the benchmark datasets. The analysis focuses on fine-tuning train/test overlap, not pre-training data contamination." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Feature leakage is not specifically discussed. The paper does not analyze whether evaluation inputs leak answer information." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": true, 358 "justification": "RQ2 extensively addresses non-independence: Section 4.2.1 quantifies train-test overlap using BLEU-4 similarity (Equations 1-2), and Section 4.2.2 quantifies intra-test-set duplication (Equations 3-4). Finding 2 and Finding 3 detail the prevalence across all 12 datasets." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": true, 363 "justification": "The paper uses BLEU-4 scoring as a concrete leakage detection method (Equations 1-4), computing maximum pairwise similarity scores between test and training instances to identify exact and near-duplicate contamination." 
364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Pre-trained encoder-decoder models (CodeReviewer, CodeT5+) show superior performance compared to encoder-only and decoder-only models across diverse program generation datasets.", 370 "evidence": "Table 2 shows CodeT5+ and CodeReviewer consistently achieving highest accuracy across most of the 12 datasets, e.g., CodeReviewer at 30.43% on CodeReview and CodeT5+ at 70.6% on C#2Java (Section 4.1).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Multiple program generation benchmark datasets contain substantial data duplication between training and testing sets, inflating performance metrics.", 375 "evidence": "Figure 2 and Table 3 show >20% of test samples are identical to training data in Tufano et al. datasets. Removing high-similarity instances (>0.6 BLEU) causes CodeT5+ accuracy to drop from 70.6% to 53.09% on C#2Java (Section 4.2.1).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "10 of 12 studied datasets contain duplicated source sequences within their test sets, requiring models to generate different targets for identical inputs.", 380 "evidence": "Figure 3 shows intra-test-set duplication across datasets, with Android_S having 11% duplicates and Ovirt_M over 10%. Figure 4 provides a concrete example from Android_S (Section 4.2.2).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Language models frequently generate outputs identical to input sequences in code review and repair tasks, rather than generating updated code.", 385 "evidence": "Table 4 shows output-input duplication rates up to 84% (CodeTrans on B2F_M), 80% (T5 on Android_M), and 35% (CodeT5+ on B2F_S). 
Only CONCODE (text-to-code) shows 0% duplication (Section 4.2.3).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Identifiers and keywords are consistently assigned higher feature importance than operators and separators by program generation models.", 390 "evidence": "Figure 5 shows average feature importance of token types across CodeT5, CodeReviewer, and CodeT5+ on 10 datasets, with identifiers and keywords consistently receiving higher SHAP importance scores (Section 4.3.1).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Language models exhibit poor robustness to input changes — removing even a single least-important token significantly degrades performance.", 395 "evidence": "Figure 6 shows CodeT5+ accuracy drops from 15.2% to 9.19% on Android_S after removing just one token, and to 0% after removing 10 tokens. Figure 7 confirms the pattern across all 12 datasets for three models (Section 4.3.2).", 396 "supported": "strong" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No statistical significance testing", 402 "detail": "The paper compares 8 models across 12 datasets and draws conclusions about relative performance without any statistical significance tests. All comparisons are based on raw accuracy numbers, making it impossible to determine whether observed differences are meaningful or due to random variation." 403 }, 404 { 405 "flag": "No variance or multi-run reporting", 406 "detail": "All results appear to be from single experimental runs with no standard deviations, confidence intervals, or cross-seed variation reported. For fine-tuned neural models, single-run results can be unreliable due to random initialization and training stochasticity." 407 }, 408 { 409 "flag": "Over-broad conclusions from limited scope", 410 "detail": "The paper concludes 'Automated program generation models are not reliable' (Section 7) based on 8 specific models (all relatively small, 12-layer base versions) on 5 datasets. 
Large language models like GPT-3.5/4 and larger model variants are not studied, yet conclusions are stated without qualification." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "The adverse effects of code duplication in machine learning models of code", 416 "authors": ["Miltiadis Allamanis"], 417 "year": 2019, 418 "relevance": "Directly relevant finding that code duplication inflates ML model performance metrics by up to 100%, foundational work for this paper's RQ2." 419 }, 420 { 421 "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation", 422 "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"], 423 "year": 2021, 424 "arxiv_id": "2109.00859", 425 "relevance": "State-of-the-art pre-trained code model evaluated in the study, widely used in code generation research." 426 }, 427 { 428 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 429 "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"], 430 "year": 2020, 431 "relevance": "Foundational encoder-based pre-trained code model evaluated in the study." 432 }, 433 { 434 "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation", 435 "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"], 436 "year": 2021, 437 "relevance": "Benchmark suite used for datasets and evaluation protocol in this study; its data quality is a subject of the findings." 438 }, 439 { 440 "title": "Natural attack for pre-trained models of code", 441 "authors": ["Zhou Yang", "Jieke Shi", "Junda He", "David Lo"], 442 "year": 2022, 443 "relevance": "Demonstrates adversarial vulnerability of pre-trained code models, complementary to this paper's robustness analysis." 
444 }, 445 { 446 "title": "An extensive study on pre-trained models for program understanding and generation", 447 "authors": ["Zhengran Zeng", "Hanzhuo Tan", "Haotian Zhang"], 448 "year": 2022, 449 "relevance": "Empirical evaluation of pre-trained code models suggesting more rigorous evaluations are needed, closely related work." 450 }, 451 { 452 "title": "Pitfalls in Language Models for Code Intelligence: A Taxonomy and Survey", 453 "authors": ["Xinyu She", "Yue Liu", "Yanjie Zhao"], 454 "year": 2023, 455 "arxiv_id": "2310.17903", 456 "relevance": "Survey of common experimental biases in LM-for-code research, including data noise and labeling errors." 457 }, 458 { 459 "title": "Large language models for software engineering: A systematic literature review", 460 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 461 "year": 2023, 462 "arxiv_id": "2308.10620", 463 "relevance": "Systematic review of LLMs in software engineering, providing survey-level context for code generation capabilities." 464 }, 465 { 466 "title": "Codet5+: Open code large language models for code understanding and generation", 467 "authors": ["Yue Wang", "Hung Le", "Akhilesh Deepak Gotmare"], 468 "year": 2023, 469 "arxiv_id": "2305.07922", 470 "relevance": "Enhanced pre-trained code model evaluated as top performer in this study." 471 }, 472 { 473 "title": "On learning meaningful code changes via neural machine translation", 474 "authors": ["Michele Tufano", "Jevgenija Pantiuchina", "Cody Watson", "Gabriele Bavota", "Denys Poshyvanyk"], 475 "year": 2019, 476 "relevance": "Source of the Tufano et al. code review dataset found to have severe train-test duplication in this study." 
477 }, 478 { 479 "title": "You autocomplete me: Poisoning vulnerabilities in neural code completion", 480 "authors": ["Roei Schuster", "Congzheng Song", "Eran Tromer", "Vitaly Shmatikov"], 481 "year": 2021, 482 "relevance": "Demonstrates poisoning attacks on code completion models, relevant to reliability and security of code generation." 483 }, 484 { 485 "title": "On the importance of building high-quality training datasets for neural code search", 486 "authors": ["Zhensu Sun", "Li Li", "Yan Liu", "Xiaoning Du"], 487 "year": 2022, 488 "relevance": "Demonstrates that data quality significantly impacts code intelligence model performance, supporting this paper's data quality findings." 489 } 490 ], 491 "engagement_factors": { 492 "practical_relevance": { 493 "score": 2, 494 "justification": "Findings about benchmark duplication and model unreliability are directly useful for researchers evaluating or building code generation models." 495 }, 496 "surprise_contrarian": { 497 "score": 2, 498 "justification": "Reveals that widely-used benchmark results are inflated by severe data duplication, and models often just copy inputs — challenging the perceived effectiveness of code generation models." 499 }, 500 "fear_safety": { 501 "score": 0, 502 "justification": "No direct AI safety or security concerns raised; the paper is about evaluation methodology, not harmful capabilities." 503 }, 504 "drama_conflict": { 505 "score": 1, 506 "justification": "Some 'benchmarks are broken' angle — finding that prior performance claims are inflated by dataset issues — but framed academically rather than confrontationally." 507 }, 508 "demo_ability": { 509 "score": 1, 510 "justification": "Replication package available on GitHub, but no interactive demo or easily runnable tool for practitioners." 
511 }, 512 "brand_recognition": { 513 "score": 1, 514 "justification": "Academic authors from Monash University and Beihang University; models studied (CodeT5, CodeBERT) are known in the research community but not household names." 515 } 516 } 517 }