scan.json (31227B)
1 { 2 "paper": { 3 "title": "Mercury: A Code Efficiency Benchmark for Code Large Language Models", 4 "authors": [ 5 "Mingzhe Du", 6 "Anh Tuan Luu", 7 "Bin Ji", 8 "Qian Liu", 9 "See-Kiong Ng" 10 ], 11 "year": 2024, 12 "venue": "Preprint (arXiv)", 13 "arxiv_id": "2402.07844", 14 "doi": "10.48550/arXiv.2402.07844" 15 }, 16 "scan_version": 3, 17 "active_modules": [ 18 "experimental_rigor", 19 "data_leakage" 20 ], 21 "methodology_tags": [ 22 "benchmark-eval" 23 ], 24 "key_findings": "Mercury is the first code efficiency benchmark for Code LLMs, comprising 1,889 Python tasks from LeetCode with a runtime-percentile-based metric called Beyond. Leading Code LLMs achieve up to 65% on functional correctness (Pass) but below 50% on Beyond, revealing a significant efficiency gap. Direct Preference Optimization (DPO) consistently improves both correctness and efficiency, especially for models over 15B parameters, while Supervised Fine-Tuning (SFT) often induces catastrophic forgetting in larger models.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper provides a GitHub link in the abstract footnote: 'Our code and data are available on GitHub: https://github.com/Elfsong/Mercury.' Code is released." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The Mercury dataset is hosted on HuggingFace (Section A.12): 'https://huggingface.co/datasets/Elfsong/Mercury'. The dataset includes tasks, solutions, test case generators, and prompts." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "Section 4.3 mentions 'two A100-80G GPUs' and names libraries (Accelerate, DeepSpeed, BitsandBytes) but does not provide version numbers, requirements.txt, or Dockerfile. Library names without versions are insufficient for environment recreation." 
41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper provides a GitHub repository with code, a HuggingFace dataset, and Section 4.3 describes experimental configuration in detail (LoRA parameters, optimizer, learning rates, training steps, sampling settings). Combined with the public code and data, this is sufficient for a competent researcher to reproduce." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Tables 2 and 3 (the main results) report only point estimates without confidence intervals, error bars, or ± notation. Appendix Figure 11 shows bootstrapped distributions for 3 of 10 models, but the main results tables lack any uncertainty quantification." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "No statistical significance tests are used. Comparative claims like 'DPO invariably enhances the overall Pass scores' (Section 4.4) and 'SFT detracts most Beyond scores' are based solely on comparing point estimates without p-values or any statistical test." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Tables 2 and 3 report absolute differences from baseline in parentheses (e.g., '+8.4', '-6.3') and Gap metric showing the difference between Beyond and Pass. Results are contextualized with both baseline and improved values." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "The evaluation set contains 256 tasks and the training set 1,633 tasks. No justification is given for why 256 evaluation tasks suffice, and no power analysis is provided." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "Main results tables (2, 3, 4) show single point estimates without standard deviation or variance. 
Appendix Figure 11 shows bootstrapped distributions but only for 3 of 10 models and only for the Beyond metric, not for training experiments (SFT/DPO)." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper compares original (pre-trained) models against SFT and DPO baselines. It also includes HumanEval and MBPP as supplementary functional correctness benchmarks (Section 4.4, Tables 2 and 3)." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "The 10 evaluated models include contemporary Code LLMs from 2023-2024: DeepSeek-Coder, StarCoder2, CodeQwen1.5, and CodeLlama (Table 7). These represent the state of the art for open-source Code LLMs at the time of writing." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": false, 89 "justification": "The paper compares original, SFT, and DPO training strategies but does not ablate individual components of the Mercury benchmark design (e.g., the runtime distribution modeling, test case generator quality, or metric formulation)." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper uses Pass (functional correctness), Beyond (efficiency-weighted correctness), and Gap (Pass minus Beyond, i.e., the shortfall of efficiency relative to correctness) as metrics. Results are also reported on HumanEval and MBPP." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "Evaluation is entirely automated via test case execution and runtime measurement. No human evaluation of code quality, readability, or maintainability is performed." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "Mercury-eval (256 tasks) is a separate held-out set from Mercury-train (1,633 tasks) as described in Section 2 and Table 6. SFT and DPO training use Mercury-train; evaluation uses Mercury-eval." 
105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Results are broken down by difficulty level (Easy, Medium, Hard) in Tables 2, 3, and 4, showing significant performance variation across categories." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 4.5 provides a detailed failure analysis with Table 4 breaking down errors into three categories: Generation Errors, Execution Errors, and Test Case Errors, with counts per model and difficulty level." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper reports that SFT diminishes functional correctness on larger models: deepseek-coder-33b-base (-6.3) and CodeLlama-34b-hf (-4.9) in Table 2. SFT also detracts most Beyond scores (Table 3). These negative findings are explicitly discussed." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims Code LLMs 'achieve 65% on Pass, while less than 50% on Beyond' — supported by Tables 2 and 3 (e.g., deepseek-coder-33b achieves 65% Pass and 48.53% Beyond). The claim that DPO is a robust baseline compared to SFT is supported across Tables 2 and 3." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper claims DPO 'enhances' code efficiency and SFT 'may induce catastrophic forgetting.' These are supported by controlled single-variable manipulation: the same base model is trained with either SFT or DPO, with all other variables held constant (Section 4.3)." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title claims to be 'A Code Efficiency Benchmark for Code Large Language Models' but the benchmark is Python-only, sourced exclusively from LeetCode algorithmic problems. 
The limitations section (Section 6) acknowledges runtime distribution simplification and contamination but does not bound claims to Python or algorithmic tasks." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper attributes DPO's superiority over SFT to catastrophic forgetting in SFT but does not explore other explanations (e.g., training data quality differences, convergence properties, or task-specific effects). No robustness checks against confounds are performed." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper defines code efficiency explicitly as execution time (Section 1), explains why absolute runtime is insufficient (hardware variability), and justifies the Beyond metric as a runtime-percentile approach. The measurement (runtime percentile) matches the claim (code efficiency) with minimal proxy gap." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": true, 153 "justification": "Table 7 (Appendix A.6) lists all 10 models with exact HuggingFace model identifiers and links (e.g., 'deepseek-ai/deepseek-coder-1.3b-base'). These are precise, versioned model references. Note: GPT-4 used for test case generation is not versioned beyond 'GPT-4'." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Figure 9 (Appendix A.9) provides the full one-shot prompt template. The placeholders (<task_content>, <code_starter>, <code_completion>) map to specific dataset fields (pretty_content, prompt, solution), and the dataset is publicly available on HuggingFace, allowing full prompt reconstruction." 
159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 4.3 reports LoRA parameters (alpha=16, dropout=0.05, r=8), optimizer (AdamW), learning rates (1e-4 SFT, 5e-5 DPO), training steps (200 SFT, 500 DPO), β=0.1 for DPO, temperature=0.2 for generation, and K=5 for Beyond." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used. Models are prompted directly for code completion without tools, retry logic, or multi-step workflows." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 2 documents the data filtering pipeline: tasks collected from LeetCode, filtered by number of solutions (≥2), restricted data structures (Python built-ins + TreeNode/ListNode), unique outputs verified across solutions, and Locality-Sensitive Hashing to deduplicate solutions." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 6 is titled 'Limitations' and discusses the uniform runtime distribution assumption and data contamination concerns." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 6 identifies specific threats: (1) the uniform runtime distribution assumption may not hold in practice, requiring more solution samples; (2) data contamination from LeetCode tasks appearing in training data compromises benchmark precision. These are specific to this study." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to Python, LeetCode-style algorithmic tasks, or open-source models. The limitations section discusses what could be improved but not what the current results cannot support." 
191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "The full dataset including tasks, solutions, test case generators, and prompts is released on HuggingFace (Section A.12). The code is on GitHub. Raw data is available for independent verification." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 2 describes data collection: tasks gathered from LeetCode's public problem set, solutions sampled from historical LeetCode submissions, test case generators created via GPT-4 and validated on LeetCode OJ. Filtering criteria and pipeline are documented." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. Data source is LeetCode, a well-known public programming platform. Solutions are from public historical submissions." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline is documented: collect LeetCode tasks → filter by solution count (≥2) → filter by data structure constraints → verify unique outputs → deduplicate solutions via LSH → split into train (1,633) and eval (256). Table 6 shows final distribution." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding source or acknowledgments section is present in the paper. Authors are from NTU, NUS, and Sea AI Lab, but no funding is disclosed." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed: Nanyang Technological University, National University of Singapore, and Sea AI Lab. None of the evaluated models are products of these organizations." 
225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding source is disclosed, so funder independence cannot be assessed. One author is from Sea AI Lab (a commercial entity), but since funding is undisclosed, independence cannot be verified." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present in the paper." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper does not state training data cutoff dates for any of the 10 evaluated models. This is critical because LeetCode tasks are publicly available and may appear in training corpora." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "Section 6 acknowledges 'the presence of data contamination during the model training phase compromises the precision of the Mercury benchmark' but provides no analysis of whether specific tasks appeared in training data. No overlap detection is performed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "LeetCode tasks have been publicly available for years before all evaluated models' training cutoffs. The paper acknowledges contamination risk in Section 6 and mentions plans to add new tasks dynamically but does not address contamination for the current evaluation." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study. It is purely a benchmark evaluation of Code LLMs." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants. The study evaluates LLMs on code generation tasks." 
264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No inference cost, API spend, tokens consumed, or wall-clock time per example is reported. The paper evaluates 10 models with SFT and DPO variants but provides no cost information." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "Section 4.3 mentions 'two A100-80G GPUs' and training steps (200 for SFT, 500 for DPO) but does not report total GPU hours, training time, or computational cost." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "Appendix Figure 11 shows bootstrapped Beyond distributions for 3 of 10 models (StarCoder2 3B/7B/15B) over 50 runs. However, the main results tables (2, 3) report single-run point estimates, and training experiments (SFT/DPO) do not report seed sensitivity." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "Table 2 caption states 'We sample one solution for each task to calculate pass score.' Table 3 caption states 'we sample 5 solutions for each task to calculate Beyond score.' 
K=5 is stated in Section 4.3." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "Fixed hyperparameters are used (Section 4.3) with no mention of whether any search was conducted or why these specific values were chosen. The paper states 'extensive parameter optimization and prompt engineering were not pursued' but does not document the search budget for the chosen values." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "Hyperparameters appear to be chosen without explanation. No justification is given for LoRA r=8, α=16, dropout=0.05, or the specific learning rates and training steps." 323 }, 324 "multiple_comparison_correction": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper compares 10 models × 3 conditions (original, SFT, DPO) × 3 difficulty levels across multiple metrics without any correction for multiple comparisons. No statistical tests are applied at all." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors designed the Mercury benchmark and evaluate models on it without acknowledging potential author-evaluation bias. No independent evaluation or discussion of this bias is provided." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "Models range from 1.3B to 34B parameters with vastly different compute requirements. While model size trends are discussed informally (Section 4.4: 'larger models tend to provide better functional correctness'), performance is not explicitly reported as a function of matched compute budgets." 
338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": true, 342 "justification": "The paper explicitly discusses what Mercury measures (runtime efficiency via percentile ranking) and why absolute runtime is inadequate (hardware variability, Section 3). It explains the Beyond metric's design, demonstrates hardware-agnostic consistency (Appendix Figure 10), and discusses the uniform distribution assumption (Section 6)." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "No scaffolding is used. Models are prompted directly for code completion with a uniform prompt template (Figure 9). The same prompting format is applied across all models." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "LeetCode tasks and solutions have been publicly available for years before the evaluated models' training data collection. The paper does not discuss whether models may have seen these exact tasks and solutions during training." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the prompt format (which includes LeetCode task descriptions) could leak information that models have memorized from training on LeetCode content." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of whether Mercury-eval and Mercury-train tasks share structural similarities with the models' training data or with each other beyond the difficulty-balanced random split." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No concrete leakage detection method is applied. 
Section 6 mentions contamination as a concern and proposes future dynamic task updates, but no canary strings, membership inference, or decontamination analysis is performed on the current evaluation." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Leading Code LLMs achieve up to 65% on Pass (functional correctness) but less than 50% on Beyond (efficiency), indicating a significant gap between correctness and efficiency.", 376 "evidence": "Table 2 shows deepseek-coder-33b-base achieves 65.0% Pass; Table 3 shows its Beyond is 48.53%. The Gap column in Table 3 quantifies this difference across all models (Section 4.4).", 377 "supported": "strong" 378 }, 379 { 380 "claim": "DPO serves as a robust baseline for enhancing both code efficiency and functional correctness compared with SFT.", 381 "evidence": "Tables 2 and 3 show DPO improves Pass for 8/10 models and improves or maintains Beyond for 6/10 models. SFT often hurts Pass on larger models (deepseek-coder-33b: -6.3, CodeLlama-34b: -4.9) and detracts most Beyond scores (Section 4.4).", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Mercury's difficulty stratification effectively differentiates Code LLM capabilities, with scores declining consistently from Easy to Hard.", 386 "evidence": "Tables 2 and 3 show consistent performance decline across difficulty levels for all models. For example, deepseek-coder-33b-base: 70.9% Easy, 67.9% Medium, 62.3% Hard on Pass (Section 4.4).", 387 "supported": "strong" 388 }, 389 { 390 "claim": "DPO substantially narrows the Gap between Beyond and Pass for models larger than 15B parameters.", 391 "evidence": "Table 3 Gap column: starcoder2-15b Gap drops from 12.55 to 10.81 with DPO; deepseek-coder-33b from 18.50 to 5.79; CodeLlama-34b from 15.49 to 8.01. 
However, Gap widens for some smaller models.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "The Beyond metric remains consistent across different hardware configurations, making it environment-agnostic.", 396 "evidence": "Appendix Figure 10 shows Beyond scores for two models across three CPU tiers (micro, small, standard) with consistent results. However, this is demonstrated for only 2 models.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "SFT may induce catastrophic forgetting in the pursuit of code efficiency for larger models.", 401 "evidence": "Table 2 shows SFT decreases Pass for deepseek-coder-33b (-6.3) and CodeLlama-34b (-4.9). The paper attributes this to catastrophic forgetting (Section 4.4) but provides no direct evidence of forgetting beyond performance drops.", 402 "supported": "weak" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "Severe LeetCode contamination risk", 408 "detail": "All 1,889 tasks and their solutions come from LeetCode, which is widely scraped and included in code training datasets. Models like DeepSeek-Coder and StarCoder are trained on large code corpora that likely include LeetCode content. The paper acknowledges this in Section 6 but performs no contamination analysis, making all Pass scores potentially inflated." 409 }, 410 { 411 "flag": "No error bars on main results", 412 "detail": "Tables 2 and 3 report point estimates without confidence intervals, standard deviations, or error bars. Appendix Figure 11 shows bootstrapped distributions for only 3 of 10 models. Without uncertainty quantification, reported differences between models and methods may not be statistically meaningful." 413 }, 414 { 415 "flag": "No statistical significance tests for comparative claims", 416 "detail": "All claims about DPO outperforming SFT and model comparisons are based on comparing raw numbers without any statistical test. 
Given the variance shown in Figure 11 for the few models tested, some reported differences may be within noise." 417 }, 418 { 419 "flag": "Python-only scope presented as general benchmark", 420 "detail": "Mercury uses only Python tasks from LeetCode algorithmic problems, but the title and framing position it as a general 'Code Efficiency Benchmark for Code Large Language Models.' Efficiency characteristics differ across languages and task types (e.g., systems programming vs. algorithms)." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Evaluating large language models trained on code", 426 "authors": ["Mark Chen", "Jerry Tworek"], 427 "year": 2021, 428 "arxiv_id": "2107.03374", 429 "relevance": "Introduced HumanEval and Codex, foundational code generation benchmark and model against which Mercury is positioned." 430 }, 431 { 432 "title": "Program synthesis with large language models", 433 "authors": ["Jacob Austin", "Augustus Odena"], 434 "year": 2021, 435 "arxiv_id": "2108.07732", 436 "relevance": "Introduced MBPP benchmark for program synthesis, one of the supplementary benchmarks used in Mercury evaluation." 437 }, 438 { 439 "title": "Measuring coding challenge competence with APPS", 440 "authors": ["Dan Hendrycks", "Steven Basart"], 441 "year": 2021, 442 "arxiv_id": "2105.09938", 443 "relevance": "Large-scale code generation benchmark from online platforms, directly compared to Mercury in Table 1." 444 }, 445 { 446 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 447 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 448 "year": 2024, 449 "relevance": "EvalPlus benchmark that augments HumanEval/MBPP test coverage; motivates Mercury's approach to test case generation." 
450 }, 451 { 452 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 453 "authors": ["Naman Jain", "King Han"], 454 "year": 2024, 455 "arxiv_id": "2403.07974", 456 "relevance": "Contamination-free code benchmark addressing the same contamination concerns that affect Mercury's LeetCode-based evaluation." 457 }, 458 { 459 "title": "StarCoder: may the source be with you!", 460 "authors": ["Raymond Li", "Loubna Ben Allal"], 461 "year": 2023, 462 "arxiv_id": "2305.06161", 463 "relevance": "Open-source Code LLM evaluated in Mercury experiments." 464 }, 465 { 466 "title": "Code Llama: Open foundation models for code", 467 "authors": ["Baptiste Roziere", "Jonas Gehring"], 468 "year": 2023, 469 "arxiv_id": "2308.12950", 470 "relevance": "Open-source Code LLM family (7B, 13B, 34B) evaluated across all Mercury experiments." 471 }, 472 { 473 "title": "StarCoder 2 and The Stack v2: The next generation", 474 "authors": ["Anton Lozhkov", "Raymond Li"], 475 "year": 2024, 476 "arxiv_id": "2402.19173", 477 "relevance": "Next-generation open-source Code LLM evaluated in Mercury experiments." 478 }, 479 { 480 "title": "Direct preference optimization: Your language model is secretly a reward model", 481 "authors": ["Rafael Rafailov", "Archit Sharma"], 482 "year": 2023, 483 "arxiv_id": "2305.18290", 484 "relevance": "DPO method used as the primary training baseline for improving code efficiency in Mercury experiments." 485 }, 486 { 487 "title": "Competition-level code generation with AlphaCode", 488 "authors": ["Yujia Li", "David Choi"], 489 "year": 2022, 490 "relevance": "Competition-level code generation system relevant to understanding Code LLM capabilities on algorithmic tasks." 
491 }, 492 { 493 "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques", 494 "authors": ["Mohammed Latif Siddiq", "Joanna CS Santos"], 495 "year": 2022, 496 "relevance": "Security-focused code generation benchmark, representing evaluation of code quality dimensions beyond correctness." 497 }, 498 { 499 "title": "DS-1000: A natural and reliable benchmark for data science code generation", 500 "authors": ["Yuhang Lai", "Chengxi Li"], 501 "year": 2023, 502 "relevance": "Data science code generation benchmark evaluating LLMs on domain-specific tasks beyond algorithmic problems." 503 } 504 ], 505 "engagement_factors": { 506 "practical_relevance": { 507 "score": 2, 508 "justification": "Mercury provides a usable benchmark and metric (Beyond) for researchers evaluating Code LLM efficiency, and the DPO finding is actionable for model trainers." 509 }, 510 "surprise_contrarian": { 511 "score": 1, 512 "justification": "Confirms the expected intuition that Code LLMs prioritize correctness over efficiency; the DPO vs SFT finding is modestly surprising." 513 }, 514 "fear_safety": { 515 "score": 0, 516 "justification": "No safety or security concerns raised; the paper focuses on code efficiency measurement." 517 }, 518 "drama_conflict": { 519 "score": 0, 520 "justification": "No controversy or conflict with existing claims or institutions." 521 }, 522 "demo_ability": { 523 "score": 2, 524 "justification": "Code and dataset publicly available on GitHub and HuggingFace; researchers can run the benchmark themselves." 525 }, 526 "brand_recognition": { 527 "score": 1, 528 "justification": "Authors are from NTU, NUS, and Sea AI Lab — recognized institutions in Asia, but without the global brand recognition of top-tier AI labs." 529 } 530 } 531 }