scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31336B)
      1 {
      2   "paper": {
      3     "title": "Mercury: A Code Efficiency Benchmark for Code Large Language Models",
      4     "authors": [
      5       "Mingzhe Du",
      6       "Anh Tuan Luu",
      7       "Bin Ji",
      8       "Qian Liu",
      9       "See-Kiong Ng"
     10     ],
     11     "year": 2024,
     12     "venue": "Neural Information Processing Systems",
     13     "arxiv_id": "2402.07844",
     14     "doi": "10.52202/079017-0529"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "Mercury is the first code efficiency benchmark for Code LLMs, comprising 1,889 Python tasks from LeetCode with a novel Beyond metric that normalizes runtime via percentile ranking against historical solutions. Leading Code LLMs achieve ~65% Pass (functional correctness) but less than 50% Beyond (efficiency), revealing a substantial efficiency gap. DPO consistently outperforms SFT for improving code efficiency, especially in models above 15B parameters, while SFT often causes catastrophic forgetting in larger models.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "GitHub repository URL provided in the abstract footnote: https://github.com/Elfsong/Mercury. The paper states 'Our code and data are available on GitHub.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Dataset hosted on HuggingFace (Appendix A.12): https://huggingface.co/datasets/Elfsong/Mercury. Croissant metadata also provided."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions specific libraries (Accelerate, DeepSpeed, BitsandBytes) and hardware (two A100-80G GPUs) in Section 4.3, but does not provide a requirements.txt, Dockerfile, or detailed version listing of dependencies sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While code and data are released, the paper itself contains no step-by-step reproduction instructions, 'Reproducing Results' section, or specific commands to run. A reader must rely on the GitHub repository's documentation, which is not described in the paper."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Main results in Tables 2 and 3 report point estimates only (e.g., '65.0' Pass, '48.53' Beyond) with no confidence intervals or error bars. Appendix Figure 11 shows bootstrapped distributions for only 3 of 10 models, not integrated into the main results."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used anywhere in the paper. Claims like 'DPO yields a stable enhancement' and comparisons between training methods rely solely on comparing raw numbers without any tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Tables 2 and 3 report absolute scores and deltas showing the magnitude of change (e.g., 'deepseek-coder-33b-base: 65.0 → 73.4 (+8.4) with DPO'). The Gap metric in Table 3 explicitly measures the difference between Beyond and Pass, providing context for effect magnitude."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Mercury-eval uses 256 tasks with no justification for this sample size. No power analysis or reasoning for why 256 tasks (88 Easy, 81 Medium, 87 Hard per Table 6) is adequate for the claims made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Main results (Tables 2, 3) are single-run point estimates. Appendix Figure 11 shows bootstrapped variance for only 3 of 10 models (StarCoder2 3B/7B/15B). The remaining 7 models have no variance reporting."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Each of the 10 models is evaluated in three conditions: original, +SFT, and +DPO (Tables 2, 3). HumanEval and MBPP are included as auxiliary benchmarks for functional correctness comparison (Section 4.2)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Models evaluated include DeepSeek-Coder (2023), StarCoder2 (2024), CodeQwen1.5 (2023/2024), and CodeLlama (2023), which were recent at the time of submission. Table 7 lists all models with HuggingFace links."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No ablation study of the Mercury benchmark or Beyond metric design is performed. The comparison of SFT vs DPO is a baseline comparison, not an ablation of their system's components (e.g., removing the runtime normalization, varying K, or testing alternative percentile calculations)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three metrics are reported: Pass (functional correctness), Beyond (efficiency-weighted pass), and Gap (difference between Pass and Beyond). Results are also reported on HumanEval and MBPP."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is fully automated via test case execution and runtime measurement. No human evaluation of code quality, readability, or efficiency is performed."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Mercury-eval (256 tasks) is explicitly separated from Mercury-train (1,633 tasks) as described in Section 2 and Table 6. Training uses Mercury-train, evaluation uses Mercury-eval."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by difficulty level (Easy/Medium/Hard) in both Tables 2 and 3. Table 4 provides per-difficulty failure analysis across three error categories."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 4.5 'Failure Analysis' with Table 4 provides a detailed breakdown of Generation Errors, Execution Errors, and Test Case Errors across all models and difficulty levels."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that SFT 'diminishes functional correctness on the largest two Code LLMs' (Section 4.4) and 'detracts most Beyond scores from original models' (Table 3). DPO hurts smaller models' Gap scores. These negative results are prominently discussed."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are supported: 'leading Code LLMs can achieve 65% on Pass' matches Table 2 (deepseek-coder-33b-base: 65.0%). 'Less than 50% on Beyond' matches Table 3 (same model: 48.53%). 'DPO serves as a robust baseline' is supported by Tables 2-3 showing DPO improvements."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims about DPO and SFT effects are made via controlled fine-tuning experiments: same base models, same data, different training methods. This controlled single-variable manipulation is adequate for the causal claims being made (e.g., 'DPO enhances code efficiency', 'SFT may induce catastrophic forgetting')."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title frames this as 'A Code Efficiency Benchmark for Code Large Language Models' broadly, but the benchmark is Python-only, LeetCode-only, and tests only 10 open-source models (no commercial models like GPT-4 or Claude). The limitations section does not bound generalization to Python or algorithmic tasks."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not discuss alternative explanations for its findings. For example, DPO uses 5 solution pairs vs SFT's single solution per task — the data volume difference is not discussed. The superior DPO results on large models could reflect compute allocation (500 vs 200 steps) rather than method superiority."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly defines code efficiency as runtime (Section 1: 'execution time is the performance bottleneck'), acknowledges that this excludes space complexity, and discusses why absolute runtime is an inadequate proxy (Section 3), motivating the percentile-based Beyond metric. The measurement matches the claim granularity."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table 7 (Appendix A.6) lists all 10 evaluated models with exact HuggingFace model IDs and links (e.g., 'deepseek-ai/deepseek-coder-1.3b-base', 'bigcode/starcoder2-3b'). However, GPT-4 used for test case generation (Section 2) lacks a version/snapshot."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix Figure 9 provides the full one-shot prompt template used for code generation, including all placeholders and what fills them (pretty_content, prompt, code_completion). The template is detailed enough to reconstruct every prompt sent to each model."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.3 reports: LoRA alpha=16, dropout=0.05, r=8; AdamW optimizer; learning rates 1e-4 (SFT) and 5e-5 (DPO); 200 training steps (SFT) and 500 (DPO); β=0.1 for DPO; temperature=0.2; K=5 for Beyond. Hardware: two A100-80G GPUs."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. Models generate code directly from prompts in a single pass."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 2 documents the full filtering pipeline: LeetCode tasks → filter by number of solutions (≥2) → restrict to Python built-in types + TreeNode/ListNode → filter by unique outputs → difficulty stratification → random selection of 256 eval tasks. Criteria at each stage are stated."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Limitations' is a dedicated section discussing two specific limitations: the uniform runtime distribution assumption and data contamination during model training."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 discusses a specific threat: 'we measure code efficiency under the assumption that the code runtime is uniformly distributed. The simplification streamlines code efficiency evaluation via limited solution samples. However, the distribution of code runtime in real-world scenarios is more intricate.' This is specific to their methodology."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The limitations section does not explicitly state what the results do NOT show. It does not bound the scope to Python, to algorithmic problems, or to open-source models. No explicit statements about what populations or settings are excluded from the claims."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full dataset with tasks, solutions, test case generators, and prompts is available on HuggingFace (Appendix A.12). Croissant metadata is also provided for structured access."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 2 describes data collection from LeetCode in detail: public programming tasks, filtering criteria (solution count, data structures, unique outputs), Locality-Sensitive Hashing for deduplication, GPT-4 for test case generators with LeetCode OJ validation."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data comes from LeetCode, a standard public programming platform. Historical solutions are scraped from public submissions."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The full pipeline from LeetCode collection through filtering stages to final split is documented in Section 2. Table 6 shows the final data distribution (1,633 train / 256 eval). Each filter criterion is specified with the resulting effect on the dataset."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding sources, grants, or acknowledgments section is present in the paper. One author is from Sea AI Lab (a commercial entity), but no funding disclosure is provided."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Nanyang Technological University, National University of Singapore, and Sea AI Lab. The paper does not evaluate Sea products, so no self-evaluation conflict exists."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding source is disclosed, making it impossible to assess funder independence. Sea AI Lab, the employer of one author, has commercial interest in code generation but is not evaluating its own products here."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "None of the 10 evaluated models have their training data cutoff dates stated in the paper. The models were trained on large code corpora that likely include LeetCode data, but no cutoff information is provided."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "Section 6 mentions data contamination as a limitation ('the presence of data contamination during the model training phase compromises the precision') but performs no actual analysis of whether LeetCode problems appeared in the models' training data."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "LeetCode problems have been publicly available for years and are widely used in code training corpora. The paper acknowledges contamination risk in Section 6 and proposes future dynamic updates, but does not quantify or address current contamination. No canary strings, membership inference, or decontamination analysis is performed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. All evaluation is automated benchmark testing of Code LLMs."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference cost, latency per model, tokens consumed, or cost per task is reported. The paper evaluates 10 models × 3 conditions × 256 tasks without any cost information."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is stated (two A100-80G GPUs, Section 4.3) and training steps are given (200 SFT, 500 DPO), but total GPU hours, wall-clock training time, and total evaluation time are not reported."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Appendix Figure 11 shows bootstrapped Beyond distributions across 50 runs for only 3 of 10 models (StarCoder2 3B/7B/15B). The remaining 7 models have no seed sensitivity analysis. Main results tables report single-configuration numbers."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "The paper explicitly states: 'We sample one solution for each task to calculate pass score' (Table 2 caption) and 'we sample 5 solutions for each task to calculate Beyond score' (Table 3 caption, Section 4.3: K=5)."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Specific hyperparameters are chosen (LoRA alpha=16, r=8, learning rates, etc.) but no search budget, search method, or number of configurations tried is reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Hyperparameters appear to be fixed choices without explanation of how they were selected. No validation-based selection, no comparison of configurations, no justification for the specific values chosen."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own benchmark and metric design without acknowledging potential bias. No independent evaluation or discussion of author-evaluation bias is present."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "DPO uses 500 training steps vs SFT's 200 steps, a 2.5x difference in training compute. This is not discussed as a potential confound when comparing DPO vs SFT performance. No performance-vs-compute curves are provided."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether LeetCode algorithmic problems are a valid proxy for real-world code efficiency. LeetCode tasks are predominantly algorithmic (sorting, dynamic programming, etc.) and may not reflect efficiency concerns in production code (I/O, database queries, system design)."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used. Models generate code directly from prompts without agentic frameworks."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "LeetCode problems and solutions have been publicly available for years before the evaluated models were trained. Section 6 acknowledges data contamination as a limitation, but the paper does not analyze whether the models saw these exact problems during training, despite this being a significant concern."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup leaks information. For example, the prompt format or function signatures might cue models that have memorized LeetCode solutions."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No analysis of whether Mercury-train and Mercury-eval tasks are sufficiently independent. Both come from LeetCode and may share algorithmic patterns that models can exploit."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection method is used. Section 6 mentions plans to 'update our benchmark via our open-sourced data collection framework to import new tasks dynamically' as future mitigation, but no current detection (canary strings, membership inference, n-gram overlap) is applied."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Leading Code LLMs can achieve ~65% on Pass (functional correctness) but less than 50% on Beyond (efficiency)",
    371       "evidence": "Table 2 shows deepseek-coder-33b-base achieves 65.0% Pass. Table 3 shows the same model achieves 48.53% Beyond. Section 4.4 discusses these results.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "DPO is a robust baseline for enhancing code efficiency compared with SFT",
    376       "evidence": "Table 3 shows DPO improves Beyond scores for most models above 6.7B (e.g., deepseek-coder-33b: 48.53→66.47, +17.94), while SFT often decreases Beyond scores (e.g., deepseek-coder-33b: 48.53→38.32, -10.21). Section 4.4.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "SFT may induce catastrophic forgetting in large models when pursuing code efficiency",
    381       "evidence": "Table 2 shows SFT decreases Pass on deepseek-coder-33b-base (65.0→58.7, -6.3) and CodeLlama-34b-hf (52.4→47.5, -4.9). Section 4.4 states 'SFT may induce catastrophic forgetting in the pursuit of heightened code efficiency.'",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "The Beyond metric is environment-agnostic, remaining consistent across hardware configurations",
    386       "evidence": "Appendix Figure 10 shows Beyond scores for two models across three CPU tiers (micro/small/standard) remaining consistent. However, only 2 models and 3 configurations are tested.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "Larger models have greater capacity to improve efficiency while retaining correctness (narrower Gap)",
    391       "evidence": "Table 3 shows DPO substantially narrows Gap in models larger than 15B (e.g., deepseek-coder-33b Gap: 18.50→5.79, CodeLlama-34b Gap: 15.49→8.01), while Gap widens in smaller models. Section 4.4.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Mercury's difficulty stratification effectively probes the upper limits of Code LLM capabilities",
    396       "evidence": "Table 2 shows consistent Pass score decline from Easy to Hard across all models (e.g., deepseek-coder-6.7b: 69.3 Easy → 56.1 Hard). Section 4.4.",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Severe LeetCode contamination risk",
    403       "detail": "All 10 evaluated models were trained on internet-scale code corpora that almost certainly include LeetCode problems and solutions. The paper acknowledges contamination in Section 6 but performs no analysis to quantify it. Performance numbers may reflect memorization rather than code generation capability, directly undermining the benchmark's validity."
    404     },
    405     {
    406       "flag": "No error bars on main results",
    407       "detail": "Tables 2 and 3 report point estimates for all 30 model configurations (10 models × 3 conditions) without any uncertainty quantification. Appendix Figure 11 shows variance for only 3 of 10 models. The reported differences (e.g., +1.0 vs -2.2) may not be meaningful given unmeasured variance."
    408     },
    409     {
    410       "flag": "Unequal training compute between SFT and DPO",
    411       "detail": "DPO uses 500 training steps while SFT uses only 200 steps (Section 4.3), a 2.5x difference. DPO also uses 5 solution pairs per task vs SFT's single solution, providing more training signal. These compute and data differences are not acknowledged when concluding that DPO is superior to SFT."
    412     },
    413     {
    414       "flag": "Uniform runtime distribution assumption acknowledged but not validated",
    415       "detail": "The Beyond metric assumes code runtimes are uniformly distributed between min and max historical solutions (Section 6). Real runtime distributions are often multimodal (reflecting O(n), O(n log n), O(n²) clusters). This could systematically bias the efficiency metric, but no validation of the assumption is performed."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Evaluating large language models trained on code",
    421       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    422       "year": 2021,
    423       "arxiv_id": "2107.03374",
    424       "relevance": "Introduces HumanEval and Codex, foundational code generation benchmark and model used as a comparison point in Mercury."
    425     },
    426     {
    427       "title": "Program synthesis with large language models",
    428       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    429       "year": 2021,
    430       "arxiv_id": "2108.07732",
    431       "relevance": "Introduces MBPP benchmark for code generation evaluation, used as a secondary benchmark in Mercury experiments."
    432     },
    433     {
    434       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    435       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    436       "year": 2024,
    437       "relevance": "Introduces EvalPlus with augmented test cases for code generation, directly motivating Mercury's test case coverage approach."
    438     },
    439     {
    440       "title": "Measuring coding challenge competence with APPS",
    441       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    442       "year": 2021,
    443       "arxiv_id": "2105.09938",
    444       "relevance": "Large-scale coding challenge benchmark from online platforms, a key comparison point for Mercury's benchmark design."
    445     },
    446     {
    447       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    448       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    449       "year": 2024,
    450       "arxiv_id": "2403.07974",
    451       "relevance": "Addresses benchmark contamination in code evaluation, directly relevant to Mercury's acknowledged contamination limitations."
    452     },
    453     {
    454       "title": "Direct preference optimization: Your language model is secretly a reward model",
    455       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    456       "year": 2023,
    457       "arxiv_id": "2305.18290",
    458       "relevance": "DPO method used as Mercury's primary training baseline for improving code efficiency in LLMs."
    459     },
    460     {
    461       "title": "Code llama: Open foundation models for code",
    462       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    463       "year": 2023,
    464       "arxiv_id": "2308.12950",
    465       "relevance": "Open code LLM family (7B/13B/34B) evaluated as key baselines on Mercury benchmark."
    466     },
    467     {
    468       "title": "Starcoder 2 and the stack v2: The next generation",
    469       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    470       "year": 2024,
    471       "arxiv_id": "2402.19173",
    472       "relevance": "StarCoder2 model family (3B/7B/15B) evaluated on Mercury, representing the latest open code LLMs at time of publication."
    473     },
    474     {
    475       "title": "Starcoder: may the source be with you!",
    476       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    477       "year": 2023,
    478       "arxiv_id": "2305.06161",
    479       "relevance": "First-generation StarCoder model, predecessor to StarCoder2 models evaluated in Mercury."
    480     },
    481     {
    482       "title": "Competition-level code generation with AlphaCode",
    483       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    484       "year": 2022,
    485       "relevance": "Competition-level code generation system relevant to evaluating code LLM capabilities on challenging programming tasks."
    486     },
    487     {
    488       "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques",
    489       "authors": ["Mohammed Latif Siddiq", "Joanna CS Santos"],
    490       "year": 2022,
    491       "relevance": "Security-focused code generation benchmark, representing the broadening of code evaluation beyond functional correctness."
    492     },
    493     {
    494       "title": "Learning to improve code efficiency",
    495       "authors": ["Binghong Chen", "Daniel Tarlow", "Kevin Swersky"],
    496       "year": 2022,
    497       "arxiv_id": "2208.05297",
    498       "relevance": "Prior work on improving code efficiency with ML, directly relevant to Mercury's efficiency optimization goal."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "Benchmark and dataset are released and usable by practitioners evaluating code LLM efficiency, though efficiency optimization via DPO requires ML training expertise."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that LLMs generate functionally correct but inefficient code is somewhat expected; the 65% Pass vs <50% Beyond gap quantifies a known intuition rather than overturning it."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No safety or security concerns raised. The paper is about code efficiency, not vulnerabilities or risks."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict. The paper introduces a straightforward new benchmark without challenging existing claims or institutions."
    517     },
    518     "demo_ability": {
    519       "score": 2,
    520       "justification": "Code on GitHub and dataset on HuggingFace allow researchers to run the benchmark, though setting up the sandbox environment requires effort."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "The authors' institutions (NTU, NUS, and Sea AI Lab) are respectable but not top-tier brand names in the LLM space."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs