scan.json (27784B)
1 { 2 "paper": { 3 "title": "CODEELO: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings", 4 "authors": [ 5 "Shanghaoran Quan", 6 "Jiaxi Yang", 7 "Bowen Yu", 8 "Bo Zheng", 9 "Dayiheng Liu", 10 "An Yang", 11 "Xuancheng Ren", 12 "Bofei Gao", 13 "Yibo Miao", 14 "Yunlong Feng", 15 "Zekun Wang", 16 "Jian Yang", 17 "Zeyu Cui", 18 "Yang Fan", 19 "Yichang Zhang", 20 "Binyuan Hui", 21 "Junyang Lin" 22 ], 23 "year": 2025, 24 "venue": "arXiv preprint", 25 "arxiv_id": "2501.01257" 26 }, 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The paper states in Section 8 (Ethical Statement): 'we will conduct a comprehensive risk assessment and seek permission from the CodeForces platform before open-sourcing the entire submission and evaluation scaffold, and we have not included it in this version of the paper.' The submission/evaluation code is not released. The dataset is released at https://hf.co/datasets/Qwen/CodeElo and the benchmark website exists at https://CodeElo-bench.github.io, but the evaluation scaffold code is withheld." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": true, 37 "justification": "The dataset is publicly available at https://hf.co/datasets/Qwen/CodeElo, as stated in the paper header. The benchmark problems with metadata (contest divisions, difficulty ratings, algorithm tags) are released." 38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "Appendix B mentions decoding hyperparameters and that vLLM was used for open-source model inference, but no requirements.txt, Docker setup, library versions, or detailed environment specification is provided." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper does not provide step-by-step reproduction instructions. 
The evaluation scaffold code is withheld (Section 8), and the paper recommends that 'others independently reproduce our proposed method to conduct evaluations' without providing scripts or a README." 48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": true, 54 "justification": "The parenthesized values after the overall Elo ratings in Table 3 (e.g., 'o1-mini 1578 (89.2)') are percentile ranks, not standard deviations or error bars. Dispersion is instead reported in Section 5.3, which discusses rating variance and presents violin plots (Figure 3) showing the distribution of Elo ratings across contests, reporting 'standard deviation between 300 and 500' per contest and 'around 50' for overall averages across 54 contests." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper makes numerous comparative claims (e.g., 'o1-mini and QwQ-32B-Preview stand out significantly') but no statistical significance tests are performed. Differences between models are assessed by comparing Elo ratings and pass rates directly." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports absolute Elo ratings and pass rates with baseline context. For example, 'o1-mini ... achieving Elo ratings of 1578' compared to human percentiles, and detailed pass rates across difficulty levels (Table 3). The magnitude of differences is clear from the numerical results." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "The benchmark uses 54 contests with 387 problems, but no justification is given for why this number is sufficient. Section 5.3 mentions that 'increasing the number of tested contests can be beneficial' and that 54 contests yield acceptable variance, but no formal power analysis or sample size justification is provided." 
70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 5.3 reports variance across contests via violin plots (Figure 3), noting 'most models exhibit a standard deviation between 300 and 500' per contest and 'around 50' for overall averages. However, individual results per model appear to be from single runs per contest (8 attempts per problem), not multiple independent runs." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "The paper evaluates 30 open-source and 3 proprietary LLMs, providing comparisons across model families and sizes. Table 1 compares CODEELO against six prior benchmarks (APPS, CodeContests, TACO, xCodeEval, USACO, LiveCodeBench) on feature dimensions." 82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "The evaluated models include contemporary models as of late 2024: o1-mini, Claude-3-5-Sonnet-2024-10-22, ChatGPT-4o-latest-2024-11-20, QwQ-32B-Preview, Qwen2.5 series, DeepSeek-V2.5, and Mistral-Large-Instruct-2411. The benchmark comparisons include LiveCodeBench (2024) and USACO (2024)." 87 }, 88 "ablation_study": { 89 "applies": true, 90 "answer": false, 91 "justification": "There is no ablation study of the benchmark's components. The paper does not systematically vary aspects of the evaluation method (e.g., number of attempts, prompt design) to understand their individual contributions." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": true, 96 "justification": "The paper reports Elo ratings (overall and per-division), pass rates at different difficulty levels (Easy/Medium/Hard), and pass@n for n=1,2,4,8, as shown in Table 3." 97 }, 98 "human_evaluation": { 99 "applies": true, 100 "answer": false, 101 "justification": "No human evaluation of the system's outputs is performed. The evaluation is entirely automated through the CodeForces platform judging system. 
Human-comparable Elo ratings are computed from platform data, but no humans evaluate the quality of model-generated code beyond pass/fail." 102 }, 103 "held_out_test_set": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper uses contest problems from May 4, 2024 to November 4, 2024, and explicitly states they 'only tested on the recently held contests to avoid data contamination' (Section 3.1). The test cases are hidden on the CodeForces platform and unavailable to models." 107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": true, 111 "justification": "Extensive breakdowns are provided: by contest division (Div. 1+2, 2, 3, 4), by problem difficulty (Easy/Medium/Hard), by algorithm tag (16 categories in Table 4), and by programming language (C++ vs Python in Figure 2)." 112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 5.1 discusses algorithm categories where models struggle: 'they struggle with dp (DP), dfs and similar (DFS), and trees (Tr.), with many models failing to solve even a single problem under these algorithms.' Section 5.2 discusses the Python vs C++ performance gap as a failure mode." 117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper reports that most models 'struggle even with the easiest problems, placing in the lowest 25 percent among all human participants' (Abstract). It also reports that models perform worse in Python than C++ despite defaulting to Python, which is a finding that contradicts assumptions in prior benchmarks." 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "The abstract claims are supported: o1-mini achieving Elo 1578 and QwQ-32B-Preview achieving 1261 are shown in Figure 1 and Table 3. 
The claim that 'other models struggle even with the easiest problems, placing in the lowest 25 percent' is supported by Table 3 and Table 6 percentile data." 129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper makes implicit causal claims, e.g., 'This suggests that increasing the length of the chain-of-thought (CoT) is a promising way to improve the models' reasoning ability' (Section 4.2). This causal inference is drawn from observing that o1-like reasoning models perform better, but this is confounded by many other differences between models (training data, architecture, scale). No controlled experiment isolates CoT length as the causal factor." 134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": false, 138 "justification": "The title claims to benchmark 'Competition-level Code Generation of LLMs' generally, but the evaluation is limited to CodeForces problems only, using C++ as the primary language, with a specific time window (May-Nov 2024). The paper does not explicitly bound its findings to CodeForces-style competitive programming rather than competition-level coding in general." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper does not discuss alternative explanations for its main findings. For example, the C++ vs Python performance gap could be due to training data composition rather than runtime efficiency, but this is not explored. The strong performance of o1-like models could be due to factors beyond CoT length." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix A (Table 5) lists all 33 models with specific version identifiers and HuggingFace endpoints. Proprietary models include date-stamped versions: 'Claude-3-5-Sonnet-2024-10-22', 'ChatGPT-4o-latest-2024-11-20'. 
Open-source models have exact HuggingFace model paths. However, o1-mini lacks a specific snapshot date." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "The exact prompt is provided in Section 4.1: 'You are a coding expert. Given a competition-level coding problem, you need to write a C++ program to solve it. You may start by outlining your thought process. In the end, please provide the complete code in a code block enclosed with ``` ```.'" 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Appendix B reports decoding hyperparameters: 'temperature=0.7, top_p=0.8, top_k=20, and repetition_penalty=1.1' for open-source models, maximum output tokens of 4,096 (32,768 for QwQ-32B-Preview). Proprietary models use 'API calls with default parameters.'" 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The evaluation scaffold is described: models receive the problem, generate a solution, the code block is parsed, and an automatic submission bot submits it to CodeForces (Section 3.3.1). Up to 8 attempts per problem, no time penalty, but failed-attempt penalties are counted. This is a simple scaffold (prompt → generate → submit) without iterative feedback loops." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 3.1 describes data collection: problems scraped from CodeForces in raw HTML format, parsed into sections (problem description, input format, output format, examples, notes). Section 3.2 describes classification by division, difficulty rating, and algorithm tags. Section 4.1 explains that Div. 1 contests were discarded and the time window was May 4 - November 4, 2024." 
171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6.2 is dedicated to 'Limitations' and discusses two specific limitations: the 8-submission limit per problem and reliance on the CodeForces platform for judging." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 6.2 discusses specific threats: (1) the 8-submission cap may underestimate model Elo ratings; (2) platform dependency means offline evaluation is not possible. These are specific to this study rather than generic boilerplate." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound generalization to CodeForces-only, does not state that results may not transfer to other competition platforms (AtCoder, LeetCode contests), and does not clarify that the Elo ratings are specific to the tested time period and problem set." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The dataset is publicly available at https://hf.co/datasets/Qwen/CodeElo, which includes the problems with metadata. The CodeForces platform provides the ground truth judging, which is independently verifiable." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 3.1 describes data collection from CodeForces: all problems from rated contests, recently held contests selected for testing. Section 4.1 specifies the time window (May 4 - November 4, 2024), totaling 54 contests and 387 problems. Table 2 provides statistics per division." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants were recruited for this study. 
The benchmark evaluates LLMs, and human Elo ratings are taken from publicly available CodeForces platform data." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: problems scraped from CodeForces in HTML → parsed into sections → classified by division/difficulty/tags → models prompted → solutions submitted via bot → results collected → Elo ratings calculated. Section 4.1 explains discarding Div. 1 contests. The flow from collection to final analysis is traceable." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source or acknowledgment of financial support is mentioned in the paper. All authors are from Alibaba Group (Qwen Team), but there is no explicit funding disclosure." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All authors are listed as 'Qwen Team, Alibaba Group' with institutional email addresses. The affiliation is clearly stated." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "All authors are from Alibaba Group, which develops the Qwen model family. Several Qwen models are evaluated in the benchmark. Alibaba has a financial interest in demonstrating Qwen models' capabilities. The funder (employer) is not independent of the outcome." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is included in the paper. The authors work at Alibaba, which develops and markets several of the evaluated models (Qwen2.5 series, QwQ-32B-Preview), but this conflict is not explicitly acknowledged." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper does not state the training data cutoff dates for any of the 33 evaluated models. Section 3.1 mentions using 'recently held contests' to avoid contamination, but the actual training cutoffs are not reported." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 3.1 states: 'we only tested on the recently held contests to avoid data contamination.' The temporal selection (May-November 2024 contests) is explicitly chosen to reduce overlap with training data. However, no formal analysis of whether any model's training data could include these problems is provided." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "The paper directly addresses contamination as a key design consideration. Section 3.1 explicitly states temporal selection to avoid it. Section 2 notes that LiveCodeBench 'avoids contamination by re-scraping new problems every month.' CODEELO's online update mechanism is designed to maintain contamination-free evaluation. The benchmark also supports ongoing updates with new contest problems." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. The benchmark evaluates LLMs; human Elo ratings are from existing CodeForces platform data." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 
271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference cost, API spend, or tokens consumed are reported. The paper evaluates 33 models across 387 problems with up to 8 attempts each, but the total cost is not mentioned." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No compute budget is stated. The paper mentions using vLLM for open-source model inference but does not report GPU hours, hardware used, or total computational resources." 298 } 299 } 300 }, 301 "claims": [ 302 { 303 "claim": "o1-mini achieves the best Elo rating of 1578, surpassing nearly 90 percent of human participants on CodeForces.", 304 "evidence": "Figure 1 and Table 3 show o1-mini at Elo 1578 with 89.2 percentile rank. Table 6 confirms the 90th percentile human rating is 1603, making 1578 approximately the 89th percentile.", 305 "supported": "strong" 306 }, 307 { 308 "claim": "QwQ-32B-Preview stands out among open-source models with an Elo rating of 1261, placing around the 60th percentile.", 309 "evidence": "Table 3 shows QwQ-32B-Preview at 1261 (63.6 percentile). 
Table 6 shows the 60th percentile is 1218, consistent with the claim.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Most models struggle even with the easiest problems and fall within the lowest 10th percentile of Elo ratings among human participants.", 314 "evidence": "Table 3 shows that all open-source models except QwQ-32B-Preview, and the non-o1 proprietary models, have Elo ratings below 710, which Table 6 places below the 25th percentile. Many models fall below 383 (10th percentile).", 315 "supported": "strong" 316 }, 317 { 318 "claim": "Increasing the length of chain-of-thought (CoT) is a promising way to improve models' reasoning ability.", 319 "evidence": "This claim is based on the observation that o1-mini and QwQ-32B-Preview (both o1-like reasoning models) significantly outperform other models. Section 4.2. However, this is a correlation, not a controlled experiment isolating CoT length.", 320 "supported": "weak" 321 }, 322 { 323 "claim": "Models perform better in C++ than Python on competition-level coding problems.", 324 "evidence": "Figure 2 shows Elo ratings for 5 models when constrained to C++ vs Python. All models achieve higher ratings with C++. Section 5.2 discusses this finding in detail.", 325 "supported": "moderate" 326 }, 327 { 328 "claim": "CODEELO achieves zero false positives by submitting solutions directly to the CodeForces platform.", 329 "evidence": "Section 3.3.1 explains that solutions are submitted to CodeForces and judged against the platform's hidden test cases and special judges. 
Since the platform's judgment is the ground truth for competitive programming, this achieves zero false positives by definition.", 330 "supported": "strong" 331 }, 332 { 333 "claim": "About 30% of CodeForces problems require special judges.", 334 "evidence": "Appendix F states: 'We conducted an empirical study and found that 30 out of 100 randomly selected problems required special judges.'", 335 "supported": "moderate" 336 } 337 ], 338 "methodology_tags": [ 339 "benchmark-eval" 340 ], 341 "key_findings": "CODEELO introduces a competition-level code generation benchmark using CodeForces problems with direct platform submission for zero false-positive judging and human-comparable Elo ratings. Testing 33 LLMs, the paper finds that o1-mini (Elo 1578, ~89th percentile) and QwQ-32B-Preview (Elo 1261, ~64th percentile) dramatically outperform all other models, most of which fall below the 20th percentile of human participants. An unexpected finding is that all tested models perform better when generating C++ rather than Python, despite defaulting to Python, contradicting prior benchmarks' exclusive use of Python evaluation.", 342 "red_flags": [ 343 { 344 "flag": "Conflict of interest: Alibaba evaluating Qwen models", 345 "detail": "All authors are from Alibaba's Qwen Team. Multiple Qwen models are evaluated, and QwQ-32B-Preview is highlighted as the best open-source model. While the benchmark evaluates many non-Qwen models too, the undisclosed conflict of interest is a concern. No competing interests statement is included." 346 }, 347 { 348 "flag": "No statistical significance tests for comparative claims", 349 "detail": "The paper makes numerous claims about models 'standing out significantly' and performing better than others, but no statistical tests are applied. Given the variance shown in Figure 3 (std dev 300-500 per contest), some of the smaller differences between models may not be statistically significant." 
350 }, 351 { 352 "flag": "Evaluation scaffold code withheld", 353 "detail": "The submission bot and evaluation scaffold are not released (Section 8), making independent verification of the evaluation procedure difficult. Researchers must independently implement the submission pipeline." 354 }, 355 { 356 "flag": "Causal claims from correlational evidence", 357 "detail": "The claim that 'increasing the length of chain-of-thought is a promising way to improve models' reasoning ability' (Section 4.2) is based on observing that o1-like models perform better, but many confounding factors differ between these models." 358 }, 359 { 360 "flag": "No compute or cost reporting", 361 "detail": "Running 33 models across 387 problems with 8 attempts each represents substantial compute, but no cost or compute budget is reported, making practical reproducibility assessment impossible." 362 } 363 ], 364 "cited_papers": [ 365 { 366 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 367 "authors": ["Naman Jain", "King Han", "Alex Gu"], 368 "year": 2024, 369 "arxiv_id": "2403.07974", 370 "relevance": "Contamination-free code benchmark that CODEELO directly compares against; relevant to benchmark evaluation methodology." 371 }, 372 { 373 "title": "Competition-level code generation with AlphaCode", 374 "authors": ["Yujia Li", "David Choi", "Junyoung Chung"], 375 "year": 2022, 376 "relevance": "Foundational work on competition-level code generation using CodeForces, providing context for CODEELO's benchmark design." 377 }, 378 { 379 "title": "Can Language Models Solve Olympiad Programming?", 380 "authors": ["Quan Shi", "Michael Tang", "Karthik Narasimhan", "Shunyu Yao"], 381 "year": 2024, 382 "arxiv_id": "2404.10952", 383 "relevance": "USACO benchmark for competition-level programming evaluation, directly compared in Table 1." 
384 }, 385 { 386 "title": "Evaluating Large Language Models Trained on Code", 387 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 388 "year": 2021, 389 "arxiv_id": "2107.03374", 390 "relevance": "HumanEval benchmark paper, foundational for LLM code generation evaluation." 391 }, 392 { 393 "title": "Measuring Coding Challenge Competence with APPS", 394 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 395 "year": 2021, 396 "arxiv_id": "2105.09938", 397 "relevance": "Early competition-level coding benchmark directly compared against CODEELO." 398 }, 399 { 400 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 401 "authors": ["Terry Yue Zhuo", "Minh Chien Vu", "Jenny Chim"], 402 "year": 2024, 403 "arxiv_id": "2406.15877", 404 "relevance": "General code benchmark with complex instructions, contrasted with competition-level benchmarks." 405 }, 406 { 407 "title": "Program Synthesis with Large Language Models", 408 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 409 "year": 2021, 410 "arxiv_id": "2108.07732", 411 "relevance": "MBPP benchmark paper, foundational for LLM code generation evaluation methodology." 412 }, 413 { 414 "title": "Qwen2.5-Coder Technical Report", 415 "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"], 416 "year": 2024, 417 "arxiv_id": "2409.12186", 418 "relevance": "Technical report for one of the major model families evaluated in the benchmark." 419 }, 420 { 421 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 422 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 423 "year": 2024, 424 "arxiv_id": "2401.14196", 425 "relevance": "Technical report for DeepSeek-Coder models, a major model family evaluated in the benchmark." 
426 }, 427 { 428 "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", 429 "authors": ["Qihao Zhu", "Daya Guo", "Zhihong Shao"], 430 "year": 2024, 431 "arxiv_id": "2406.11931", 432 "relevance": "State-of-the-art open-source code model evaluated as a competitive baseline." 433 }, 434 { 435 "title": "The Llama 3 Herd of Models", 436 "authors": ["Abhimanyu Dubey", "Abhinav Jauhri", "Abhinav Pandey"], 437 "year": 2024, 438 "arxiv_id": "2407.21783", 439 "relevance": "Technical report for Llama 3.1 models used as baselines in the benchmark evaluation." 440 } 441 ] 442 }