scan.json (27392B)
1 { 2 "paper": { 3 "title": "Benchmarking Correctness and Security in Multi-Turn Code Generation", 4 "authors": ["Ruchit Rawal", "Jeffrey Yang Fan Chiang", "Chihao Shen", "Jeffery Siyuan Tian", "Aastha Mahajan", "Tom Goldstein", "Yizheng Chen"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.13859" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Multi-turn code generation degrades correctness and security by 20-27% compared to single-turn, even for state-of-the-art models. The degradation is not explained by increased context length alone but by the challenge of reasoning over interdependent turns. Code-diff generation produces more insecure code than full-program generation. Agent scaffoldings (Aider, Codex, OpenHands) improve single-turn performance but are less effective in multi-turn scenarios.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper states 'We release our dataset here, and code here' in Section 1, and provides a HuggingFace link (https://huggingface.co/datasets/ai-sec-lab/mt-sec) in the abstract." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The dataset is released on HuggingFace at https://huggingface.co/datasets/ai-sec-lab/mt-sec as stated in the abstract." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions using NVIDIA A40 and A100 GPUs (Appendix B) but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper. The code is referenced but no README or reproduction guide is described." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables 1-11 are reported as point estimates (e.g., '51.9% C&S') with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., '20-27% drop') but reports no statistical significance tests. Differences are based on comparing raw percentages." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports absolute percentage drops with baselines (e.g., 'C&S score of Aider + GPT-5T decreases by 27.3%, falling from 53% to 25.7%'), providing sufficient context for effect magnitude." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The benchmark contains 2,376 multi-turn tasks from 792 seed prompts, but no justification for why this size is sufficient or any power analysis is provided." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread across runs is reported. Results appear to be single-run evaluations with zero temperature for non-reasoning models." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Single-turn performance serves as the primary baseline. The MT-Random control condition is also included. 32 models and 3 agent frameworks are compared." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include contemporary models like GPT-5, Claude Opus 4, O4-Mini, Gemini 2.5 Pro, and DeepSeek-R1, which are state-of-the-art at time of evaluation." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple ablation studies are provided: MT-Random control (Section 4), security policy insertion points (Table 2), Aider agent component ablation (Table 10), and editing format comparison (Table 11)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper uses two primary metrics: Correct & Secure (C&S) and Correct & Insecure (C&I), and also reports aggregated correctness (C&S + C&I)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Human verification by three security experts evaluated task faithfulness (93.1% accepted) and interaction-type alignment (91.6% agreement) of the generated benchmark tasks (Section 3)." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The benchmark reuses dynamic correctness and security tests from the seed single-turn datasets (SecCodePLT, BaxBench), which are separate from the generation prompts." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by interaction type (expansion, editing, refactor), by model, and per-CWE analysis is mentioned. Table 1 provides full per-model breakdowns." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Extensive qualitative failure analysis is provided in Appendix D (forgetting security instructions, over-refusals) and Appendix F (agent failure modes for OpenHands, Codex, Aider)." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Key negative results include: agent scaffoldings underperform in multi-turn (despite improving single-turn), security policies don't close the single-to-multi-turn gap, every-turn policy can hurt performance, and over-refusals in thinking models." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims (20-27% C&S drop, code-diffs less secure, agents less effective in multi-turn) are all supported by Tables 1, 3, and surrounding analysis." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The MT-Random control experiment (Fig. 3) provides a controlled comparison to isolate the effect of multi-turn reasoning from context length, supporting the causal claim that degradation comes from cross-turn dependencies rather than context length alone." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Multi-Turn Code Generation' broadly but results are limited to tasks derived from two specific benchmarks (SecCodePLT, BaxBench) covering 27 CWEs. The paper does not bound generalizations to these specific task types or acknowledge that real-world multi-turn coding may differ from synthetic transformations." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper explicitly tests alternative explanations: context length (MT-Random experiment, Section 4), target task length (Appendix C.1), and provides the longer single-turn baseline comparison (Table 6)." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures pass/fail on dynamic unit tests and frames this as evaluating 'correctness and security' in 'real-world coding workflows.' It does not discuss the gap between passing unit tests and actual real-world security, nor acknowledge that their synthetic multi-turn tasks are a proxy for actual developer workflows." 
135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Table 5 in Appendix A provides exact model checkpoints for all 32+ models (e.g., 'gpt-5-2025-08-07', 'claude-opus-4-20250514', 'o4-mini-2025-04-16')." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt templates for multi-turn generation (editing, expansion, refactor) are provided in Appendix G (Sections G.1-G.4), including the targeted regeneration prompt." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix B states: zero temperature for non-reasoning models, 0.7 for reasoning models, thinking budget set to 'low' or 4000 tokens. Agent configurations are detailed in Appendix E." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "Agent scaffolding is described in detail in Appendix E.1 for all three agents (OpenHands, Codex, Aider), including versions, configurations, and interaction setup." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The seed prompt selection criteria are documented (Section 3): dynamic tests required, longest implementations selected, 22-24 per CWE for SecCodePLT. Consistency guardrails and human verification processes are detailed." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "The Discussion & Conclusions section (Section 5) discusses limitations including lack of intermediate turn evaluation and quality fluctuation during turns." 
169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations discussed are generic ('we do not evaluate the quality of intermediate code') rather than specific threats like the validity of synthetic multi-turn transformations as a proxy for real developer interactions, or selection bias in CWE coverage." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the specific synthetic transformation approach, the two source benchmarks, or the specific CWEs tested." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The dataset is released on HuggingFace (https://huggingface.co/datasets/ai-sec-lab/mt-sec) for independent verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3 describes the full data construction pipeline: seed prompt selection, synthetic dialogue generation using GPT-4o, consistency guardrails, and human verification." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "For human verification, the paper states 'three security experts (two authors and one external volunteer)' but does not describe how the external volunteer was recruited or whether this introduces bias." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The three-stage pipeline is documented: seed prompt selection (with counts), synthetic generation with guardrails (up to 3 regenerations), and human verification with acceptance rates (93.1% faithfulness, 91.6% alignment)." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgments section lists funding: Open Philanthropy, NSF CAREER Award CNS-2442719, DARPA TIAMAT, NSF TRAILS Institute, Capital One Bank, Amazon Research Award, Google DeepMind, and OpenAI." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All authors are affiliated with University of Maryland, College Park, as stated in the paper header." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "The paper received gifts from Google DeepMind and OpenAI, whose models (Gemini, GPT series) are evaluated. These funders have a financial interest in how their models perform on benchmarks." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is provided in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the 32 models evaluated." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The benchmark is constructed from SecCodePLT and BaxBench, which are public datasets. No discussion of whether these appeared in model training data." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "SecCodePLT and BaxBench are publicly available benchmarks that may have been in training data for models evaluated. This contamination risk is not discussed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in an experimental study. The human verification of benchmark quality is not a human subjects study." 
247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human subjects study. The paper is a benchmark evaluation of LLMs." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in an experimental study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in an experimental study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in an experimental study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in an experimental study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in an experimental study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper mentions 'substantial cost of running evaluations' as a reason for limiting BaxBench experiments, but does not report actual API costs, tokens consumed, or per-example costs." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Appendix B states: 'two NVIDIA A40 GPUs, each with 48GB of memory, and two NVIDIA A100 GPUs, each with 82GB of memory' for open-source model experiments." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Results appear to be single-run: non-reasoning models use zero temperature, while reasoning models are sampled at temperature 0.7 (Appendix B), where outputs can vary between runs. No seed sensitivity analysis is reported." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper does not explicitly state the number of runs per evaluation. 
Appendix B mentions 'we generate one sample per environment-task' for BaxBench but does not clarify this for SecCodePLT." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search is described. Temperature and thinking budgets appear to use defaults or recommended values without search." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "For agent configurations, the paper uses default settings without justifying why these are optimal or comparing alternatives systematically (though Table 11 compares editing formats)." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed, so no multiple comparison correction is applied despite comparing 32+ models across multiple conditions." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors create the benchmark and evaluate all models on it. No discussion of author-evaluation bias or independent evaluation." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "No analysis of performance as a function of compute budget. Agent evaluations use different compute profiles without normalization." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "The MT-Random experiment (Section 4, Fig. 3) validates that the benchmark measures multi-turn reasoning rather than just context length effects. Human verification of task faithfulness and interaction-type alignment (Section 3) adds further support, although the paper does not validate the synthetic tasks against real developer interactions." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "Agent scaffoldings (Aider, OpenHands, Codex) are compared against base LLMs, but different agents use different scaffolding. 
When comparing agents, the scaffold-model confound is not explicitly addressed as a variable." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "SecCodePLT and BaxBench are publicly available datasets. The paper does not discuss whether models trained after their publication may have seen solutions." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information through context or whether multi-turn prompts provide hints not available in real usage." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The three interaction types for each seed prompt share the same tests. The paper does not discuss whether results across interaction types are non-independent." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention method is applied despite using publicly available benchmarks with models that could have seen them." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "There is a consistent 20-27% drop in 'correct & secure' outputs from single-turn to multi-turn settings across state-of-the-art models.", 364 "evidence": "Table 1 shows agent-scaffolded models experience 25-27% decline in C&S for MT-expansion, 14-17% for MT-editing, and 10-12% for MT-refactor. Non-agentic LLMs show 15-20% drops.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Performance degradation is not solely due to longer context length.", 369 "evidence": "MT-Random experiment (Fig. 
3) shows that replacing first two turns with unrelated prompts (preserving context length) yields performance comparable to single-turn, while real multi-turn degrades substantially.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Code-diff generation produces more incorrect and insecure code than full-program generation.", 374 "evidence": "Table 3 shows consistent decline in C&S and increase in C&I across all 6 models tested when generating code-diffs vs full code (e.g., O4-Mini drops 10.5% in C&S).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Agent scaffoldings boost single-turn performance but are not as effective in multi-turn scenarios.", 379 "evidence": "Table 9 shows agents achieve higher single-turn C&S (e.g., O4-Mini Agent: 68.8% vs LLM: 56.8%) but lower multi-turn C&S (Agent: 33.0% vs LLM: 38.7% for MT-expansion).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Thinking models exhibit over-refusals in multi-turn settings that don't occur in single-turn.", 384 "evidence": "Section 4 reports Claude Sonnet 4T has 2.7% over-refusal rate and O3T has 0.8% in multi-turn, with zero refusals in single-turn. Verified by regex-based heuristic plus manual inspection.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Security policy insertion in the final turn is most effective, sometimes outperforming every-turn insertion.", 389 "evidence": "Table 2 shows final-turn insertion yields best C&S for OpenAI models (e.g., O3T: 34.8% final-turn vs 33.1% every-turn). Qualitative analysis in Appendix D.3 explains the mechanism.", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "No statistical significance tests", 396 "detail": "All comparative claims (20-27% drop, model rankings) are based on raw percentage comparisons without any significance tests, despite the potential for measurement noise." 
397 }, 398 { 399 "flag": "No variance or multiple-run reporting", 400 "detail": "Results appear to be single-run evaluations. Even with zero temperature, API-based models may have non-deterministic behavior, and no robustness checks are reported." 401 }, 402 { 403 "flag": "Benchmark contamination unaddressed", 404 "detail": "The benchmark is derived from publicly available SecCodePLT and BaxBench. Models trained after these datasets' publication may have seen their solutions, potentially inflating single-turn scores and underestimating the true multi-turn gap." 405 }, 406 { 407 "flag": "Funding from evaluated companies", 408 "detail": "The research received gifts from Google DeepMind and OpenAI, whose models are prominently evaluated. No competing interests statement is provided." 409 }, 410 { 411 "flag": "Synthetic multi-turn validity", 412 "detail": "Multi-turn tasks are synthetically generated by GPT-4o transforming single-turn prompts. The paper assumes these transformations represent 'real-world coding workflows' but does not validate this against actual developer interaction logs." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI", 418 "authors": ["Yu Yang", "Yuzhou Nie", "Zhun Wang", "Yuheng Tang", "Wenbo Guo", "Bo Li", "Dawn Song"], 419 "year": 2024, 420 "arxiv_id": "2410.11096", 421 "relevance": "Source benchmark for MT-Sec; evaluates correctness and security of LLM-generated code using dynamic tests." 422 }, 423 { 424 "title": "BaxBench: Can LLMs Generate Correct and Secure Backends?", 425 "authors": ["Mark Vero", "Niels Mündler", "Victor Chibotaru", "Veselin Raychev", "Maximilian Baader", "Nikola Jovanović", "Jingxuan He", "Martin Vechev"], 426 "year": 2025, 427 "arxiv_id": "2502.11844", 428 "relevance": "Source benchmark for MT-Sec; evaluates LLMs on self-contained backend application generation with security tests." 
429 }, 430 { 431 "title": "SecRepoBench: Benchmarking LLMs for Secure Code Generation in Real-World Repositories", 432 "authors": ["Connor Dilgren", "Purva Chiniya", "Luke Griffith", "Yu Ding", "Yizheng Chen"], 433 "year": 2025, 434 "arxiv_id": "2504.21205", 435 "relevance": "Evaluates LLM secure code generation in repository-level contexts, extending beyond isolated tasks." 436 }, 437 { 438 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 439 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 440 "year": 2025, 441 "relevance": "Early foundational work evaluating security vulnerabilities in Copilot-generated code." 442 }, 443 { 444 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 445 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 446 "year": 2024, 447 "arxiv_id": "2407.16741", 448 "relevance": "One of three agent frameworks evaluated; open-source agent platform for software development tasks." 449 }, 450 { 451 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 452 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 453 "year": 2024, 454 "arxiv_id": "2405.15793", 455 "relevance": "Influential agentic coding framework that relies on multi-turn interactions for software engineering tasks." 456 }, 457 { 458 "title": "Multi-IF: Benchmarking LLMs on Multi-Turn and Multilingual Instructions Following", 459 "authors": ["Yun He"], 460 "year": 2024, 461 "arxiv_id": "2410.15553", 462 "relevance": "Showed LLMs struggle with consistent instruction-following across multiple turns in NLP tasks." 
463 }, 464 { 465 "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis", 466 "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"], 467 "year": 2022, 468 "arxiv_id": "2203.13474", 469 "relevance": "Prior multi-turn code generation benchmark that factorizes problems into sub-instructions." 470 }, 471 { 472 "title": "MINT: Evaluating LLMs in Multi-Turn Interaction with Tools and Language Feedback", 473 "authors": ["Xingyao Wang", "Zihan Wang", "Jiateng Liu"], 474 "year": 2023, 475 "arxiv_id": "2309.10691", 476 "relevance": "Multi-turn evaluation benchmark testing LLM problem-solving with tool and language feedback." 477 }, 478 { 479 "title": "CWEval: Outcome-Driven Evaluation on Functionality and Security of LLM Code Generation", 480 "authors": ["Jinjun Peng", "Leyi Cui", "Kele Huang", "Junfeng Yang", "Baishakhi Ray"], 481 "year": 2025, 482 "arxiv_id": "2501.08200", 483 "relevance": "Dynamic evaluation of LLM code generation security, showed static analyzers produce high false positive rates." 484 }, 485 { 486 "title": "Scaling Laws for Neural Language Models", 487 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 488 "year": 2020, 489 "arxiv_id": "2001.08361", 490 "relevance": "Foundational scaling laws paper referenced for model size-performance trends observed in the evaluation." 491 }, 492 { 493 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 494 "authors": ["Daya Guo"], 495 "year": 2025, 496 "arxiv_id": "2501.12948", 497 "relevance": "Reasoning model evaluated in the benchmark; relevant to understanding thinking model performance patterns." 498 } 499 ] 500 }