scan.json (24794B)
1 { 2 "paper": { 3 "title": "Qwen2.5 Technical Report", 4 "authors": ["An Yang", "Baosong Yang", "Beichen Zhang", "Binyuan Hui", "Bo Zheng"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2412.15115", 8 "doi": "10.48550/arXiv.2412.15115" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Qwen2.5 scales pre-training data from 7T to 18T tokens and introduces multi-stage reinforcement learning (DPO + GRPO) for post-training. The open-weight 72B model matches or exceeds Llama-3-405B-Instruct on many benchmarks despite being ~5x smaller. MoE variants (Qwen2.5-Turbo, Qwen2.5-Plus) offer competitive performance at lower cost. Qwen2.5-Turbo handles up to 1M token context length with 100% passkey retrieval accuracy.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper links to https://github.com/QwenLM/Qwen2.5 and models are available on Hugging Face, ModelScope, and Kaggle. Over 100 models released." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The 18T token pre-training dataset and 1M+ SFT examples are not released. Only the model weights are open. No training data is made publicly available." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment specifications are provided in the paper. Hardware and dependency details are not described." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions for training or evaluation are provided. The paper describes methodology at a high level but does not give reproducible commands or scripts." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All benchmark results are reported as point estimates without confidence intervals or error bars. Tables 2-17 show only single numbers." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes numerous comparative claims (e.g., 'outperforms', 'surpasses') but no statistical significance tests are reported anywhere." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results are reported with absolute scores and baselines, allowing readers to compute effect sizes. E.g., MATH scores of 62.1 vs 50.9 (Qwen2-72B) or 53.8 (Llama-3-405B) provide sufficient context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for benchmark sample sizes or discussion of whether benchmark sizes are sufficient for the claims made." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or multiple-run statistics are reported. All results appear to be single-run evaluations." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Extensive baselines are included: Llama-3/3.1 series, Gemma2 series, Mixtral, GPT-4o/4o-mini, Claude 3.5 Sonnet, Yi-1.5, Mistral, Phi-3.5, MiniCPM3, and predecessor Qwen2 models." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include contemporary models from 2024: Llama-3.1-405B, GPT-4o, Claude 3.5 Sonnet, Gemma2, which were state-of-the-art at time of writing." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Ablation on DCA+YARN is shown in Tables 16-17, demonstrating the contribution of length extrapolation techniques. Scaling law experiments study hyperparameter effects across model sizes." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Extensive metrics across categories: MMLU, MMLU-Pro, BBH, ARC-C, TruthfulQA, MATH, GSM8K, HumanEval, MBPP, MultiPL-E, IFEval, Arena-Hard, MTBench, RULER, LV-Eval, etc." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Human preference alignment is evaluated via Arena-Hard and MT-Bench. Human and automated labeling processes are used for reward model training data. In-house evaluations include human preference benchmarks." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Standard benchmarks with established test splits are used. The paper describes decontamination via n-gram matching (Section 5) to prevent test data leakage." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by category: general tasks, math & science, coding, multilingual, alignment, long context. In-house evaluations further break down by IF, Knowledge, Comprehension, Coding, Math, Reasoning." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No failure cases or error analysis are discussed. The paper does not examine where Qwen2.5 fails or performs poorly relative to expectations." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "No negative results are reported. Every comparison shows improvement or competitive performance. The DCA+YARN ablation shows degradation when removed, but no failed approaches or configurations are discussed." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims about competitive performance with Llama-3-405B and GPT-4o are supported by Tables 2, 6, and 7. Claims about 18T tokens and 1M+ SFT samples are described in Sections 3 and 4." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper attributes improvements to data scaling (7T→18T), better data filtering, and post-training techniques, but no controlled experiments isolate individual contributions. The causal chain from specific changes to performance gains is asserted, not demonstrated." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract and introduction use broad framing ('comprehensive series', 'meet diverse needs') while results are on specific benchmarks. No explicit bounding of generalization claims to tested settings." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations for performance improvements are discussed. The paper does not consider whether benchmark scores could be influenced by factors other than the claimed improvements." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses benchmark scores as proxies for capabilities like 'reasoning', 'understanding', and 'language comprehension' without discussing the gap between what benchmarks measure and what these capability labels imply." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Baseline model versions are specified: GPT-4o-2024-08-06, GPT-4o-2024-11-20, Claude3.5-sonnet-2024-10-22, Llama-3.1-70B-Instruct, etc. Qwen2.5 model sizes and architectures are detailed in Table 1." 
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "No evaluation prompts or system instructions are provided. The few-shot settings are described (e.g., '5-shot', '0-shot') but actual prompt text is not given." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Training hyperparameters are reported: learning rates (7×10⁻⁶ to 7×10⁻⁷), weight decay (0.1), gradient clipping (1.0), batch sizes (2048), SFT epochs (2), DPO pairs (150K). Architecture details in Table 1." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. This is a model training and benchmark evaluation paper." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.1 describes data filtering using Qwen2-Instruct as quality filters, domain rebalancing strategy, synthetic data generation and filtering. Section 4.1 describes SFT data curation including back-translation, rejection sampling, and response filtering." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no limitations section. The paper ends with a Conclusion (Section 6) and future work directions but does not discuss limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address potential issues with its evaluation methodology or claims." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show or what settings are excluded from its claims." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw evaluation data or training data is made available. Only aggregated benchmark scores are reported." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Pre-training data collection is described in Section 3.1 (quality filtering, domain rebalancing, synthetic data). Post-training data collection is described in Sections 4.1-4.3 with specific procedures for each capability area." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks and internally curated datasets." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The training pipeline is documented: pre-training (4K→32K context extension), SFT (1M+ examples, 2 epochs), offline RL (150K DPO pairs, 1 epoch), online RL (GRPO with reward model). Data filtering steps described in Section 3.1." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section. The paper does not disclose funding sources." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Authors are identified as 'Qwen Team' with links to Hugging Face and ModelScope pages associated with Alibaba/Qwen." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Alibaba Cloud is both the developer and evaluator of Qwen2.5. The proprietary models are offered through Alibaba Cloud Model Studio. The funder has direct financial interest in positive results." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement. 
Alibaba has commercial interest in Qwen2.5 models (offered via Alibaba Cloud), but this is not explicitly declared." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No explicit training data cutoff date is stated. The paper does not specify when the 18T token pre-training data was collected." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Section 5 describes n-gram matching decontamination: a training sequence s_t is removed if its longest common subsequence (LCS) with some test sequence s_e satisfies |LCS| ≥ 13 and |LCS| ≥ 0.6 × min(|s_t|, |s_e|)." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section 5 explicitly addresses benchmark contamination with a concrete decontamination procedure using longest common subsequence matching between training and test data." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human subjects study. This is a model training and benchmark evaluation paper." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human subjects study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human subjects study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human subjects study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human subjects study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human subjects study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human subjects study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "Inference cost is not quantified. TTFT speedups are shown (Figure 3) but absolute costs, API pricing, or per-example inference times are not reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget is stated. GPU hours, hardware used for training, or total training cost are not disclosed despite training 7+ models on 18T tokens." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No results across multiple random seeds are reported. All benchmark results appear to be single-run evaluations." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of evaluation runs is not stated for any benchmark result." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Scaling laws for hyperparameters are discussed (Section 3.2) but the search budget (number of configurations tried, compute spent on search) is not quantified." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 3.2 describes using scaling laws to predict optimal hyperparameters across model sizes, providing a principled selection criterion rather than arbitrary tuning." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are reported, so no multiple-comparison correction is applied. Given the large number of model-versus-model comparisons the paper claims, a correction would be required for those comparative claims to be statistically supported." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The Qwen team evaluates their own models against competitors without acknowledging self-evaluation bias. No independent evaluation is mentioned." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Performance is not reported as a function of compute budget. The paper claims Qwen2.5-72B matches Llama-3-405B without comparing their training compute budgets." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses many benchmarks without discussing whether they measure the claimed capabilities. Section 5.2.3 does note that RM benchmarks don't predict RL performance, but this is limited to reward models." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in the evaluations. Models are tested directly on benchmarks." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. Many benchmarks used (HumanEval 2021, MMLU 2021, GSM8K 2021) predate the training data, and this is not discussed beyond the n-gram decontamination." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of feature leakage or whether evaluation setups provide information not available in real usage." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether training and test data share structural similarities beyond the n-gram overlap check." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "Section 5 describes a concrete decontamination method using n-gram matching with specific thresholds (LCS ≥ 13 and ≥ 0.6 × min length). This is a concrete prevention method applied to the training data." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Qwen2.5-72B-Instruct outperforms Llama-3.1-405B-Instruct on several benchmarks including MMLU-redux, MATH, MBPP, MultiPL-E, LiveCodeBench, Arena-Hard, and MTBench.", 365 "evidence": "Table 6 shows Qwen2.5-72B-Instruct scores: MMLU-redux 86.8 vs 86.2, MATH 83.1 vs 73.8, MBPP 88.2 vs 84.5, MultiPL-E 75.1 vs 73.5, LiveCodeBench 55.5 vs 41.6, Arena-Hard 81.2 vs 69.3, MTBench 9.35 vs 9.08.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Pre-training data scaling from 7T to 18T tokens significantly improves model capabilities, especially domain expertise.", 370 "evidence": "Figure 1 shows improvement from Qwen2 (7T) to Qwen2.5 (18T) on Math, MBPP, BBH, MMLU. Tables 2-5 show Qwen2.5 outperforming Qwen2 across size variants.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Qwen2.5-Turbo handles up to 1M token context with 100% passkey retrieval accuracy.", 375 "evidence": "Figure 2 shows 100% accuracy on passkey retrieval across all document depths and context lengths up to 1M tokens. 
Table 16 shows RULER scores.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Qwen2.5-Turbo and Qwen2.5-Plus perform competitively against GPT-4o-mini and GPT-4o respectively.", 380 "evidence": "Tables 6-7 show Qwen2.5-Plus outperforming GPT-4o-mini on most benchmarks, and Qwen2.5-Turbo competitive with GPT-4o-mini.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Two-stage reinforcement learning (offline DPO + online GRPO) significantly enhances human preference alignment.", 385 "evidence": "Arena-Hard and MTBench scores in Tables 6-8 show high alignment scores, but no ablation isolating RL contribution vs. SFT alone is provided.", 386 "supported": "weak" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Company evaluating its own product", 392 "detail": "Alibaba's Qwen team evaluates their own Qwen2.5 models. All benchmark runs, baseline reproductions, and in-house evaluations are conducted by the same team that developed the models. No independent evaluation is mentioned." 393 }, 394 { 395 "flag": "No error bars or variance across runs", 396 "detail": "All benchmark results are single point estimates across 20+ benchmarks and 7+ model sizes. No variance, confidence intervals, or multi-run statistics are reported despite LLM evaluation being known to be noisy." 397 }, 398 { 399 "flag": "No negative results", 400 "detail": "Every comparison shows Qwen2.5 improving over Qwen2 and being competitive with or better than all baselines. No failed experiments, unsuccessful configurations, or areas of clear weakness are discussed." 401 }, 402 { 403 "flag": "No limitations section", 404 "detail": "The paper has no limitations, threats to validity, or scope boundaries discussion. This is a significant omission for a paper making broad capability claims about a model family." 
405 }, 406 { 407 "flag": "Undisclosed compute budget", 408 "detail": "Training 7+ dense models and 2 MoE models on 18T tokens with 1M+ SFT examples requires massive compute, but no GPU hours, hardware specifications, or training costs are disclosed." 409 }, 410 { 411 "flag": "Selective benchmark presentation", 412 "detail": "In-house benchmarks (Tables 11-12) are reported alongside open benchmarks but are not publicly available, so readers cannot independently verify these results or how the reported evaluations were selected." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Evaluating large language models trained on code", 418 "authors": ["Mark Chen"], 419 "year": 2021, 420 "arxiv_id": "2107.03374", 421 "relevance": "Introduces HumanEval benchmark for code generation, widely used for LLM code evaluation." 422 }, 423 { 424 "title": "Training language models to follow instructions with human feedback", 425 "authors": ["Long Ouyang"], 426 "year": 2022, 427 "relevance": "Foundational RLHF paper that establishes the SFT + RLHF post-training paradigm used by Qwen2.5." 428 }, 429 { 430 "title": "Direct preference optimization: Your language model is secretly a reward model", 431 "authors": ["Rafael Rafailov"], 432 "year": 2023, 433 "relevance": "DPO method used as the offline RL stage in Qwen2.5's post-training pipeline." 434 }, 435 { 436 "title": "The Llama 3 herd of models", 437 "authors": ["Abhimanyu Dubey"], 438 "year": 2024, 439 "arxiv_id": "2407.21783", 440 "relevance": "Primary open-weight baseline compared against Qwen2.5 across all model sizes." 441 }, 442 { 443 "title": "Training compute-optimal large language models", 444 "authors": ["Jordan Hoffmann"], 445 "year": 2022, 446 "arxiv_id": "2203.15556", 447 "relevance": "Chinchilla scaling laws that inform Qwen2.5's hyperparameter optimization approach." 
448 }, 449 { 450 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 451 "authors": ["Naman Jain"], 452 "year": 2024, 453 "arxiv_id": "2403.07974", 454 "relevance": "Contamination-free code benchmark used to evaluate Qwen2.5 instruction-tuned models." 455 }, 456 { 457 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 458 "authors": ["Jiawei Liu"], 459 "year": 2023, 460 "relevance": "Introduces HumanEval+ and MBPP+ for more rigorous code generation evaluation, used as Qwen2.5 benchmarks." 461 }, 462 { 463 "title": "RewardBench: Evaluating reward models for language modeling", 464 "authors": ["Nathan Lambert"], 465 "year": 2024, 466 "arxiv_id": "2403.13787", 467 "relevance": "Benchmark used to evaluate Qwen2.5's reward model, with discussion of Goodhart's law in RM evaluation." 468 }, 469 { 470 "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models", 471 "authors": ["Zhihong Shao"], 472 "year": 2024, 473 "arxiv_id": "2402.03300", 474 "relevance": "Introduces GRPO method used as Qwen2.5's online RL algorithm." 475 }, 476 { 477 "title": "LiveBench: A challenging, contamination-free LLM benchmark", 478 "authors": ["Colin White"], 479 "year": 2024, 480 "arxiv_id": "2406.19314", 481 "relevance": "Contamination-free benchmark used to evaluate Qwen2.5 instruction-tuned models." 482 } 483 ] 484 }