scan.json (26779B)
1 { 2 "paper": { 3 "title": "Joint Continual Learning of Local Language Models and Cloud Offloading Decisions with Budget Constraints", 4 "authors": [ 5 "Evan Chen", 6 "Wenzhi Fang", 7 "Shiqiang Wang", 8 "Christopher G. Brinton" 9 ], 10 "year": 2026, 11 "venue": "arXiv", 12 "arxiv_id": "2602.00166" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "DA-GRPO, a dual-advantage extension of GRPO, jointly learns task competence and cloud offloading behavior under an explicit cloud usage budget during continual post-training. On math reasoning and code generation benchmarks with Qwen2.5-1.5B and Llama-3.2-3B, DA-GRPO consistently improves post-switch accuracy (up to 75.5% joint accuracy vs 68.5% for GAPG on MATH-lighteval), reduces catastrophic forgetting, and maintains stable cloud usage tracking the target budget. The dual variable λ converges regardless of initialization and adapts smoothly to time-varying collaboration targets.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "All datasets used are publicly available benchmarks: MATH-lighteval, ARC-Easy, ARC-Challenge, TACO-verified, and MMLU." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Table 3 lists training hyperparameters but no environment specifications (Python version, library versions, GPU type, requirements.txt, Dockerfile, or dependency lists) are provided." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided." 
39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Tables 1 and 2 report only point estimates (e.g., '77.2' accuracy) with no confidence intervals, error bars, or ± notation." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "Claims like 'DA-GRPO outperforms' are made by comparing raw numbers across methods. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Tables 1 and 2 report accuracy and forgetting rates for all baselines and proposed method side-by-side, providing sufficient baseline context to judge effect magnitudes (e.g., DA-GRPO 77.2% vs GAPG 67.8% during-task accuracy on MATH-lighteval)." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for the number of training examples, benchmark sizes, or why these particular dataset sizes are adequate for the claims." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported in any table or figure." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Extensive baselines are compared: Edge Tuning Only, Naive Router, trained Router, and collaborative training with GRPO, GVPO, and GAPG (Table 1, Table 2)." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include GRPO (introduced in DeepSeekMath, 2024; popularized by DeepSeek-R1, 2025), GVPO (NeurIPS 2025), and GAPG (2025). All are very recent methods." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Comparing DA-GRPO to plain GRPO effectively ablates the dual-advantage mechanism. Sensitivity analyses on η (Fig. 6), η_λ (Fig. 7), λ initialization (Fig. 4), and varying τ (Figs. 5, 8, 9, 10) further demonstrate parameter contributions." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Three primary metrics are reported: during-task accuracy, post-switch accuracy, and forgetting rate. The collaboration ratio is also tracked as an additional metric." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "All evaluation is fully automated using exact match (math/QA) and test case execution (code). No human evaluation of output quality is included." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "The paper trains on MATH-lighteval and evaluates on MATH-lighteval, similarly for TACO-verified. No explicit separation of training and evaluation splits is described." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by model (Qwen2.5-1.5B, Llama-3.2-3B), task (MATH, TACO, MATH-500, MMLU), and response type (local-solved vs joint local-cloud). Tables 1 and 2 provide detailed per-setting breakdowns." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No qualitative error analysis, specific failure examples, or discussion of where DA-GRPO breaks down is provided. Sensitivity analysis shows parameter regimes causing instability but no diagnosis of specific failure modes." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": false, 112 "justification": "DA-GRPO outperforms all baselines in every setting. While the sensitivity analysis shows parameter extremes causing instability (Figs. 
6-7), no genuinely attempted approaches or configurations that failed to improve are discussed." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims DA-GRPO 'improves post-switch accuracy, substantially reduces forgetting, and maintains stable cloud usage.' Table 1 confirms higher post-switch accuracy and lower forgetting rates, and Figures 4-5 confirm stable cloud usage tracking." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims like 'DA-GRPO improves post-switch accuracy' are supported by controlled comparisons where only the optimization method varies, with identical models, data, and evaluation protocols across baselines." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The introduction frames the work broadly in terms of 'wireless edge systems,' 'consumer devices,' and 'network edge' deployment, but experiments only test 2 SLMs (1.5B, 3B) with 1 cloud model on 4 benchmarks. No actual edge device experiments are conducted." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper states 'We believe that these improvements are driven by improved problem allocation' (Sec 4.1) but does not substantively discuss confounds or alternative explanations for the observed improvements (e.g., different gradient dynamics, effective data augmentation from cloud responses)." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures benchmark accuracy and forgetting rates, and its claims are about benchmark performance and forgetting. The claims match the granularity of the measurements without overclaiming broader capabilities." 
140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "Local models are specified (Qwen2.5-1.5B-Instruct, Llama-3.2-3B-Instruct). However, the cloud model 'Deepseek-R1' is not versioned — DeepSeek-R1 comes in multiple sizes (1.5B, 7B, 8B, 14B, 32B, 70B, 671B) and the specific variant used is not stated." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix F.1 provides the full system prompt and user prompt templates for math, code, and QA tasks, with actual prompt text used in experiments." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Table 3 comprehensively lists hyperparameters: batch size (128), group size (8), max lengths, learning rates, training steps, sampling temperatures, reasoning step limits, and assistance reward values." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The system is a direct RL post-training framework for language models." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper states MMLU is subsampled 'to balance its size with TACO-verified' and TACO-verified is curated 'to retain only problems with a valid and executable test bench,' but exact subsample sizes, filtering criteria, and final dataset statistics are not provided." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "Section 5 is 'Conclusion and Future Work' and mentions two future directions but contains no dedicated limitations discussion. No separate limitations section exists." 
174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats to validity are discussed anywhere in the paper." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show or what settings/populations are excluded. The framing suggests broad applicability to edge deployment without bounding claims to the tested settings." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw experimental data (training logs, per-example predictions, model outputs) are released for independent verification." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "The data sources are clearly identified: MATH-lighteval (all subsets), ARC-Easy, ARC-Challenge, TACO-verified (curated), and MMLU (auxiliary-train split). All are standard public benchmarks." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. All data comes from standard public benchmarks." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The two-phase training protocol is described at a high level, but the exact data processing pipeline (MMLU subsampling procedure, TACO curation criteria, number of examples per task group, batching) is not fully documented." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources or acknowledgments section is present in the paper." 
213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: Purdue University (Elmore Family School of ECE) and University of Exeter (Department of Computer Science)." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Funding is not disclosed, so independence cannot be assessed." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interest declaration is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No training data cutoff dates are stated for either Qwen2.5 or Llama-3.2, despite these being pre-trained models evaluated on public benchmarks." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether the pre-trained models (Qwen2.5, Llama-3.2) may have seen MATH, ARC, TACO, or MMLU data during pre-training." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "MATH (2021), ARC (2018), and MMLU (2020) are all public benchmarks published years before Qwen2.5 and Llama 3.2 were trained. Contamination risk is not discussed." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 
262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "While collaboration ratios (τ = 0.3, 0.5) are tracked, actual inference costs (API costs, latency, tokens consumed, wall-clock time) are not reported." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Training steps are listed (840 for task group 1, 400 for task group 2) but no GPU hours, hardware specifications, total training time, or computational budget are stated." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of multiple random seeds. All results appear to be from single runs with no seed sensitivity analysis." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs producing the reported results is never stated." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper states 'we perform a grid search to identify the optimal configuration' for baselines but does not report the number of configurations tried, search method details, or compute spent on search." 
311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "For baselines, grid search is mentioned but selection criteria and validation protocol are not described. For DA-GRPO, sensitivity analyses are provided but no explicit justification for the final configuration selection." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Comparisons are made across 7+ methods, 2 models, and multiple tasks. No statistical tests are used at all, let alone corrections for multiple comparisons." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors implement all baselines (GRPO, GVPO, GAPG, routers) themselves without acknowledging the self-comparison bias documented by Lucic et al. (2018)." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "Different methods use different amounts of cloud inference (varying τ) but performance is not reported as a function of total compute budget. The compute asymmetry between local and cloud models is not analyzed." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "No discussion of whether MATH-lighteval, TACO-verified, ARC, or MMLU actually measure the capabilities claimed (e.g., whether MATH-lighteval adequately measures 'mathematical reasoning' for the continual learning claims)." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved. All methods use the same direct model training framework." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "MATH (2021), ARC (2018), MMLU (2020) were all published years before Qwen2.5 and Llama 3.2 training. 
The models may have seen solutions during pre-training. This is not discussed." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the RL training setup leaks evaluation information or whether the reward signal introduces feature leakage." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of independence between training and evaluation examples, or potential overlap between task groups." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention method is applied." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "DA-GRPO achieves higher during-task and post-switch accuracy than all baselines across math and coding tasks under continual learning.", 369 "evidence": "Table 1 shows DA-GRPO achieves 84.5% joint during-task and 75.5% joint post-switch accuracy on MATH-lighteval for Qwen2.5-1.5B (vs 78.1%/68.5% for best baseline GAPG), and 84.8%/72.8% on TACO (vs 82.2%/66.6% for GAPG). Similar gains for Llama-3.2-3B.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "DA-GRPO reduces catastrophic forgetting compared to prior collaborative and routing-based approaches.", 374 "evidence": "Table 1 reports forgetting rates. For Qwen2.5-1.5B joint responses: DA-GRPO 10.7% on MATH (vs 12.3% GAPG, 18.6% GRPO) and 14.1% on TACO (vs 19.0% GAPG, 23.0% GRPO). 
Figure 3 shows smaller accuracy drops after task switches.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "The dual variable λ converges to stable values regardless of initialization and maintains stable cloud usage around the target τ.", 379 "evidence": "Figure 4 shows λ trajectories from different initializations (0.1, 0.3, 0.5, 0.7, 1.0) all converging to similar values, with collaboration ratios stabilizing around τ = 0.3 for both Qwen2.5 and Llama-3.2.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DA-GRPO adaptively tracks time-varying collaboration targets without retraining or manual hyperparameter tuning.", 384 "evidence": "Figure 5 and Figure 10 show that when τ changes across phases, λ adjusts correspondingly and the empirical collaboration ratio tracks the new target. Four different time-varying τ schedules are demonstrated in Appendix D.3.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Smaller local models (1.5B, 3B) suffer significantly more catastrophic forgetting than larger models (7B) under continual fine-tuning.", 389 "evidence": "Figure 2 shows sequential fine-tuning results on MATH-500: 1.5B and 3B models show sharp performance drops after task switch, while 7B retains most capability. Only one setup tested (math → code switching).", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "The improvements are driven by improved problem allocation rather than enhanced local SLM capacity.", 394 "evidence": "Sec 4.1 states 'We believe that these improvements are driven by improved problem allocation' but provides no direct evidence separating problem allocation from other potential explanations.", 395 "supported": "weak" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No error bars or uncertainty quantification", 401 "detail": "All results in Tables 1 and 2 are single-run point estimates across 2 model architectures and multiple benchmarks. 
Without variance estimates, it is impossible to assess whether the reported improvements are within noise." 402 }, 403 { 404 "flag": "Self-comparison bias", 405 "detail": "The authors implement all baselines (GRPO, GVPO, GAPG, routers) themselves. Per Lucic et al. (2018), authors' implementations of baselines systematically underperform. This is not acknowledged or mitigated." 406 }, 407 { 408 "flag": "No contamination analysis on pre-trained models", 409 "detail": "MATH (2021), ARC (2018), MMLU (2020) predate Qwen2.5 and Llama-3.2 training. The pre-trained models may have memorized benchmark answers, inflating reported accuracy for all methods. This confound is not discussed." 410 }, 411 { 412 "flag": "Cloud model version unspecified", 413 "detail": "DeepSeek-R1 comes in multiple sizes (1.5B to 671B) but the paper does not specify which variant is used as the cloud model. The cloud model's capability directly affects all collaboration results." 414 }, 415 { 416 "flag": "Training and evaluation on same benchmarks without explicit splits", 417 "detail": "The paper trains RL post-training on MATH-lighteval, TACO-verified, etc. and evaluates on the same benchmarks without describing held-out test splits, raising overfitting concerns." 418 }, 419 { 420 "flag": "No limitations section", 421 "detail": "Despite specific experimental scope (2 models, 4 benchmarks, no real edge device testing), the paper has no limitations section discussing the boundaries of its findings." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 427 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 428 "year": 2023, 429 "arxiv_id": "2305.05176", 430 "relevance": "Foundational work on cost-efficient LLM usage through cascading and routing, directly relevant to local-cloud collaboration." 431 }, 432 { 433 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 434 "authors": ["D. Ding", "A. 
Mallick", "C. Wang", "R. Sim", "S. Mukherjee"], 435 "year": 2024, 436 "arxiv_id": "2404.14618", 437 "relevance": "LLM routing approach for quality-cost tradeoffs, a key baseline category for local-cloud collaboration methods." 438 }, 439 { 440 "title": "Collaborative Device-Cloud LLM Inference through Reinforcement Learning", 441 "authors": ["W. Fang", "D.-J. Han", "L. Yuan", "C. Brinton"], 442 "year": 2025, 443 "arxiv_id": "2509.24050", 444 "relevance": "Direct predecessor integrating cloud-offloading decisions into local SLM training via RL, the GAPG baseline." 445 }, 446 { 447 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 448 "authors": ["D. Guo", "D. Yang", "H. Zhang"], 449 "year": 2025, 450 "arxiv_id": "2501.12948", 451 "relevance": "Popularized GRPO (originally introduced in DeepSeekMath, 2024) for RL-based LLM post-training, the foundational method extended by DA-GRPO." 452 }, 453 { 454 "title": "RouteLLM: Learning to Route LLMs with Preference Data", 455 "authors": ["I. Ong", "A. Almahairi", "V. Wu"], 456 "year": 2024, 457 "arxiv_id": "2406.18665", 458 "relevance": "LLM routing with learned preferences, representative of external router approaches for model selection." 459 }, 460 { 461 "title": "Small Language Models Are the Future of Agentic AI", 462 "authors": ["P. Belcak", "G. Heinrich", "S. Diao"], 463 "year": 2025, 464 "arxiv_id": "2506.02153", 465 "relevance": "Argues for SLMs in agentic settings under resource constraints, supporting the motivation for local-cloud collaboration." 466 }, 467 { 468 "title": "GVPO: Group Variance Policy Optimization for Large Language Model Post-Training", 469 "authors": ["K. Zhang", "Y. Hong", "J. Bao"], 470 "year": 2025, 471 "arxiv_id": "2504.19599", 472 "relevance": "Variance-reduced GRPO variant for LLM post-training, a key baseline method compared against DA-GRPO." 473 }, 474 { 475 "title": "MobileLLM: Optimizing Sub-Billion Parameter Language Models for On-Device Use Cases", 476 "authors": ["Z. Liu", "C. 
Zhao", "F. Iandola"], 477 "year": 2024, 478 "relevance": "On-device LLM optimization addressing the same deployment constraints motivating local-cloud collaboration." 479 }, 480 { 481 "title": "On-Device Language Models: A Comprehensive Review", 482 "authors": ["J. Xu", "Z. Li", "W. Chen"], 483 "year": 2024, 484 "arxiv_id": "2409.00088", 485 "relevance": "Comprehensive survey of on-device LLM deployment, providing context for the local model constraints addressed in this work." 486 }, 487 { 488 "title": "Teaching Small Language Models to Reason", 489 "authors": ["L. C. Magister", "J. Mallinson", "J. Adamek"], 490 "year": 2023, 491 "relevance": "Knowledge distillation from large to small language models, an alternative to the cloud-offloading approach for improving SLM capabilities." 492 }, 493 { 494 "title": "Constrained Policy Optimization", 495 "authors": ["J. Achiam", "D. Held", "A. Tamar", "P. Abbeel"], 496 "year": 2017, 497 "relevance": "Classical constrained RL method that provides the theoretical foundation for constraint-aware policy optimization extended in this work." 498 } 499 ] 500 }