scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24749B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Joint Continual Learning of Local Language Models and Cloud Offloading Decisions with Budget Constraints",
      6     "authors": [
      7       "Evan Chen",
      8       "Wenzhi Fang",
      9       "Shiqiang Wang",
     10       "Christopher G. Brinton"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.00166",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's claims about improved post-switch accuracy, reduced forgetting, and stable cloud usage are all directly supported by Table 1 results comparing DA-GRPO against multiple baselines across math and code tasks.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims (e.g., 'DA-GRPO reduces forgetting') are supported by controlled comparisons against multiple baselines (GRPO, GVPO, GAPG) within the same continual learning framework; 'Collaborative Training w/ GRPO' effectively isolates the dual-advantage mechanism as an ablation.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Claims are bounded to compact SLMs (1.5B, 3B parameters) under resource-constrained edge deployment; the conclusion explicitly scopes results to 'resource-limited local deployment' without overgeneralizing to larger models.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 4.1 explicitly proposes that accuracy gains 'are driven by improved problem allocation rather than enhanced local SLM capacity,' acknowledging an alternative explanation for observed improvements.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes between 'local-solved responses only' and 'local-cloud joint responses' metrics, clarifying that the local subset metric 'evaluates the quality of problem allocation' and is not a measure of intrinsic local SLM capability.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion mentions two future work directions but does not systematically enumerate methodological limitations.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No specific threats to validity are discussed; issues such as benchmark contamination, limited model and task diversity, or single-run evaluation are not acknowledged as threats.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what results do NOT show; while the setting is implicitly bounded to compact SLMs, no explicit boundary statements such as 'these findings do not apply to X' are made.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source is disclosed anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly disclosed: Purdue University (Elmore Family School of ECE) and University of Exeter (Department of Computer Science).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence of funder cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is included in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: 'catastrophic forgetting' is illustrated with experiments in Figure 2, 'cloud-assist rate' τ is formally defined, and the collaboration framework is mathematically formalized with notation in Section 3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 1.1 provides three explicit bullet-point contributions: identifying RL collaboration limitations under continual learning, proposing DA-GRPO, and conducting extensive experiments.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 1.2 engages substantively with routing-based approaches, RL-based collaboration, GRPO variants, and Lagrangian constrained optimization, positioning DA-GRPO's specific advantages over each category.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No source code is released or linked anywhere in the paper; no mention of code availability.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All datasets used (MATH-lighteval, ARC-Easy, ARC-Challenge, MMLU, TACO-Verified) are standard public benchmarks available without restriction.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements file, Dockerfile, or dependency specification is provided; only training hyperparameters in Table 3 are given, with no software environment details.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided; hyperparameters and prompts appear in appendices but there is no runnable pipeline or sequence of commands.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 1 and 2 are point estimates with no confidence intervals, error bars, or indication of run-to-run variability.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are used for any comparative claims; results are presented as raw numbers without significance assessment.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute accuracy differences and forgetting rates are reported in tabular form (e.g., DA-GRPO 67.7% vs. GAPG 59.2% post-switch on MATH-lighteval), providing meaningful effect size context.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The number of training examples used is never stated (MMLU is 'subsampled to balance its size' but no count given), and no justification for dataset sizes or training steps is provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Tables report single-run point estimates with no standard deviation or inter-run variability; it is unclear whether experiments were repeated.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Multiple baselines are included: Edge Tuning Only, Naive Router, trained Router, GRPO, GVPO, and GAPG across both model families and all tasks.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "GAPG (arXiv:2509.24050, 2025), GVPO (NeurIPS 2025), and GRPO (DeepSeek-R1, 2025) are contemporary state-of-the-art baselines in this area.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Comparing 'Collaborative Training w/ GRPO' against DA-GRPO effectively ablates the dual-advantage design; sensitivity analyses in Appendix C ablate learning rate and dual step size independently.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Three metrics are reported: during-task accuracy, post-switch accuracy, and forgetting rate, each evaluated under both local-only and joint accuracy variants.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not relevant for automated mathematical reasoning and code generation tasks evaluated against ground-truth labels.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Evaluation uses held-out benchmark test sets (MATH-lighteval, TACO-Verified, ARC, MMLU) that are distinct from training data.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by task group (math/code), model family (Qwen/Llama), model size, and evaluation type (local-only vs. joint); Appendix D.2 reports additional benchmarks (MATH-500, MMLU).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "No failure cases of DA-GRPO itself are shown; Figure 2 demonstrates motivating baseline forgetting but no systematic failure analysis of the proposed method is presented.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Appendix C shows that very small and very large learning rates and dual step sizes cause instability or stagnation, explicitly reporting conditions under which the method fails.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Model names are specific: Qwen2.5-1.5B-Instruct, Llama-3.2-3B-Instruct, and DeepSeek-R1; no checkpoint hashes or snapshot dates are given but names are sufficiently specific.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix F.1 provides the complete system prompt and task-specific user prompt templates for code, math, and QA tasks verbatim.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Table 3 provides comprehensive hyperparameters: batch size, group size, learning rates, training steps, temperature, reasoning step limits, and baseline-specific reward coefficients.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Algorithm 1 describes the full DA-GRPO training procedure including group sampling, cloud querying logic, advantage computation, and dual variable updates in pseudocode.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "TACO-Verified curation is described ('manually curate to retain only problems with a valid and executable test bench'), MMLU split selection is stated, and code evaluation uses 'local containerized environment against all provided test cases'.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "All datasets are publicly available standard benchmarks (MATH, ARC, MMLU, TACO-Verified) accessible for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The paper describes which splits were used (MMLU 'auxiliary-train', MATH-lighteval 'all available subsets') and the TACO curation and code execution evaluation process.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; standard benchmarks are used without recruitment.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline from dataset selection through continual learning training protocol to evaluation (including containerized code execution and exact-match scoring) is documented across Section 4 and Appendix F.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs for Qwen2.5-1.5B-Instruct, Llama-3.2-3B-Instruct, and DeepSeek-R1 are not stated anywhere in the paper.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Potential contamination of MATH (2021), MMLU (2020), and ARC (2018) benchmarks, all released years before model training, is not discussed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "MATH, ARC, and MMLU were publicly available well before 2024 model training cutoffs, making contamination plausible, but the paper does not address this concern.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Cloud inference cost and latency are discussed conceptually as motivation for the budget constraint but no actual cost figures, token prices, or latency measurements are reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Training steps (840 + 400) are given in Table 3, but GPU type, GPU hours, and total computational budget are not reported.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "DA-GRPO improves post-switch accuracy compared to all RL-based collaborative baselines on both math and code tasks",
    374       "evidence": "Table 1: DA-GRPO achieves 67.7% post-switch on MATH-lighteval for Qwen2.5-1.5B vs. 59.2% (GAPG), 53.7% (GVPO), 48.1% (GRPO); similar gains hold for Llama-3.2-3B and on TACO-Verified",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "DA-GRPO substantially reduces catastrophic forgetting compared to baselines",
    379       "evidence": "Table 1 shows mixed results: for joint responses DA-GRPO does best (10.7% vs. 12.3% GAPG), but for local-only responses GVPO achieves lower forgetting (9.0%) than DA-GRPO (12.3%) on MATH for Qwen 1.5B",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The dual variable λ converges to stable values regardless of initialization, enabling predictable collaboration ratios",
    384       "evidence": "Figure 4 shows λ trajectories from multiple initializations converging to similar ranges with collaboration ratio stabilizing around τ=0.3 for both Qwen2.5 and Llama-3.2",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Fixed hierarchical reward baselines require per-task reward tuning while DA-GRPO adapts automatically",
    389       "evidence": "Table 3 footnote states 'Assistance reward tuned for math ratio 0.3, coding ratio 0.5, and QA ratio 0.2, only used in fixed hierarchical reward methods'; DA-GRPO's dual variable adapts without this tuning",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "DA-GRPO adapts to time-varying collaboration targets τ without retraining",
    394       "evidence": "Figure 10 and Appendix D.3 show λ tracking four different τ schedules, including non-monotone patterns (0.1→0.7→0.3→0.5), rapidly transitioning to new targets",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Higher cloud collaboration ratios monotonically improve local model accuracy on math tasks",
    399       "evidence": "Figure 9 shows increasing local accuracy as τ increases from 0.3 to 0.7 on MATH-lighteval for Qwen2.5-1.5B under DA-GRPO",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "theoretical"
    406   ],
    407   "key_findings": "DA-GRPO, a constrained RL framework embedding cloud-usage budgets directly into group-relative advantage computation, consistently outperforms GRPO, GVPO, and GAPG on post-switch accuracy and (in most conditions) forgetting rate across mathematical reasoning and code generation tasks in a continual learning setting. The dual variable λ adaptively regulates cloud usage without per-task reward tuning and converges to stable values from diverse initializations. The paper argues that accuracy gains stem primarily from improved problem allocation (routing hard queries to cloud) rather than enhanced local model capacity, supported by the dual metric design distinguishing local-only from joint responses.",
    408   "red_flags": [
    409     {
    410       "flag": "No statistical significance testing or variance across runs",
    411       "detail": "All comparative results in Tables 1 and 2 are single-run point estimates with no confidence intervals, standard deviations, or significance tests; it is unclear whether results are reproducible across seeds."
    412     },
    413     {
    414       "flag": "Forgetting reduction claim overstated",
    415       "detail": "For local-solved responses on MATH-lighteval (Qwen 1.5B), GVPO achieves 9.0% forgetting vs. DA-GRPO's 12.3%, and GAPG beats DA-GRPO on forgetting for Llama-3.2-3B local-solved responses (8.3% vs. 10.1%), undermining the 'substantially reduces forgetting' claim."
    416     },
    417     {
    418       "flag": "No code released",
    419       "detail": "No source code or reproduction artifacts are provided, making independent verification of results impossible."
    420     },
    421     {
    422       "flag": "Benchmark contamination unaddressed",
    423       "detail": "MATH (2021), ARC (2018), and MMLU (2020) were released years before Qwen2.5, Llama-3.2, and DeepSeek-R1 were trained; potential pretraining contamination of test sets is not discussed."
    424     },
    425     {
    426       "flag": "No limitations section",
    427       "detail": "No dedicated limitations or threats-to-validity section exists; the paper does not enumerate failure conditions or scope constraints for the proposed method."
    428     },
    429     {
    430       "flag": "No funding disclosure",
    431       "detail": "No funding source is disclosed for this research, contrary to standard scientific practice."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    437       "relevance": "Foundation for GRPO baseline and serves as the cloud LLM in all experiments"
    438     },
    439     {
    440       "title": "Collaborative Device-Cloud LLM Inference through Reinforcement Learning (GAPG)",
    441       "relevance": "Primary prior work on RL-based local-cloud collaboration that DA-GRPO directly extends and improves upon"
    442     },
    443     {
    444       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    445       "relevance": "Representative routing-based baseline for selective LLM offloading"
    446     },
    447     {
    448       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    449       "relevance": "Representative work on LLM cost reduction through selective cloud offloading"
    450     },
    451     {
    452       "title": "Constrained Policy Optimization",
    453       "relevance": "Foundational constrained RL method that motivates DA-GRPO's constraint-based formulation"
    454     },
    455     {
    456       "title": "GVPO: Group Variance Policy Optimization for Large Language Model Post-Training",
    457       "relevance": "Contemporary baseline policy optimization method compared directly in experiments"
    458     },
    459     {
    460       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    461       "relevance": "Benchmark dataset used for general knowledge evaluation"
    462     },
    463     {
    464       "title": "Measuring Mathematical Problem Solving with the MATH Dataset",
    465       "relevance": "Core benchmark dataset (MATH-lighteval, MATH-500) used for math reasoning evaluation throughout"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 3,
    471       "justification": "Directly addresses real edge deployment constraints: monetary cloud cost, memory limits, and continual task adaptation without full retraining."
    472     },
    473     "surprise_contrarian": {
    474       "score": 1,
    475       "justification": "The core insight that constraint-level advantage shaping outperforms reward-level tuning is technically interesting but expected within constrained RL literature."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No safety or AI risk concerns raised; focus is on efficiency and resource optimization for edge devices."
    480     },
    481     "drama_conflict": {
    482       "score": 0,
    483       "justification": "No controversial claims or conflict with established community consensus."
    484     },
    485     "demo_ability": {
    486       "score": 1,
    487       "justification": "No code released; practitioners cannot try this without reimplementing from scratch, though the algorithm is described clearly enough to attempt."
    488     },
    489     "brand_recognition": {
    490       "score": 1,
    491       "justification": "Purdue University is a credible institution but not a top-tier AI lab; no famous product or widely-known dataset is associated with this work."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [],
    496     "top_points": 0,
    497     "total_points": 0,
    498     "total_comments": 0
    499   }
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs