scan.json (25991B)
1 { 2 "paper": { 3 "title": "Poodle: Seamlessly Scaling Down Large Language Models with Just-in-Time Model Replacement", 4 "authors": [ 5 "Nils Strassenburg", 6 "Boris Glavic", 7 "Tilmann Rabl" 8 ], 9 "year": 2025, 10 "venue": "arXiv.org", 11 "arxiv_id": "2512.05525", 12 "doi": "10.48550/arXiv.2512.05525" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval", "theoretical"], 17 "key_findings": "Poodle presents a just-in-time model replacement (JITR) vision where recurring LLM tasks are automatically detected and replaced with fine-tuned surrogate models. On IMDB sentiment classification, replacing Llama 405B Turbo with a fine-tuned BERT model achieves 82× cost reduction for 1M requests. Compared with Llama-2-7B, the BERT surrogate delivers 19.6× higher throughput and competitive accuracy (0.90 vs 0.937). Model search followed by fine-tuning on 500 samples completes 4× faster than the baseline of fine-tuning BERT on 5,000 samples while reaching higher accuracy (0.91 vs 0.89).", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No repository URL or code archive is provided. Poodle is described as a prototype but no source code is released." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The experiments use the publicly available IMDB movie reviews dataset [18] (Maas et al., 2011)." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment specifications, dependency lists, or setup instructions are provided. The paper mentions using an NVIDIA RTX A5000 GPU but no software environment details." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included. The experimental setup is described in prose but lacks sufficient detail for exact replication." 
39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "No confidence intervals or error bars are reported. Table 2 shows single accuracy values and Figures 7-8 show point estimates without uncertainty." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are used. Claims like 'model search followed by fine-tuning outperforms other approaches' (Section 4.4) are based solely on comparing raw numbers." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports effect sizes with baseline context: '60× cheaper' and '82× cheaper' for cost (Section 4.1), '19.6× more items per second' for throughput (Section 4.2), and accuracy differences with baselines (e.g., BERT 0.90 vs LLM 0.937 in Table 2)." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification for the sample sizes used. The choice of 10,000 IMDB items, 500/5,000 training samples, and 10 models for search is not explained." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance, standard deviation, or spread measures are reported across any experiment. All results are single-run point estimates." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Section 4.4 compares four approaches: baseline (BERT fine-tuned on 5,000 items), S-naive (fine-tune all 10 models), S-500 (model search + 500 samples), S-5000 (model search + 5,000 samples). Cost comparisons use three LLM baselines (Table 1)." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "The main model used for accuracy/inference experiments is Llama-2-7B (2023), which is outdated by 2025 standards. 
The cost analysis uses GPT-4.1 and Llama 405B Turbo pricing (contemporary), but the actual experimental model is old. BERT (2018) as surrogate is appropriate for the task but the LLM baseline is weak." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 4.4 systematically varies components: with/without model search, and different training set sizes (500 vs 5,000), showing the contribution of each component (Figure 8)." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper evaluates across three metrics: monetary cost (Section 4.1), inference time/throughput (Section 4.2), and accuracy (Section 4.3)." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "Human evaluation is not relevant to the claims, which are about automated cost, latency, and accuracy metrics on a classification task." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section 4.3 states '10,000 random items from the IMDB dataset with a 50/50 training-test split,' clearly separating training and test data." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by LLM model (GPT-4.1 nano, GPT-4.1, Llama 405B Turbo in Figure 7), by training data size (500-5,000 in Table 2), by label source (ground truth vs LLM-generated), and by approach (Figure 8)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No failure cases are discussed. The paper does not show where JITR breaks down, what types of tasks it fails on, or scenarios where the surrogate model performs poorly." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": false, 112 "justification": "No negative results are reported. Every experiment shows improvement from JITR. No approaches that were tried and failed are mentioned." 
113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims 'significant savings for exemplary tasks,' which is supported by the cost (82×), time (19.6×), and accuracy results. The hedging ('exemplary tasks') appropriately bounds the claim." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Section 4.4 claims model search 'outperforms' other approaches. The comparison is adequately controlled: same dataset, same task, same evaluation, single-variable manipulation (with/without model search, different data sizes)." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Seamlessly Scaling Down Large Language Models' broadly, but all experiments use a single task (IMDB sentiment classification). The abstract uses 'exemplary tasks' (plural) when only one task was tested. The vision in Section 2 discusses general recurring tasks far beyond what was demonstrated." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations are discussed. For example, IMDB sentiment is known to be a solved task for fine-tuned models — the paper does not consider whether results would hold for harder tasks. The LLM contamination on IMDB is mentioned but not analyzed as an alternative explanation for the accuracy comparison." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures accuracy on IMDB sentiment classification and frames it as evidence that JITR works for general 'recurring tasks.' The gap between this single-task proxy and the broader claim of general JITR effectiveness is not acknowledged." 
140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The accuracy experiment specifies 'NousResearch/Llama-2-7b-chat-hf' [20] and 'bert-base-uncased' [13], but cost experiments reference marketing names ('GPT-4.1', 'GPT-4.1-nano', 'Llama 405B Turbo') without API versions or snapshot dates." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Full prompt text is provided: Figure 3 shows the base sentiment classification prompt, Figure 4 shows the wrapper prompt, and Figure 5 shows the expected response format." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "Table 2 reports epochs (2-9) and Section 4.2 mentions batch sizes (16 for Llama, 128 for BERT), but learning rate, optimizer, temperature, and other critical parameters are not reported. Section 4.3 notes 'we do not tune hyperparameters' but does not state the defaults used." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. Poodle is a pipeline system (data collector → model search → fine-tuning → monitoring), not an agentic scaffold." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper states '10,000 random items from the IMDB dataset' and mentions BERT uses 256 tokens vs LLM 1024 tokens, but does not document tokenization, truncation strategy, or other preprocessing steps." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations section exists. Section 5 ('Summary and Discussion') discusses future work directions and challenges but does not explicitly address limitations of the current evaluation." 
174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No specific threats to validity are discussed. The paper does not address concerns like single-dataset evaluation, contamination effects on accuracy comparisons, or generalizability limitations." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit scope boundaries are stated. The paper does not say what the results do NOT show. The word 'preliminary' appears once, and 'exemplary' in the abstract, but there is no explicit scoping of claims." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "The IMDB dataset is publicly available, but the experimental results (model search rankings, fine-tuning curves, cost calculations) are not released for independent verification." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.3 describes '10,000 random items from the IMDB dataset with a 50/50 training-test split.' Section 4.4 describes selecting 10 models from HuggingFace with specific categories (base BERT, 3 sentiment, 6 non-sentiment). Data source and selection process are clear." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. Data source is the standard IMDB benchmark dataset." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "The general JITR pipeline is described architecturally (Figure 2, Section 3), but the specific experimental data pipeline — how IMDB items were selected, how LLM labels were generated, how models were evaluated — lacks step-by-step documentation." 
206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section is present." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: Hasso Plattner Institute (Uni Potsdam) and University of Illinois Chicago. Academic affiliations with no apparent conflict with evaluated products." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is not the same as absence of funding." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper uses Llama-2-7B for accuracy experiments but does not state its training data cutoff date." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": true, 239 "justification": "Section 4.3 explicitly notes 'the LLM might have seen the IMDB data before' as a caveat when interpreting the accuracy comparison between the LLM and BERT surrogate." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Section 4.3 acknowledges contamination risk: 'the LLM might have seen the IMDB data before.' The IMDB dataset (2011) predates Llama-2's training period. The paper notes this as a factor favoring the LLM in the comparison." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 
252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section 4.1 provides detailed cost analysis with token prices (Table 1), break-even points, and per-1M-request savings ($33 for GPT-4.1 nano, $850 for GPT-4.1, $1420 for Llama 405B Turbo). Model development cost estimated at $4." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": true, 293 "justification": "Section 4.1 states '$4, which roughly equals three hours of an AWS A10G GPU instance' for surrogate model development. Section 4.2 specifies the NVIDIA RTX A5000 GPU used for inference experiments." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of random seeds or seed sensitivity analysis. All results appear to be single-run." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. Results are presented as single values without indication of repetition." 
306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 4.3 explicitly states 'we do not tune hyperparameters,' transparently reporting zero hyperparameter search budget. Section 4.4 reports the model search budget: 10 models evaluated, 500 samples used for search." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "Section 4.4 describes using Alsatian's baseline approach with 500 samples to rank models, selecting the highest-ranked model. The ground truth ranking is generated by fully fine-tuning all 10 models, and model search correctly identifies the best." 316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical tests are performed in the paper, so correction for multiple comparisons is not applicable." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors evaluate their own system (Poodle) against baselines they implemented without acknowledging author-evaluation bias. No independent evaluation or discussion of this bias." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": true, 330 "justification": "Figure 8 and Section 4.4 explicitly show development time vs. accuracy for all four approaches, allowing direct comparison of compute cost and resulting performance." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper uses IMDB sentiment classification as a proxy for 'recurring tasks' generally, but does not discuss whether this well-studied, simple benchmark is representative of the diverse recurring tasks JITR targets." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved in the model comparisons." 
341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "IMDB (2011) predates Llama-2's training data cutoff, so the benchmark may appear in its pretraining corpus. The paper mentions 'the LLM might have seen the IMDB data before' but does not formally address temporal leakage or its effect on the accuracy comparison." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup (e.g., prompt format, wrapper prompt) leaks information that affects the comparison." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of independence between training and test splits beyond stating '50/50 training-test split.' No analysis of potential overlap or structural similarity." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention methods are applied. Contamination risk is only mentioned in passing." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "JITR with Poodle achieves 60-82× cost reduction for 1M requests compared to using LLMs directly.", 369 "evidence": "Section 4.1, Figure 7: Poodle saves $850 (60×) vs GPT-4.1 and $1,420 (82×) vs Llama 405B Turbo for 1M sentiment classification requests, after a break-even point of <10,000 requests for large models.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "JITR reduces inference time by 7.5-10× compared to using a small LLM (Llama-2-7B).", 374 "evidence": "Section 4.2: BERT processes 19.6× more items per second than Llama-2-7B. 
For 1M requests, Poodle takes 7.5× less time; for 2M requests, more than 10×.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Fine-tuned BERT models achieve competitive accuracy with LLMs on sentiment classification.", 379 "evidence": "Section 4.3, Table 2: BERT fine-tuned on 5,000 LLM-generated labels reaches 0.90 accuracy vs LLM's 0.937, using only 256 tokens vs 1024 and no hyperparameter tuning.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Model search followed by fine-tuning outperforms alternative model development approaches in time, accuracy, and data efficiency.", 384 "evidence": "Section 4.4, Figure 8: S-500 (model search + 500 samples) completes 4× faster than baseline and 19× faster than naive search while achieving 0.91 accuracy (vs 0.89 baseline). Model search correctly ranks sentiment-specific models highest.", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Single-dataset evaluation", 391 "detail": "All experiments use only IMDB sentiment classification — one of the simplest and most well-studied NLP tasks. No evidence that JITR works for other recurring task types (e.g., NER, summarization, translation) despite broad claims." 392 }, 393 { 394 "flag": "No error bars or variance", 395 "detail": "All experimental results are single-run point estimates with no variance reporting. The accuracy differences (e.g., 0.89 vs 0.91) may not be statistically significant." 396 }, 397 { 398 "flag": "Claims outrun evidence", 399 "detail": "The paper's vision and title ('Seamlessly Scaling Down Large Language Models') imply broad generality, but the prototype is only tested on binary sentiment classification. The gap between the vision (Section 2) and evaluation (Section 4) is substantial." 400 }, 401 { 402 "flag": "Contamination confound in accuracy comparison", 403 "detail": "The LLM (Llama-2-7B) was likely trained on IMDB data, inflating its accuracy. 
The paper acknowledges this but does not quantify the effect, making the accuracy comparison between LLM and BERT surrogate difficult to interpret." 404 }, 405 { 406 "flag": "Missing failure analysis", 407 "detail": "No failure cases or limitations of the approach are empirically demonstrated. The paper does not show tasks where JITR fails or where surrogate models underperform, despite this being a prototype evaluation." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 413 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 414 "year": 2023, 415 "arxiv_id": "2305.05176", 416 "relevance": "Directly relevant work on LLM cost reduction through model cascading, addressing the same problem of reducing inference cost." 417 }, 418 { 419 "title": "Distilling the Knowledge in a Neural Network", 420 "authors": ["Geoffrey Hinton", "Oriol Vinyals", "Jeff Dean"], 421 "year": 2015, 422 "arxiv_id": "1503.02531", 423 "relevance": "Foundational knowledge distillation technique used by Poodle to transfer LLM knowledge into smaller surrogate models." 424 }, 425 { 426 "title": "HuggingGPT: Solving AI Tasks with ChatGPT and Its Friends in Hugging Face", 427 "authors": ["Yongliang Shen", "Kaitao Song", "Xu Tan", "Dongsheng Li", "Weiming Lu", "Yueting Zhuang"], 428 "year": 2023, 429 "relevance": "Agentic approach to task-solving via model selection on HuggingFace, related to the model search component of JITR." 430 }, 431 { 432 "title": "Large Language Models as Tool Makers", 433 "authors": ["Tianle Cai", "Xuezhi Wang", "Tengyu Ma", "Xinyun Chen", "Denny Zhou"], 434 "year": 2024, 435 "relevance": "Alternative approach to LLM cost reduction by generating code tools for tasks, compared against JITR in the introduction." 
436 }, 437 { 438 "title": "CREATOR: Tool Creation for Disentangling Abstract and Concrete Reasoning of Large Language Models", 439 "authors": ["Cheng Qian", "Chi Han", "Yi Fung", "Yujia Qin", "Zhiyuan Liu", "Heng Ji"], 440 "year": 2023, 441 "relevance": "LLM tool creation approach for task automation, related alternative to model replacement for reducing LLM dependency." 442 }, 443 { 444 "title": "Knowledge Distillation in Automated Annotation: Supervised Text Classification with LLM-Generated Training Labels", 445 "authors": ["Nicholas Pangakis", "Samuel Wolken"], 446 "year": 2024, 447 "arxiv_id": "2406.17633", 448 "relevance": "Directly evaluates using LLM-generated labels to train smaller models, the core training approach used by JITR." 449 }, 450 { 451 "title": "Fine-tuned small LLMs (still) significantly outperform zero-shot generative AI models in text classification", 452 "authors": ["Martin Juan José Bucher", "Marco Martini"], 453 "year": 2024, 454 "arxiv_id": "2406.08660", 455 "relevance": "Evidence that fine-tuned small models outperform LLMs on text classification, supporting JITR's core premise." 456 }, 457 { 458 "title": "Alsatian: Optimizing Model Search for Deep Transfer Learning", 459 "authors": ["Nils Strassenburg", "Boris Glavic", "Tilmann Rabl"], 460 "year": 2025, 461 "doi": "10.1145/3725264", 462 "relevance": "The model search system underlying Poodle's model selection component, from the same authors." 463 }, 464 { 465 "title": "Which Model to Transfer? Finding the Needle in the Growing Haystack", 466 "authors": ["Cedric Renggli", "André Susano Pinto", "Luka Rimanic"], 467 "year": 2022, 468 "relevance": "Model search technique for transfer learning that addresses the same scaling challenge Poodle faces in finding surrogate models." 469 } 470 ] 471 }