scan.json (31087B)
1 { 2 "paper": { 3 "title": "Model Cascading for Code: A Cascaded Black-Box Multi-Model Framework for Cost-Efficient Code Completion with Self-Testing", 4 "authors": [ 5 "Boyuan Chen", 6 "Mingzhi Zhu", 7 "Brendan Dolan-Gavitt", 8 "Muhammad Shafique", 9 "Siddharth Garg" 10 ], 11 "year": 2024, 12 "venue": "arXiv preprint", 13 "arxiv_id": "2405.15842", 14 "doi": "10.48550/arXiv.2405.15842" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "A cascaded multi-model framework with self-testing reduces LLM code generation inference costs by an average of 26% (up to 70%) while maintaining or improving pass@1 accuracy compared to single-model self-testing. The approach uses self-generated test cases to decide when to escalate queries from smaller to larger models, with a learned threshold parameter. Over 90% of questions solved by the largest model in a family can also be solved by smaller models, validating the cascading premise. The optimal cascading threshold is typically high (0.9–1.0), and models below 10% accuracy should be excluded from the cascade.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The approach is described algorithmically but no implementation is released." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses three publicly available benchmarks: HumanEval, MBPP-sanitized, and APPS-test (introductory subset). All are standard public datasets referenced with citations." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": true, 35 "justification": "Section 4.3 specifies NVIDIA GeForce RTX 3090 GPUs, CUDA Toolkit 11.8, PyTorch-based HuggingFace Transformers v4.31.0, and the Accelerate library. 
This is detailed enough to recreate the environment." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While the methodology is described in detail, there are no executable instructions for replicating the experiments." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1, 3, and figures are reported as point estimates. No confidence intervals, error bars, or ± notation are provided for accuracy or cost savings figures." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims cascading 'saves 70% of cost' and 'maintains or improves accuracy' based solely on comparing point estimates. No statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Cost savings are reported with baseline context: e.g., Table 3 shows 17.4% savings on HumanEval for WizardCoder-Python, and figures show absolute accuracy and cost values for both cascading and single-model approaches, enabling the reader to assess magnitude." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The 30% validation / 70% test split is stated but not justified: no power analysis or explanation is given for why this ratio was chosen. HumanEval, MBPP-sanitized, and APPS-Intro have 164, 427, and 1000 questions respectively, but there is no discussion of whether these sizes are adequate for the claims made." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Table 2 notes 'Time is averaged across 10 runs' for cost statistics, but accuracy results appear to be single-run. 
No standard deviation, IQR, or spread measures are reported for the main accuracy or cost savings results." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper compares against two baselines: randomly selected single-model self-testing plans (Section 5.1) and Pareto-optimal single-model self-testing plans from the validation set (Section 5.2, purple crosses in Figure 7)." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": false, 79 "justification": "The model families used (Codegen-mono, WizardCoder-V1.0, WizardCoder-Python-V1.0) are from 2022–2023. Newer model families like DeepSeek-Coder, Qwen-Coder, and Code Llama are cited in Section 2 but not evaluated. No comparison with other cost-reduction methods like FrugalGPT (also cited)." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 5.3 ablates the threshold parameter θ (Figure 6). Section 5.2 compares cascading vs. single-model Pareto solutions. The parameter space exploration across k, l, and θ effectively ablates each component's contribution." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Two primary metrics are used throughout: pass@1 accuracy and inference cost ($/1k queries or $/1M tokens). Both are reported for all experiments." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated via pass/fail on test suites. No human evaluation of code quality, readability, or correctness is conducted." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 3.2 describes a 30% validation / 70% test split. Parameters (Pareto-optimal combinations) are selected on the validation set and evaluated on the held-out test set. Section 4.2 confirms 'we randomly sampled 30% of the questions for the validation set... 
and evaluated... on the remaining 70%.'" 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 1 provides per-model, per-dataset accuracy. Table 3 shows cost savings broken down by model family and dataset. Figure 7 shows detailed results for each model family–dataset combination." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 5.1 discusses the failure case of WizardCoder-V1.0 on APPS-Intro (-1.6% cost savings), explaining that the 1B model has too low accuracy, 'generates incorrect answers and tests, thereby increasing costs on wasted computations.' They recommend excluding models below 10% accuracy." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table 3 reports -1.6% cost savings for WizardCoder-V1.0 on APPS-Intro, meaning the cascading scheme actually performed worse. This is discussed honestly in Section 5.1." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims 'reduces costs by an average of 26%, and up to 70%.' Table 3 shows individual savings ranging from -1.6% to 70.0%. Averaging the reported values yields approximately 24–26%. The 'up to 70%' claim is supported by the Codegen family on HumanEval. The claim about 'maintaining or improving accuracy' is supported by Figure 7." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper claims cascading 'reduces costs' — a causal claim. The experimental design (controlled comparison of cascading vs. single-model with the same models and datasets, Pareto-optimal selection on validation set) provides adequate support. The ablation on θ (Section 5.3) shows the mechanism." 
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims applicability to 'Code' generally, and the abstract says 'natural language generation tasks.' But all experiments are on Python-only benchmarks (HumanEval, MBPP, APPS) with three specific model families. The conclusion states 'the method is language-agnostic and can be applied to other programming languages' without evidence." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No substantive discussion of alternative explanations. The cost savings could be specific to the model families chosen (where larger models have large overlap with smaller models). The paper doesn't discuss whether the results hold for model families with different overlap characteristics or different architecture relationships." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures pass@1 accuracy and inference cost, and frames its claims in terms of these same quantities. It does not overclaim that pass@1 represents general 'code quality' or 'developer productivity.' The claims match the granularity of the measurements." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Exact model names and sizes are specified: WizardCoder-Python-V1.0 (7B, 13B, 34B), WizardCoder-V1.0 (1B, 3B, 15B), Codegen-mono (350M, 2B, 6B, 16B). These are specific open-source model identifiers sufficient to download the exact models." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper describes the prompting approach (generate solutions, generate test cases with assert statements) but does not provide the actual prompt text used. Figure 4 shows an example problem but not the prompt template sent to the models." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Key hyperparameters are reported: temperature 0.8 for sampling, temperature 0 for greedy search (Section 3.2), maximum 1024 tokens per prompt (Section 4.3), k ∈ {-1, 0, 1, 3, 5, 10}, l ∈ {0, 2, 4}, θ ∈ {0.0, 0.1, ..., 1.0}." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "The paper does not use agentic scaffolding. The cascading pipeline is a sequential multi-model inference framework, not an agent with tool use, memory, or feedback loops." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.2 describes the datasets, the restriction to APPS introductory-level questions, the 30/70 validation/test split procedure, and the check that validation/test accuracy differences are within 5%. The filtering criteria for APPS (models below 10% accuracy excluded) are stated." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "No dedicated limitations or threats-to-validity section. The conclusion (Section 6) mentions future work directions (per-model θ, speculative decoding) but does not discuss limitations of the current approach." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No threats to validity are discussed. The paper does not consider threats such as the representativeness of the 30% validation split, sensitivity to the specific model families, or whether the cost model generalizes beyond RTX 3090 setups." 
181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The conclusion states 'our experiments focused on Python code generation tasks' but then claims 'the method is language-agnostic and can be applied to other programming languages' — extending rather than bounding scope. No explicit statement of what the results do NOT show." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw experimental data (model outputs, test results, per-question cascading decisions) is released. Only aggregated results in tables and figures are provided." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "The data collection is well-described: standard benchmarks (HumanEval 164 questions, MBPP-sanitized 427 questions, APPS-Intro 1000 questions), model inference setup (Section 4.3), and cost measurement procedure (Section 3.4 with batching details)." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data sources are standard public benchmarks." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: prompts go to models → solutions and tests generated → pairwise testing → scoring via ns × nt → threshold comparison → cascade or accept (Section 3.1, Figures 3–4). The validation/test split process is also described." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding sources are mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 
215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: NYU Tandon School of Engineering and NYU Abu Dhabi eBRAIN Lab. The authors evaluate open-source models, not products from their own institutions." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Cannot be assessed because no funding source is disclosed. Absence of a funding statement prevents evaluation of funder independence." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the three model families (Codegen-mono, WizardCoder-V1.0, WizardCoder-Python-V1.0). The training datasets are named (THEPILE, BIGQUERY, BIGPYTHON, Code Alpaca) but no temporal boundaries are given." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether HumanEval, MBPP, or APPS questions appeared in the training data of any model. The models were released after these benchmarks were published, making contamination plausible." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "HumanEval (2021), MBPP (2021), and APPS (2021) were all published before the models were trained. No contamination analysis or discussion is provided despite the clear temporal risk." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All evaluation is automated on code benchmarks." 
254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Cost is a primary focus. Table 1 shows $/1M tokens for all models. Table 2 details time and cost per 1M tokens with GPU counts and batch sizes. Section 3.4 explains the cost calculation methodology using RTX 3090 rental rates ($0.44/hr)." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Section 3.4 and Table 2 detail the GPU configuration: number of RTX 3090 GPUs per model, inference time in hours, batch sizes. The rental cost basis is specified ($0.44/hr per GPU from RunPod)." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No analysis of sensitivity to random seeds. The validation/test split is random, and sampling uses temperature 0.8, but no multi-seed results are reported for accuracy." 
303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "Table 2 states 'Time is averaged across 10 runs' for cost measurements, but the main accuracy results do not state how many runs produced them. The critical pass@1 results appear to be single-run." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": true, 312 "justification": "The full parameter search space is explicitly defined: k ∈ {-1, 0, 1, 3, 5, 10}, l ∈ {0, 2, 4}, θ ∈ {0.0, 0.1, ..., 1.0} for each model in the family. All valid combinations are enumerated and evaluated on the validation set (Section 3.2)." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Configurations are selected via Pareto optimality on the validation set (Section 3.2, Figure 5). The selection criterion is clearly described: points where no other combination achieves both higher accuracy and lower cost." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical hypothesis tests are performed in this paper. Comparisons are made via point estimates without formal testing, so correction for multiple comparisons is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors implement both the cascading system and the single-model baselines. No discussion of potential bias from self-implementation of baselines. No independent evaluation or acknowledgment of this limitation." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "This is the core contribution. All results (Figures 1, 7) plot accuracy as a function of cost. The entire framework is designed to find Pareto-optimal cost–accuracy tradeoffs." 
333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "No discussion of whether HumanEval, MBPP, and APPS adequately measure 'code completion' ability as claimed. The paper treats these benchmarks as ground truth without questioning construct validity." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "The cascading framework is applied uniformly across all model comparisons within each family. Single-model baselines use the same self-testing scoring mechanism (CodeT). The comparison isolates the cascading strategy rather than confounding it with different scaffolds." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "HumanEval, MBPP, and APPS were all published in 2021, before the evaluated models were trained. No discussion of whether model training data includes benchmark solutions." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of feature leakage. The self-testing scheme generates tests from the model itself, which could introduce information feedback, but this is not analyzed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether train and test data overlap or share structural similarities. Models within each family share training data, but the implications for independence are not discussed." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines." 
365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "The cascading approach reduces inference costs by an average of 26%, and up to 70%, while maintaining or improving accuracy compared to single-model self-testing.", 371 "evidence": "Table 3 shows per-family, per-dataset cost savings: 17.4% to 70.0% (with one negative case at -1.6%). Figure 1 and Figure 7 show Pareto-optimal cascading solutions consistently at equal or lower cost than single-model solutions of comparable accuracy on cost-accuracy plots.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "Over 90% of questions solved by the largest model in a family can also be solved by smaller models.", 376 "evidence": "Figure 2 Venn diagram for WizardCoder-Python-V1.0: of 131 questions solvable by 34B, 119 (91%) are also solvable by 7B or 13B. Section 1 states 'over 90% of the questions solved by the 34B model can also be solved by one of the smaller models.'", 377 "supported": "strong" 378 }, 379 { 380 "claim": "The optimal cascading threshold θ is typically high (0.9 or 1.0), reflecting the high accuracy of self-generated tests.", 381 "evidence": "Section 5.3 reports optimal θ values are 'typically high, either 1.0 or 0.9.' Figure 6 shows highest accuracy at θ = 0.8 for one configuration. Less capable families exhibit lower optimal θ on MBPP.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Models below 10% accuracy should be excluded from cascading, as they waste computation on incorrect answers and tests.", 386 "evidence": "Section 5.1 discusses WizardCoder-V1.0 on APPS-Intro where the 1B model (3.9% accuracy) 'generates incorrect answers and tests, thereby increasing costs on wasted computations.' Table 3 shows -1.6% savings for this case.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "The approach is fully black-box and compatible with any model family.", 391 "evidence": "Section 1 states the method 'requires no access to internal model parameters.' 
Experiments demonstrate it across three different model families (Codegen-mono, WizardCoder-V1.0, WizardCoder-Python-V1.0).", 392 "supported": "moderate" 393 } 394 ], 395 "red_flags": [ 396 { 397 "flag": "Outdated model families", 398 "detail": "All three model families (Codegen-mono, WizardCoder-V1.0, WizardCoder-Python-V1.0) are from 2022–2023 and significantly behind the state of the art. Newer code LLMs like DeepSeek-Coder, Qwen-Coder, and Code Llama are cited but not evaluated. It is unclear whether the high overlap between model sizes (a key premise) holds for newer families." 399 }, 400 { 401 "flag": "No statistical testing on main claims", 402 "detail": "All cost savings percentages (e.g., '26% average', '70% best case') are computed from point estimates without any significance tests, confidence intervals, or variance measures. The random validation/test split introduces variance that is not quantified." 403 }, 404 { 405 "flag": "No contamination analysis", 406 "detail": "HumanEval, MBPP, and APPS were all published in 2021, before the model families were trained. The paper makes no effort to assess whether benchmark solutions appear in training data, which could affect both absolute accuracy numbers and the cascading threshold." 407 }, 408 { 409 "flag": "Cost model assumptions may not generalize", 410 "detail": "The cost model is based on RTX 3090 GPU rental rates from RunPod (2023 pricing), batch inference, and excludes input prompt costs. Real deployment scenarios with different hardware, pricing, and serving configurations could yield different cost-accuracy tradeoffs." 411 }, 412 { 413 "flag": "No code released", 414 "detail": "Despite describing a detailed algorithmic framework, no source code is released, making it difficult to verify the results or apply the approach to new model families." 
415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "Deepseek-coder: When the large language model meets programming – the rise of code intelligence", 420 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang", "Zhenda Xie"], 421 "year": 2024, 422 "arxiv_id": "2401.14196", 423 "relevance": "State-of-the-art code generation LLM family relevant to benchmarking and model cascading approaches." 424 }, 425 { 426 "title": "Code llama: Open foundation models for code", 427 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 428 "year": 2024, 429 "arxiv_id": "2308.12950", 430 "relevance": "Major open-source code LLM family with multiple model sizes, directly applicable to cascading approaches." 431 }, 432 { 433 "title": "CodeT: Code generation with generated tests", 434 "authors": ["Bei Chen", "Fengji Zhang", "Anh Nguyen", "Daoguang Zan", "Zeqi Lin"], 435 "year": 2023, 436 "arxiv_id": "2207.10397", 437 "relevance": "Foundational work on self-testing for code generation that this paper builds upon for cascading threshold decisions." 438 }, 439 { 440 "title": "Self-refine: Iterative refinement with self-feedback", 441 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 442 "year": 2023, 443 "relevance": "Key reference for LLM self-improvement through self-generated feedback, motivating the self-testing approach." 444 }, 445 { 446 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 447 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 448 "year": 2023, 449 "arxiv_id": "2305.05176", 450 "relevance": "Prior work on LLM model cascading for natural language tasks; closest related work to this paper's approach." 
451 }, 452 { 453 "title": "Large language model cascades with mixture of thought representations for cost-efficient reasoning", 454 "authors": ["Murong Yue", "Jie Zhao", "Min Zhang", "Liang Du", "Ziyu Yao"], 455 "year": 2024, 456 "relevance": "Related work on LLM cascading for cost reduction in reasoning tasks." 457 }, 458 { 459 "title": "WizardCoder: Empowering code large language models with evol-instruct", 460 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 461 "year": 2023, 462 "arxiv_id": "2306.08568", 463 "relevance": "Primary model family used in the experiments; two WizardCoder families are evaluated." 464 }, 465 { 466 "title": "Evaluating large language models trained on code", 467 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 468 "year": 2021, 469 "arxiv_id": "2107.03374", 470 "relevance": "Introduces HumanEval benchmark used as a primary evaluation dataset in this paper." 471 }, 472 { 473 "title": "Program synthesis with large language models", 474 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 475 "year": 2021, 476 "arxiv_id": "2108.07732", 477 "relevance": "Introduces MBPP benchmark used as a primary evaluation dataset in this paper." 478 }, 479 { 480 "title": "Measuring coding challenge competence with APPS", 481 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 482 "year": 2021, 483 "relevance": "Introduces APPS benchmark used as an evaluation dataset in this paper." 484 }, 485 { 486 "title": "A survey on model compression for large language models", 487 "authors": ["Xunyu Zhu", "Jian Li", "Yong Liu", "Can Ma", "Weiping Wang"], 488 "year": 2024, 489 "doi": "10.1162/tacl_a_00704", 490 "relevance": "Surveys alternative approaches to reducing LLM inference costs (quantization, pruning, distillation) that this paper positions against." 
491 }, 492 { 493 "title": "Fast inference from transformers via speculative decoding", 494 "authors": ["Yaniv Leviathan", "Matan Kalman", "Yossi Matias"], 495 "year": 2023, 496 "relevance": "Alternative inference acceleration technique discussed as complementary to model cascading." 497 }, 498 { 499 "title": "Qwen2.5-coder technical report", 500 "authors": ["Binyuan Hui", "Jian Yang", "Zeyu Cui"], 501 "year": 2024, 502 "arxiv_id": "2409.12186", 503 "relevance": "Recent code LLM family that could be used for cascading but was not evaluated." 504 } 505 ], 506 "engagement_factors": { 507 "practical_relevance": { 508 "score": 2, 509 "justification": "Practitioners running multi-model code completion servers could adopt the cascading approach, but the framework requires validation-set optimization and multiple models deployed simultaneously." 510 }, 511 "surprise_contrarian": { 512 "score": 1, 513 "justification": "Cascading from small to large models is intuitive; the main contribution is formalizing the threshold-based approach with self-testing rather than challenging conventional wisdom." 514 }, 515 "fear_safety": { 516 "score": 0, 517 "justification": "No safety, security, or risk implications; purely an efficiency optimization." 518 }, 519 "drama_conflict": { 520 "score": 0, 521 "justification": "No controversy; straightforward cost optimization research." 522 }, 523 "demo_ability": { 524 "score": 0, 525 "justification": "No code released, no demo, no tool available to try." 526 }, 527 "brand_recognition": { 528 "score": 1, 529 "justification": "NYU is a recognized institution but the work does not involve high-profile products or labs." 530 } 531 } 532 }