scan-v5.json (26873B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Importance Sampling is All You Need: Predict LLM's performance on new benchmark by reusing existing benchmark", 6 "authors": [ 7 "Junjie Shi", 8 "Wei Ma", 9 "Shi Ying", 10 "Lingxiao Jiang", 11 "Yang Liu", 12 "Bo Du" 13 ], 14 "year": 2025, 15 "venue": "arXiv.org", 16 "arxiv_id": "2508.01203", 17 "doi": "10.48550/arXiv.2508.01203" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims 1.1% average absolute error for CodeBLEU and 2.15% for pass@1 are directly supported by Tables 3–4; minor inconsistency between abstract (1.1%) and introduction (0.9%) is noted but does not materially misrepresent the results.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper frames prompt distributions as causally determining LLM performance but the design is predictive/observational; the claim that BIS 'significantly reduces data contamination risks' is entirely theoretical with no empirical validation of contamination detection accuracy.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "Experiments use only the CodeLlama family (all sharing the same lineage) and two benchmark clusters with similar coding domains; the abstract and conclusion generalize broadly to 'LLMs in code-related tasks' without caveating the single-family limitation.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "No alternative explanations are offered for why IWAE outperforms baselines or why semantic metrics are predicted more accurately than code-level ones beyond 'token-level randomness'; the Discussion reframes limitations as future work rather than considering competing interpretations.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper clearly states it predicts specific automated metrics (CodeBLEU, pass@1, cyclomatic complexity, security scores) and does not conflate these proxies with broader notions of 'code quality' or developer productivity.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "Section 5 is titled 'Discussion' and frames limitations primarily as 'promising avenues for future research' (cross-domain, cross-language, closed-source models) rather than a dedicated limitations or threats-to-validity section.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper never identifies concrete threats such as single-model-family bias, CodeBLEU's known limitations as a quality proxy, or the circularity of evaluating benchmark prediction using benchmarks from the same domain cluster.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": false, 68 "justification": "While cross-domain limitations are acknowledged, the paper does not explicitly state what the results do NOT show (e.g., no claim to work with instruction-tuned or RLHF models, no claim beyond same-language pairs).", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 "justification": "Section 7 Acknowledgement clearly states support from the Ministry of Education, Singapore, Academic Research Fund Tier 3 (Award ID: MOET32020-0004).", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All six authors list institutional affiliations (NTU, SMU, Wuhan University) in the paper header; the work evaluates open-source models with no commercial affiliations.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "The funder is the Singapore Ministry of Education providing academic research grants; it has no stake in the CodeLlama models or benchmarks being evaluated.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "There is no competing interests statement or declaration of patents, equity, or consulting relationships; the acknowledgement only covers funding.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Importance sampling, IWAE, prompt distribution, and all major notation are explicitly defined in Sections 2–3 and Table 1; key metrics (CodeBLEU, cyclomatic complexity, security scores) are formally defined in Section 4.3.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Three numbered contributions are explicitly listed at the end of the Introduction: theoretical formalization, the BIS framework itself, and empirical validation of IWAE integration.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 engages with code benchmarking literature, importance sampling theory, and VAE/IWAE work, explaining how BIS extends prior methods; comparisons against prior approaches are made empirically in Section 4.4.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code repository or release link is mentioned anywhere in the paper.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "All benchmarks used (BigCodeBench, HumanEval, EvoEval) are publicly available; no private data was created.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper mentions L20 GPUs and BERT embeddings but provides no requirements file, Dockerfile, or software version specifications.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step instructions are provided; the architectural description in Section 3.2 is conceptual and would require substantial implementation guesswork.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "All results in Tables 3–13 are point estimates only; no confidence intervals or error bars are reported for any result.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "Comparative claims (BIS outperforms all baselines) are made without any statistical significance tests or p-values.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Absolute error values are reported across all conditions, providing a consistent effect-size measure; Table 7 includes 'avg of abs' summaries enabling direct magnitude comparison.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The 8,528 data points are presented as a practical collection from available benchmarks with no power analysis or justification for why this quantity is sufficient.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No variance or standard deviation across repeated runs is reported; each configuration appears to be run once with no measure of stability.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 4.4 compares BIS against 8 baselines: GMM (2 configs), RBM (2 configs), MaxEnt (2 configs), VAE, RSR, LR, DTR, RR, MLP, and RNN.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "The baselines (VAE, GMM, ridge regression, MLP, RNN) are standard and appropriate for this distribution-fitting and regression task; they span both statistical and deep learning approaches.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.5 explicitly performs ablation on four factors: embedding dimensionality (PCA vs linear), prompt set size, number of IWAE samples, and weight truncation percentile.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Results are reported for CodeBLEU, pass@1, cyclomatic complexity, security scores, and four Halstead code-level metrics (length, volume, effort, time).", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": false, 206 "answer": false, 207 "justification": "The paper evaluates automated code quality metrics only; human evaluation is not applicable to this benchmark prediction task.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "The cross-prediction design (BigCode predicts Evo, Evo predicts BigCode) uses each dataset as a held-out test set for the other, providing genuine out-of-distribution evaluation.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results are consistently broken down by model size (7B/13B/34B/70B), by source dataset direction, and by metric type (semantic vs code-level).", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "Section 5 acknowledges failure conditions (extreme weight distributions, cross-domain settings) theoretically but does not present actual empirical failure cases with concrete examples.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "The paper clearly reports that code-level metrics (length, volume) are harder to predict with MAEs up to 10.7%, and that reducing sample size to 100 causes substantial performance degradation.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Specific CodeLlama model sizes (7B, 13B, 34B, 70B) and BERT for embeddings are named; CodeLlama is open-source with deterministic checkpoints.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": false, 245 "justification": "No prompting templates or system instructions used to query CodeLlama are provided; only the benchmark prompts themselves (from public datasets) are referenced.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "The chosen IWAE sample count (10) and truncation percentile (0.9) are identified, but learning rate, batch size, training epochs, and BERT embedding configuration are not reported.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "There is no agentic scaffolding; the paper evaluates LLM outputs on fixed code generation benchmarks without any scaffolding layers.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "The paper documents merging HumanEval with EvoEval into 'Evo', min-max normalization procedure, and the cross-prediction framework setup.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "The raw model outputs (LLM-generated code and scores) collected during experiments are not made available; no data repository is linked.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "The collection procedure is described: 4 CodeLlama models run on 9 benchmarks (2,132 prompts each = ~8,528 total), using open-source models released before benchmark publication to limit contamination.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "Standard public benchmarks are used; no participant recruitment is involved.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The full pipeline from benchmark prompts → BERT embeddings → IWAE training → importance weight computation → weighted score prediction is documented mathematically in Section 3.2.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "The paper says open-source models 'released before the publication of these benchmarks' were selected, but does not state the actual training data cutoff dates for any CodeLlama variant.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": true, 303 "justification": "Data contamination is explicitly one of the two central motivating problems; the choice to use only open-source models released before benchmark publication is directly justified on contamination grounds.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": true, 309 "justification": "The paper deliberately restricts to CodeLlama models predating the benchmarks and discusses the contamination problem extensively in both Introduction and Section 5.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": true, 361 "justification": "Section 4.1 states the evaluation was conducted by renting 8 servers with L20 GPUs at a total cost of $280.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": true, 367 "justification": "$280 across 8 L20 GPU servers is reported as the total computational budget for the experiments.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "BIS achieves an average absolute prediction error of 1.1% for CodeBLEU code correctness scores across 4 CodeLlama models and 2 benchmark clusters.", 376 "evidence": "Table 3 shows BigCode-source avg absolute error 0.8% and Evo-source 1.4%, averaging 1.1%.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "BIS outperforms all baseline methods including GMM, RBM, MaxEnt, VAE, and regression models under both importance-sampling and non-importance-sampling frameworks.", 381 "evidence": "Tables 7–8 show BIS achieving avg-of-abs error 0.011 vs. next-best VAE at 0.015 and MaxEnt at 0.017.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "The framework generalizes to metrics beyond CodeBLEU, including cyclomatic complexity (4.6% MAE), security scores (4.3% MAE), and Halstead code-level metrics (up to 10.7% MAE).", 386 "evidence": "Table 6 reports per-metric, per-model errors across 7 additional metrics.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Prompt distributions alone are sufficient to predict LLM code generation performance without executing generated code or using reference solutions.", 391 "evidence": "The theoretical derivation in Section 3.3 and empirical results across 9 benchmarks support this, but only for one model family (CodeLlama) in similar coding domains.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "BIS can significantly mitigate data contamination risks by eliminating reliance on test suites and reference solutions.", 396 "evidence": "This is presented as a theoretical property; no empirical test of contamination detection is performed.", 397 "supported": "weak" 398 } 399 ], 400 "methodology_tags": [ 401 "benchmark-eval", 402 "theoretical" 403 ], 404 "key_findings": "BIS uses Importance Weighted Autoencoders to model prompt distributions and reweight source benchmark scores to predict target benchmark performance, achieving ~1.1% average absolute error on CodeBLEU for CodeLlama 7B–70B without executing code. The framework outperforms GMM, VAE, RBM, and regression baselines, and extends to cyclomatic complexity and security metrics (3–5% MAE) though performance degrades for Halstead code-level metrics (up to 10.7%). Ablation studies identify optimal IWAE sample count (~10) and weight truncation percentile (0.9), and show that PCA dimensionality reduction is far superior to linear projection. The contamination-reduction motivation is compelling but unvalidated empirically.", 405 "red_flags": [ 406 { 407 "flag": "Single model family only", 408 "detail": "All experiments use CodeLlama variants (7B–70B), which share architecture and training lineage. Generalization to other LLM families is untested but broadly claimed." 409 }, 410 { 411 "flag": "No statistical significance testing", 412 "detail": "All comparative claims (BIS is best) are made on point estimates without confidence intervals or hypothesis tests, making it impossible to assess whether observed differences are meaningful." 413 }, 414 { 415 "flag": "Pass@1 tested on single model only", 416 "detail": "Table 4 reports pass@1 only for CodeLlama-7B 'due to computational constraints', but the abstract and findings generalize pass@1 results without this caveat." 417 }, 418 { 419 "flag": "CodeBLEU as ground truth", 420 "detail": "CodeBLEU is itself a widely-criticized proxy for code correctness; predicting CodeBLEU does not guarantee the framework is useful for predicting actual functional correctness." 421 }, 422 { 423 "flag": "No code or data released", 424 "detail": "Neither the IWAE implementation nor the collected evaluation data are released, making independent replication impossible." 425 }, 426 { 427 "flag": "Abstract/intro metric inconsistency", 428 "detail": "The abstract states 1.1% average error while the introduction states 0.9%; though minor, this internal inconsistency suggests insufficient proofreading." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Evaluating large language models trained on code (HumanEval)", 434 "relevance": "Primary benchmark used as source/target dataset; foundational LLM code evaluation paper" 435 }, 436 { 437 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 438 "relevance": "Second primary benchmark; used as source/target dataset; example of high-cost benchmark development that motivates this work" 439 }, 440 { 441 "title": "SWE-Bench: Can language models resolve real-world GitHub issues?", 442 "relevance": "Representative example of realistic code benchmark with complex test suites" 443 }, 444 { 445 "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM", 446 "relevance": "Source of the 7 EvoEval sub-benchmarks used as the 'Evo' dataset in experiments" 447 }, 448 { 449 "title": "Importance weighted autoencoders (IWAE)", 450 "relevance": "Core technical component of BIS; provides the distribution modeling method" 451 }, 452 { 453 "title": "Does Data Contamination Detection Work (Well) for LLMs? A Survey and Evaluation on Detection Assumptions", 454 "relevance": "Background on the contamination problem that motivates this work" 455 }, 456 { 457 "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks", 458 "relevance": "Recent empirical evidence showing benchmark contamination is a real and widespread problem" 459 }, 460 { 461 "title": "Code Llama: Open foundation models for code", 462 "relevance": "The model family used in all experiments" 463 } 464 ], 465 "engagement_factors": { 466 "practical_relevance": { 467 "score": 2, 468 "justification": "Reducing benchmark construction costs is a genuine practical problem, but the current validation on one model family and similar benchmark domains limits immediate applicability." 469 }, 470 "surprise_contrarian": { 471 "score": 2, 472 "justification": "The claim that prompt distributions alone (without code execution or test suites) can predict LLM performance challenges the implicit assumption in all prior benchmark-based evaluation." 473 }, 474 "fear_safety": { 475 "score": 0, 476 "justification": "No AI safety or risk concerns are raised; the paper is a methodological contribution to benchmark evaluation." 477 }, 478 "drama_conflict": { 479 "score": 0, 480 "justification": "Straightforward technical paper with no controversy, competing claims, or adversarial framing." 481 }, 482 "demo_ability": { 483 "score": 1, 484 "justification": "The concept is demonstrable in principle, but no code is released and the setup requires running large LLMs on public benchmarks, limiting casual replication." 485 }, 486 "brand_recognition": { 487 "score": 0, 488 "justification": "Authors are from NTU, SMU, and Wuhan University — credible academic institutions but not AI lab brands with large public followings." 489 } 490 }, 491 "hn_data": { 492 "threads": [ 493 { 494 "hn_id": "45327964", 495 "title": "We Politely Insist: Your LLM Must Learn the Persian Art of Taarof", 496 "points": 181, 497 "comments": 122, 498 "url": "https://news.ycombinator.com/item?id=45327964" 499 }, 500 { 501 "hn_id": "44455950", 502 "title": "AI for Scientific Search", 503 "points": 125, 504 "comments": 34, 505 "url": "https://news.ycombinator.com/item?id=44455950" 506 }, 507 { 508 "hn_id": "32367085", 509 "title": "From maximum force to physics in 9 lines – and implications for quantum gravity", 510 "points": 48, 511 "comments": 50, 512 "url": "https://news.ycombinator.com/item?id=32367085" 513 }, 514 { 515 "hn_id": "42993305", 516 "title": "The Differences Between Direct Alignment Algorithms Are a Blur", 517 "points": 8, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=42993305" 520 }, 521 { 522 "hn_id": "40333494", 523 "title": "From maximum force to physics in 9 lines", 524 "points": 3, 525 "comments": 1, 526 "url": "https://news.ycombinator.com/item?id=40333494" 527 }, 528 { 529 "hn_id": "28075769", 530 "title": "The MIT Supercloud Dataset", 531 "points": 3, 532 "comments": 0, 533 "url": "https://news.ycombinator.com/item?id=28075769" 534 }, 535 { 536 "hn_id": "44591216", 537 "title": "Rethinking the Illusion of Thinking", 538 "points": 2, 539 "comments": 1, 540 "url": "https://news.ycombinator.com/item?id=44591216" 541 }, 542 { 543 "hn_id": "19815852", 544 "title": "What’s Wrong with Risk Matrices?", 545 "points": 2, 546 "comments": 1, 547 "url": "https://news.ycombinator.com/item?id=19815852" 548 }, 549 { 550 "hn_id": "44517330", 551 "title": "A Survey on Latent Reasoning", 552 "points": 2, 553 "comments": 0, 554 "url": "https://news.ycombinator.com/item?id=44517330" 555 }, 556 { 557 "hn_id": "43176012", 558 "title": "Discovering Chunks in Neural Embeddings for Interpretability", 559 "points": 2, 560 "comments": 0, 561 "url": "https://news.ycombinator.com/item?id=43176012" 562 } 563 ], 564 "top_points": 181, 565 "total_points": 376, 566 "total_comments": 209 567 } 568 }