scan.json (28772B)
1 { 2 "paper": { 3 "title": "THROWBENCH: Benchmarking LLMs by Predicting Runtime Exceptions", 4 "authors": [ 5 "Julian Aron Prenner", 6 "Romain Robbes" 7 ], 8 "year": 2025, 9 "venue": "arXiv.org", 10 "arxiv_id": "2503.04241", 11 "doi": "10.48550/arXiv.2503.04241" 12 }, 13 "scan_version": 3, 14 "active_modules": [ 15 "experimental_rigor", 16 "data_leakage" 17 ], 18 "methodology_tags": [ 19 "benchmark-eval" 20 ], 21 "key_findings": "THROWBENCH is a multilingual benchmark of 2,466 programs (Python, Java, C#, Ruby) where LLMs must predict runtime exception types. Evaluating six open-weight code LLMs (7B–34B), performance is modest (F1 19–38%), with Qwen2.5 Coder (32B) performing best. Performance varies substantially by language and exception type: ZeroDivision is relatively easy across languages, while overflow and null-pointer exceptions remain very challenging. Model size does not strongly correlate with performance (ρ=0.39).", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The benchmark and all model outputs are released at https://github.com/giganticode/throwbench/ as stated in the abstract and Section V." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The benchmark dataset of 2,466 programs is released at the same GitHub repository. The underlying data also comes from the publicly available RunBugRun and CodeNet datasets." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper mentions using Ollama and lists model names, sizes, and quantization levels (Table I), but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The approach is described at a high level (Section III-B) but lacks specific commands or a README-style guide for replication." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results (Tables I and II) are reported as point estimates of precision, recall, and F1. No confidence intervals or error bars are provided." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper compares F1 scores across 6 models and claims performance differences (e.g., Qwen2.5 Coder is best) without any statistical significance tests. Only a Pearson correlation (ρ=0.39) for model size vs. performance is reported." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "Only raw F1 scores are reported. No formal effect sizes (Cohen's d, odds ratios) are given. The Pearson ρ=0.39 for model size vs. performance is the only measure of association." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The final benchmark size (2,466 programs) is stated but not justified. The filtering steps are described (Section III-A) but no rationale is given for why this number is sufficient to draw reliable conclusions." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Results appear to be from single inference runs. No standard deviations, variance across seeds, or any spread measures are reported." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Six different code LLMs are evaluated (Table I), providing comparison across models. However, no non-LLM baselines (random classifier, majority class, static analysis) are included." 
77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "All six models are recent (2023–2024): CodeGemma, DeepSeek Coder 2, Codestral, Qwen2.5 Coder, DeepSeek Coder, and CodeLlama. These represent the contemporary open-weight code LLM landscape." 82 }, 83 "ablation_study": { 84 "applies": false, 85 "answer": false, 86 "justification": "This is a benchmark paper, not a system with components to ablate. There is no proposed method with separable components." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Precision, Recall, and F1 are reported (Table I), providing three complementary evaluation metrics." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "Human evaluation is not relevant for this benchmark — ground truth is determined objectively by program execution, not human judgment." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The models are pre-trained and used as-is without any fine-tuning on the benchmark data. The entire benchmark serves as a held-out test set." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table II provides a detailed breakdown of F1 scores per programming language (4 languages) and per exception type (37 types across languages) for all 6 models." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section IV discusses exception types where all models struggle: overflow errors, null dereference (NullPointerException, NullReferenceException), and Ruby's RangeError. Specific patterns of failure are identified." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The overall framing is that performance is modest (19–38% F1). 
The paper reports that model size doesn't correlate strongly with performance (ρ=0.39), and that the largest model (CodeLlama 34B) performed worst." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 'modest performance ranging from 19 to 38% (F1 score)' which matches Table I exactly. The claim that THROWBENCH consists of 'over 2,400 short user-written programs' matches the 2,466 count. The contamination-free claim is supported by the execution-based ground truth design." 124 }, 125 "causal_claims_justified": { 126 "applies": false, 127 "answer": false, 128 "justification": "The paper makes no causal claims. It reports descriptive benchmark results without claiming that any factor causes performance differences." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "Claims are bounded to the six tested open-weight models (7B–34B) and four programming languages. The paper does not overclaim — it explicitly notes the benchmark is 'intended to be used in addition to other benchmarks' and frames results as specific to the evaluated models." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for observed performance differences. For instance, it does not consider whether quantization levels, model training data composition, or prompt sensitivity could explain the results (except a brief CodeGemma quantization check)." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures exception type prediction accuracy (F1) and claims to assess this exact capability. No proxy gap exists — the measurement matches the claimed outcome." 
144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table I lists specific model names, sizes, and quantization levels (e.g., 'Qwen2.5 Coder Instr. 32B Q4_K_M', 'CodeGemma Instr. 7B FP16'). These identify specific model variants retrievable via Ollama." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Figure 1 provides the full prompt text used in evaluation, including the exception type list, instructions, and placeholder format for code and input. Since the benchmark data is released, every prompt can be reconstructed." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No generation hyperparameters (temperature, top-p, max tokens) are reported. The paper states evaluation was done via Ollama's Python API but does not specify inference settings." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. Models are prompted directly with a single-turn prompt (Figure 1)." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section III-A documents the benchmark creation pipeline: selecting buggy programs from RunBugRun in 4 languages, executing them, filtering for exception-throwing programs, subsampling for balance, discarding programs >53 LOC (95th percentile), and adding 10% correct programs." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated limitations or threats-to-validity section. Section V ('Conclusions') briefly mentions future directions (omitting answer choices, using chain-of-thought) but does not discuss limitations of the current work." 
178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the effect of quantization on results, the representativeness of competition programming problems, or prompt sensitivity." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss limitations of testing only open-weight models, only competition-style programs, or only exception prediction as a measure of code understanding." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The benchmark data and all model outputs are released at https://github.com/giganticode/throwbench/, enabling independent verification of reported results." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section III-A describes the data source (RunBugRun, based on CodeNet), the selection criteria (buggy programs that throw exceptions), and the filtering process (language selection, balancing, length cutoff)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data comes from standard public datasets (RunBugRun/CodeNet programming contest submissions)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "The pipeline steps are listed (Section III-A) but intermediate counts are not provided. The paper gives the starting source (RunBugRun) and final counts (2,466 total) but does not state how many programs existed at each filtering stage or how many were removed." 
210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding acknowledgments or grant information appears in the paper." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Free University of Bozen-Bolzano and Univ. Bordeaux/CNRS. Neither author is affiliated with the companies that produced the evaluated models." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "Funding is not disclosed, so independence of the funder from outcomes cannot be assessed." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interest statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The training data cutoff dates for the six evaluated models are not stated anywhere in the paper." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "The paper argues that while the source code may appear in training data (from CodeNet/RunBugRun), the ground truth labels (exception types) were determined by execution and are not in training data: 'the code in training corpora hardly contains any information on their runtime behavior' (Section II)." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Contamination is a central design consideration. The paper explicitly argues THROWBENCH is 'currently free from any contamination issues' because ground-truth answers were obtained through program execution, not from information that would appear in training data (Sections II and V)." 
249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference cost, latency, or time-per-example is reported despite evaluating 2,466 examples across 6 models." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No hardware specifications, GPU hours, or total compute budget is stated. The paper mentions using Ollama but does not describe the evaluation infrastructure." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Results appear to be from single runs. No mention of multiple random seeds or sensitivity analysis across runs." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is not stated. 
It is unclear whether results are from a single run or averaged over multiple runs." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget is reported. Generation parameters (temperature, top-p) are not even stated, let alone any search over them." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "Different quantization levels are used for different models (FP16 for CodeGemma, 4-bit quantization such as Q4_0 or Q4_K_M for the others) without full justification for these choices. A brief CodeGemma quantization comparison is provided but not systematic across models." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors introduce the benchmark and evaluate models on it without discussing potential biases in their benchmark construction or evaluation methodology." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "Different models use different quantization levels (FP16 vs 4-bit), affecting compute requirements, but performance is not reported as a function of compute budget. The CodeGemma quantization test is the only partial check." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper claims the benchmark tests 'code understanding' and 'runtime behavior' prediction but does not validate whether exception type prediction actually measures deep code comprehension versus surface-level pattern matching." 
340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "All models are evaluated with the same prompt structure (Figure 1) and the same infrastructure (Ollama). The evaluation is consistent across models, isolating model differences." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "The paper argues that even if the source code (from CodeNet, published 2021) appears in training data, the ground-truth exception types were determined by execution and would not be in training data. This structurally addresses temporal leakage of labels." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "The prompt provides the list of possible exception types as answer options, which constrains the output space. The paper notes this makes the task 'slightly easier' but does not analyze whether this constitutes feature leakage or how much it inflates scores." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper does not discuss whether benchmark programs share structural similarities (e.g., same contest problems, similar code patterns from CodeNet) that could violate independence assumptions." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "The paper makes a conceptual argument about contamination-free design but does not apply any concrete leakage detection method (no canary strings, membership inference, or n-gram overlap analysis)." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "THROWBENCH is challenging for current state-of-the-art LLMs, with the best model achieving only 38% F1.", 373 "evidence": "Table I shows Qwen2.5 Coder achieves the highest F1 of 38.2% among six models. 
Performance ranges from 19.0% (CodeLlama) to 38.2%.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Model size and performance are not strongly correlated (Pearson's ρ=0.39).", 378 "evidence": "Section IV: The 7B CodeGemma outperformed the 34B CodeLlama (21.1% vs 19.0% F1). The correlation across 6 models is ρ=0.39.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Performance varies substantially across programming languages, with Ruby lowest (22%) and Python highest (29%).", 383 "evidence": "Table II micro-averages across models: Ruby 13.7–35.0%, Python 18.9–49.3% (from per-model averages). Averaged over all models: Ruby 22%, Python 29%.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "ZeroDivision exceptions are recognized with relatively high success, while overflow and null dereference errors pose major challenges.", 388 "evidence": "Table II shows DivideByZero/Arithmetic exceptions have green bands across models and languages. NullPointer (Java), NullReference (C#), StackOverflow, and Overflow exceptions show very low F1 scores across all models.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "THROWBENCH is free from contamination because ground-truth answers were determined through program execution.", 393 "evidence": "Section II argues that 'the code in training corpora hardly contains any information on their runtime behavior.' 
Section V: 'ground-truth answers were obtained through program execution.'", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Quantization (FP16 vs 4-bit) does not significantly affect performance for CodeGemma.", 398 "evidence": "Section IV: 'We also evaluated CodeGemma with 4-bit quantization and found no significant performance difference (in fact, the 4-bit version performed 0.2% better).'", 399 "supported": "weak" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No statistical significance testing", 405 "detail": "Performance differences between models are reported as raw F1 score comparisons without any significance tests. With 2,466 examples, small differences could be statistically meaningful or due to chance — this is impossible to assess without tests." 406 }, 407 { 408 "flag": "No variance or uncertainty quantification", 409 "detail": "All results appear to be single-run point estimates with no error bars, confidence intervals, or variance across runs. LLM outputs can be stochastic, and the stability of these results is unknown." 410 }, 411 { 412 "flag": "Only open-weight models tested", 413 "detail": "The evaluation covers only 7B–34B open-weight models. No proprietary models (GPT-4, Claude, Gemini) are tested, limiting the generalizability of claims about LLM capabilities on this task." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper lacks any limitations or threats-to-validity discussion. Issues such as the representativeness of competition code, prompt sensitivity, quantization confounds, and benchmark construct validity are not addressed." 418 }, 419 { 420 "flag": "Inconsistent quantization across models", 421 "detail": "CodeGemma uses FP16 while all other models use 4-bit quantization. Although a brief check for CodeGemma shows minimal difference, this was not verified for the other five models, creating a potential confound." 
422 }, 423 { 424 "flag": "Missing trivial baseline", 425 "detail": "No random-guess or majority-class baseline is included. Without this, it is difficult to assess whether the 19–38% F1 scores reflect genuine code understanding or could be partially achieved by simple heuristics." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Evaluating Large Language Models Trained on Code", 431 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 432 "year": 2021, 433 "arxiv_id": "2107.03374", 434 "relevance": "Introduced HumanEval and Codex, foundational code LLM benchmark and model." 435 }, 436 { 437 "title": "Program Synthesis with Large Language Models", 438 "authors": ["J. Austin", "A. Odena", "M. Nye"], 439 "year": 2021, 440 "arxiv_id": "2108.07732", 441 "relevance": "Introduced MBPP, a widely-used code synthesis benchmark." 442 }, 443 { 444 "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation", 445 "authors": ["F. Cassano", "J. Gouwar", "D. Nguyen"], 446 "year": 2023, 447 "doi": "10.1109/TSE.2023.3267446", 448 "relevance": "Extended code benchmarks to 18+ programming languages, relevant to multilingual LLM evaluation." 449 }, 450 { 451 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 452 "authors": ["N. Jain", "K. Han", "A. Gu"], 453 "year": 2024, 454 "arxiv_id": "2403.07974", 455 "relevance": "Contamination-aware code benchmark using live programming competition problems." 456 }, 457 { 458 "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution", 459 "authors": ["A. Gu", "B. Rozière", "H. Leather", "A. Solar-Lezama", "G. Synnaeve", "S. I. Wang"], 460 "year": 2024, 461 "arxiv_id": "2401.03065", 462 "relevance": "Closest related benchmark — tests runtime behavior prediction (input/output) for code understanding." 463 }, 464 { 465 "title": "Is Your Code Generated by ChatGPT Really Correct? 
Rigorous Evaluation of Large Language Models for Code Generation", 466 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 467 "year": 2023, 468 "relevance": "EvalPlus — enhanced evaluation rigor for code generation benchmarks with additional test cases." 469 }, 470 { 471 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 472 "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim"], 473 "year": 2024, 474 "arxiv_id": "2406.15877", 475 "relevance": "Large-scale code generation benchmark evaluating diverse function calls." 476 }, 477 { 478 "title": "RunBugRun – An Executable Dataset for Automated Program Repair", 479 "authors": ["J. A. Prenner", "R. Robbes"], 480 "year": 2023, 481 "arxiv_id": "2304.01102", 482 "relevance": "Source dataset for THROWBENCH; executable bug dataset used for automated program repair research." 483 }, 484 { 485 "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 486 "authors": ["C. S. Xia", "L. Zhang"], 487 "year": 2023, 488 "arxiv_id": "2304.00385", 489 "relevance": "LLM-based automated program repair with cost analysis." 490 }, 491 { 492 "title": "Code Llama: Open Foundation Models for Code", 493 "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"], 494 "year": 2024, 495 "arxiv_id": "2308.12950", 496 "relevance": "Open-weight code LLM evaluated in THROWBENCH; foundational model for code tasks." 497 }, 498 { 499 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence", 500 "authors": ["D. Guo", "Q. Zhu", "D. Yang"], 501 "year": 2024, 502 "arxiv_id": "2401.14196", 503 "relevance": "Open-weight code LLM evaluated in THROWBENCH." 504 }, 505 { 506 "title": "Qwen2.5-Coder Technical Report", 507 "authors": ["B. Hui", "J. Yang", "Z. Cui"], 508 "year": 2024, 509 "arxiv_id": "2409.12186", 510 "relevance": "Best-performing model on THROWBENCH; recent open-weight code LLM." 
511 }, 512 { 513 "title": "Measuring Coding Challenge Competence With APPS", 514 "authors": ["D. Hendrycks", "S. Basart", "S. Kadavath"], 515 "year": 2021, 516 "relevance": "Large code benchmark from programming competitions, similar sourcing to THROWBENCH." 517 } 518 ], 519 "engagement_factors": { 520 "practical_relevance": { 521 "score": 1, 522 "justification": "A benchmark that could be used to evaluate code LLMs, but not a directly usable tool or technique for practitioners." 523 }, 524 "surprise_contrarian": { 525 "score": 1, 526 "justification": "LLMs struggling with runtime exception prediction is mildly surprising given claims of code understanding, but not deeply contrarian." 527 }, 528 "fear_safety": { 529 "score": 0, 530 "justification": "No AI safety or security concerns raised." 531 }, 532 "drama_conflict": { 533 "score": 0, 534 "justification": "No controversial claims or conflicts with other work." 535 }, 536 "demo_ability": { 537 "score": 2, 538 "justification": "Benchmark and code released on GitHub; anyone with Ollama and a GPU can run the evaluation." 539 }, 540 "brand_recognition": { 541 "score": 0, 542 "justification": "Authors from Free University of Bozen-Bolzano and University of Bordeaux; no major AI lab branding." 543 } 544 } 545 }