scan-v5.json (26436B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Extracting Fix Ingredients using Language Models", 6 "authors": [ 7 "Julian Aron Prenner", 8 "Romain Robbes" 9 ], 10 "year": 2025, 11 "venue": "2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)", 12 "arxiv_id": "2503.04214", 13 "doi": "10.1109/Forge66646.2025.00028" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All major abstract claims are backed by experimental results: ingredient prevalence by RQ1 analysis, 31% relative improvement by Table II, and large-context outperformance by the same table.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "Improvement claims are supported by controlled experiments comparing ScanFix variants against explicit baselines where the only experimental variable is ingredient augmentation strategy.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Claims are appropriately bounded to CodeT5-based models and file-level context; the discussion of Sutton's bitter lesson further acknowledges generalization limits to other architectures.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section VII.B discusses whether large context windows will make ScanFix obsolete, the 'lost in the middle' phenomenon, and whether improvements stem from targeted extraction vs. 
simply providing more tokens.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper explicitly acknowledges exact match is a proxy: 'bugs in TSSB-3M are not executable (and lack tests) we resort to exact match', clearly distinguishing this from actual bug-fixing verification.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section VII.A is a dedicated 'Limitations' subsection covering software bugs, non-identifier ingredients, lexical analysis limitations, single model architecture, LLM memorization, dataset choice, and file-level scanning.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Threats are specific: e.g., 'repeating experiments using a second or even third model architecture would have exceeded our computational budget' and 'Defects4J has only around 800 bugs... each individual bug would have a very large weight'.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Explicit scope boundaries are stated: identifier ingredients only (not literals or compound snippets), file-level context for TSSB-3M, and CodeT5 models only.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Acknowledgments state: 'This study has received financial support from the French State in the framework of the Investments for the Future programme IdEx université de Bordeaux.'", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations are clearly listed on the first page: Free University of Bozen-Bolzano and Univ. 
Bordeaux/LaBRI.", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "The French government IdEx university excellence program is independent of automated program repair tool outcomes; no apparent conflict exists.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests or financial interests declaration is present beyond the funding statement; there is no 'authors declare no competing interests' statement.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Key terms are formally defined in Section III: 'identifier ingredients', 'fix ingredients', 'fixall', 'winin', 'winout', 'filein', 'fileout', 'projin', 'ingredient cover', and 'local context' all receive precise definitions.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Four research questions (RQ1-RQ4) are stated upfront, mapping to distinct contributions: ingredient prevalence analysis, impact on repair success, scanner model evaluation, and the combined ScanFix system.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section II substantively engages with prior work: comparing to SequenceR, FitRepair, relevant-identifier prompting, RAG approaches, and search-based APR, explaining how this work extends or differs from each.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "A replication package is explicitly provided at https://github.com/giganticode/llm_ingredient_extraction (reference [12]).", 122 "source": "haiku" 123 }, 124 "data_released": { 125 
"applies": true, 126 "answer": true, 127 "justification": "Both datasets used (TSSB-3M and Defects4J) are publicly available standard benchmarks; no novel private dataset is introduced that would require separate release.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "No requirements files, Dockerfiles, or library versions are specified; tools are named (TreeSitter, Pygments, Ctags) but without version numbers or environment setup instructions in the paper.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper provides a replication package link but does not include step-by-step reproduction instructions in the paper itself; readers must rely entirely on the external repository.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": true, 147 "justification": "95% confidence interval error bands are shown in Figures 5, 6, 7, and 10 for repair success across ingredient count and distance analyses.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": false, 153 "justification": "No formal statistical significance tests (t-tests, Wilcoxon, etc.) are reported for model comparisons; Table II shows only point estimates without any inferential statistics.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Both absolute and relative improvement figures are consistently reported throughout (e.g., 'absolute performance increase of 2.55% and a relative improvement of roughly 7%', '31.5% (abs. 
5.9%)').", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": true, 165 "justification": "Table I documents all dataset splits with sizes; the paper explains why Defects4J (800 bugs) cannot be used for RQ3/RQ4, and the 500-bug random sample is justified by API rate limiting constraints.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "Table II (main results) shows only point estimates without confidence intervals or standard deviations; error bands are only shown in figures for subset analyses, not for the primary comparative results.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Multiple baselines are included: 'No ingredients', 'Naive ingredients', 'Large context', 'Perfect ingredients', 'Perfect ingredients (file)', and 'Perfect recall, low precision'.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "Defects4J analysis includes recently published tools (TARE 2023, FitRepair 2023, RAP-Gen 2023); the large-context model directly tests the competing simple approach with the same underlying architecture.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "Multiple ablations are presented: two scanner variants ('All' vs. 'OOW'), three classification thresholds (0.05, 0.5, 0.95), and models with vs. 
without ingredient augmentation.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "Scanner evaluated with precision, recall, and F1 at multiple thresholds; repair success measured by exact match, per-ingredient-count breakdown, and per-ingredient-distance analysis.", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": false, 202 "answer": false, 203 "justification": "Human evaluation of system outputs is not applicable for this automated program repair paper; evaluation uses exact match against ground truth.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": true, 209 "justification": "Table I explicitly defines separate training, validation, and test splits for each RQ, with disjoint training sets for scanner and repair models specifically to avoid data leakage.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results are broken down by ingredient count (Figures 5, 6), ingredient distance (Figures 7, 10), in-window vs. out-of-window categories, and rare vs. 
common ingredients.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Failure modes are discussed: 'low performance for multiple fix ingredients', performance drops for far-away ingredients, and Figure 11 illustrates success/failure patterns with a concrete example.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "The paper prominently reports that ScanFix is outperformed by the large-context baseline and explicitly frames this as evidence for Sutton's bitter lesson, stating it 'discourages further research into domain-specific solutions'.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": true, 235 "justification": "Models are identified with parameter counts: 'CodeT5 (small variant with 60M parameters)' and 'BigCode's pre-trained StarEncoder model with roughly 125M parameters'; Hugging Face URLs are cited.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": true, 241 "justification": "Input formats for both scanner and repair models are described with special tokens (<BUGSTART>, <BUGEND>, <SCAN>, <INGRE>) and concrete example inputs are shown in Figures 4, 8, and 11.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Learning rates, epochs, batch sizes, and gradient accumulation are reported for both models: repair model (lr=1e-4, 4 epochs, batch=12, accum=2) and scanner model (lr=6e-5, 4 epochs, batch=30, accum=3).", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "No agentic scaffolding is used; the system is a straightforward two-model pipeline (scanner → repair model) with a simple inference procedure, not an agent framework.", 254 
"source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Section III.A details mining from GitHub, pre-processing multi-line strings via TreeSitter, deduplication by commit hash (reducing 3M to 900K bugs), encoding issue filtering, and local context construction.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "Both TSSB-3M and Defects4J are publicly available; the replication package is provided, making processed data derivable from public sources.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Section III.A describes the mining procedure in detail: using TSSB-3M commit hashes, the GitHub API for full file contents, and Defects4J's relevant class lists for project-level ingredients.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants; all data derives from public software repositories (TSSB-3M, GitHub, Defects4J).", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "The full pipeline is documented: mining → preprocessing → deduplication → filtering → ingredient extraction → dataset splitting (with sizes in Table I) → training and evaluation.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Training data cutoffs for the pre-trained base models (CodeT5, StarEncoder) are not stated; memorization concerns are acknowledged but not addressed via cutoff verification.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": true, 298 "answer": true, 299 "justification": "Data leakage between scanner and repair 
model training sets is explicitly addressed: 'we take care to use different training sets for the scanner model and the final ScanFix model (RQ4) to avoid data leakage issues' (Table I).", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section VII.A explicitly discusses memorization: 'Our model is based on the small version of CodeT5 (60M parameters), both due to our limited resources and to minimize these memorization issues.'", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human participants.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": false, 357 "justification": "No inference cost or latency figures are reported; only qualitative discussion of VRAM constraints and the quadratic cost of large-context attention.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 
"answer": false, 363 "justification": "The paper mentions VRAM budget constraints and that extra model runs 'would have exceeded our computational budget' but provides no specific GPU hours, hardware specs, or cost figures.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Identifier ingredients are prevalent in program repair: 85% of Defects4J bugs and 44% of TSSB-3M bugs require at least one identifier ingredient.", 372 "evidence": "RQ1 analysis on both datasets with full enumeration of ingredient sets and cover calculations shown in Figure 3.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "39–51% of fix identifier ingredients fall outside a typical repair model's 30-line input window.", 377 "evidence": "Figure 3 cover percentages: input window covers 61% for Defects4J and 49% for TSSB-3M, meaning 39% and 51% are out-of-window respectively.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Repair success decreases as fix ingredient count increases and as ingredients are farther from the bug location.", 382 "evidence": "Figures 5 and 6 show downward trends across all tools with ingredient count; Figure 7 shows distance-dependent performance degradation with rare ingredients most affected.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "ScanFix achieves up to 31% relative improvement over the no-ingredient baseline for bugs with out-of-window fix ingredients.", 387 "evidence": "Table II: 'Scanner Ingrs. t=0.05 (OOW)' = 24.56% vs 'No Ingrs.' = 18.68% on winout bugs (31.5% relative improvement).", 388 "supported": "strong" 389 }, 390 { 391 "claim": "A large-context baseline (5120 tokens, no ingredient augmentation) outperforms all ScanFix variants, achieving 47.8% relative improvement over the no-ingredient baseline.", 392 "evidence": "Table II: 'Large Context (no ingrs.)' = 27.60% vs 'No Ingrs.' 
= 18.68% for winout bugs; best ScanFix variant is 24.56%.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Oracle (perfect) ingredient augmentation far outperforms all automatic approaches, showing ingredient extraction quality is the binding constraint.", 397 "evidence": "Table II: 'Perfect Ingrs.' = 65.23% vs best ScanFix = 24.56% for winout bugs, a 2.7x gap demonstrating large theoretical headroom.", 398 "supported": "strong" 399 } 400 ], 401 "methodology_tags": [ 402 "benchmark-eval", 403 "empirical" 404 ], 405 "key_findings": "Identifier ingredients (variable, method, class names) are prevalent in neural program repair but frequently fall outside repair models' input windows (39–51% out-of-context). ScanFix uses a StarEncoder-based scanner model to extract likely ingredients from file-level context, achieving 7–31% relative improvement for out-of-window bugs. However, simply expanding the input window from 1024 to 5120 tokens outperforms ScanFix (47.8% relative improvement), supporting Sutton's bitter lesson that scaling computation beats domain-specific engineering. The large gap between ScanFix and an oracle-ingredient baseline indicates the bottleneck is extraction quality, not the fundamental viability of the approach.", 406 "red_flags": [ 407 { 408 "flag": "Exact match only", 409 "detail": "TSSB-3M evaluation relies solely on exact string match because bugs are not executable; this may reward lexically identical patches over semantically correct ones and does not verify actual bug fixing." 410 }, 411 { 412 "flag": "Single model architecture", 413 "detail": "Both scanner and repair models use only CodeT5/StarEncoder; the paper acknowledges this as a limitation but does not test a second architecture, limiting generalizability of the comparative results." 
414 }, 415 { 416 "flag": "No significance testing in main results", 417 "detail": "Table II reports only point estimates; no formal statistical tests compare ScanFix variants against baselines, making it unclear whether observed differences are statistically significant." 418 }, 419 { 420 "flag": "RQ3/RQ4 limited to file-level context", 421 "detail": "Project-level ingredient extraction (which covers 90%+ of ingredients per RQ1) cannot be evaluated on TSSB-3M due to data availability constraints, leaving the most impactful setting untested by the main experiments." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "TSSB-3M: Mining single statement bugs at massive scale", 427 "relevance": "Primary dataset for training and evaluation throughout RQ1–RQ4" 428 }, 429 { 430 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 431 "relevance": "Secondary benchmark providing project-level context and repair tool comparison data" 432 }, 433 { 434 "title": "The plastic surgery hypothesis", 435 "relevance": "Foundational hypothesis motivating the entire ingredient-based approach to program repair" 436 }, 437 { 438 "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models (FitRepair)", 439 "relevance": "Most closely related prior work on relevant-identifier prompting with LLMs; ScanFix directly extends this" 440 }, 441 { 442 "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair", 443 "relevance": "Prior NPR work that augmented input with static class-level identifiers, directly preceding this approach" 444 }, 445 { 446 "title": "Out of context: How important is local context in neural program repair?", 447 "relevance": "By same first author; informs local context size choices and asymmetric window (18/12 lines) used throughout" 448 }, 449 { 450 "title": "RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5", 451 "relevance": "Competing RAG-based approach to context 
augmentation for program repair" 452 }, 453 { 454 "title": "Can OpenAI's Codex fix bugs? An evaluation on QuixBugs", 455 "relevance": "Prior work by same first author raising memorization concerns in LLM-based APR, motivating CodeT5-small choice" 456 }, 457 { 458 "title": "Lost in the Middle: How Language Models Use Long Contexts", 459 "relevance": "Motivates concern that large context windows may not effectively use all provided context" 460 }, 461 { 462 "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code", 463 "relevance": "Base model for the repair component of ScanFix" 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "Provides concrete techniques for improving neural program repair, though the bitter lesson conclusion somewhat deflates the main approach's adoption prospects." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "The finding that expanding the input window beats a carefully engineered ingredient extraction system is counterintuitive and is explicitly framed as an instance of Sutton's bitter lesson." 474 }, 475 "fear_safety": { 476 "score": 0, 477 "justification": "No AI safety or risk concerns raised; this is a software engineering tools paper focused on program repair performance." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "Framing the result as an instance of Sutton's bitter lesson creates mild tension about whether the research direction is worth pursuing, with the authors explicitly discouraging follow-on work." 482 }, 483 "demo_ability": { 484 "score": 1, 485 "justification": "Code is available on GitHub with a replication package, but no interactive demo or runnable notebook is provided and setup requires training custom models." 
486 }, 487 "brand_recognition": { 488 "score": 0, 489 "justification": "Authors are from Free University of Bozen-Bolzano and Université de Bordeaux; no famous AI labs or widely-recognized products are involved." 490 } 491 }, 492 "hn_data": { 493 "threads": [ 494 { 495 "hn_id": "30665928", 496 "title": "PERCEPT: Online change-point detection using topological data analysis", 497 "points": 8, 498 "comments": 0, 499 "url": "https://news.ycombinator.com/item?id=30665928" 500 }, 501 { 502 "hn_id": "42999205", 503 "title": "Flip Graphs with Symmetry and New Matrix Multiplication Schemes", 504 "points": 3, 505 "comments": 0, 506 "url": "https://news.ycombinator.com/item?id=42999205" 507 }, 508 { 509 "hn_id": "44256016", 510 "title": "Can Theoretical Physics Research Benefit from Language Agents?", 511 "points": 1, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=44256016" 514 } 515 ], 516 "top_points": 8, 517 "total_points": 12, 518 "total_comments": 0 519 } 520 }