scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (34600B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Extracting Fix Ingredients using Language Models",
      6     "authors": [
      7       "Julian Aron Prenner",
      8       "Romain Robbes"
      9     ],
     10     "year": 2025,
     11     "venue": "2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)",
     12     "arxiv_id": "2503.04214",
     13     "doi": "10.1109/Forge66646.2025.00028"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims are well-supported: 'relative improvements of up to 31%' matches Table II (31.5% for OOW winout); 'outperformed by a model with a large input window' matches Table II (39.28% vs 37.27%); 'augmenting with ground-truth fix leads to even better results' matches Table II (51.99%).",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Causal claims ('augmenting repair model input with ingredients yields improvements') are supported by controlled experimental comparisons: same model architecture, same training data, varying only the ingredient augmentation. The ablation-style design (adding/removing ingredients) is adequate for these claims.",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract specifies 'a dataset of 85,000 Python bugs.' Section VII-A explicitly bounds scope: single model architecture, file-level scanning only for TSSB-3M, identifier ingredients only (not literals or compound ingredients), lexical analysis only.",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section VII-B discusses Sutton's bitter lesson as an alternative framing. The paper considers that large context windows might solve the problem entirely, that memorization in models could explain some performance, and that ingredient frequency in training data affects success (Section IV-D).",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly acknowledges using exact match as a proxy: 'bugs in TSSB-3M are not executable (and lack tests) we resort to exact match to assess whether a generated patch is correct or not' (Section IV-B). This clearly distinguishes the measurement proxy from actual correctness.",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section VII-A 'Limitations' is a substantial dedicated section covering seven specific limitations: potential software bugs, non-identifier ingredients, lexical analysis limitations, single model architecture, LLM impact, dataset constraints, and file-level scanning limitations.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The limitations are specific to this study: 'We use a single model architecture' with explanation that 'repeating experiments using a second or even third model architecture would have exceeded our computational budget'; 'we were not able to use Defects4J... due to its small size'; lexical analysis 'ignores an identifier's syntactic function.'",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section VII-A explicitly states what was NOT tested: non-identifier ingredients ('Literals, compound ingredients... are not studied'), project-level scanning for TSSB-3M ('mining for project-level identifiers impractical'), LLM evaluation ('we decided not to focus on LLMs'), and scope/accessibility analysis.",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments section states: 'This study has received financial support from the French State in the framework of the Investments for the Future programme IdEx université de Bordeaux.'",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are clearly listed: Julian Aron Prenner at Free University of Bozen-Bolzano, Romain Robbes at Univ. Bordeaux/CNRS/Bordeaux INP/LaBRI. No commercial product is being evaluated, so no product-affiliation conflict exists.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The French State's IdEx program is a general academic funding source with no financial stake in whether ScanFix outperforms baselines.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial interests statement is present in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Fix ingredients, identifier ingredients, ingredient cover, and the full taxonomy (fixall, winin, winout, mthin, mthout, filein, fileout, projin, projout) are formally defined in Section III-B.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper clearly states two contributions: an empirical study of identifier ingredient prevalence and impact (RQ1-2), and the ScanFix system combining a scanner and repair model (RQ3-4).",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section II provides detailed comparison to SequenceR, RewardRepair, FitRepair, RAP-Gen, SelfAPR, and others, explicitly explaining how ScanFix differs (neural scanner vs. static/lexical extraction, fine-tuned repair model).",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "The paper states 'Code and scripts used in this work are provided online[12]' with reference [12] pointing to a GitHub repository (https://github.com/giganticode/llm_ingredient_extraction).",
    122           "source": "opus"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The paper uses two publicly available datasets: TSSB-3M and Defects4J 2.0. Both are standard benchmarks accessible to other researchers. The replication package includes scripts for data processing.",
    128           "source": "opus"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The paper specifies model architectures (CodeT5-small 60M, StarEncoder 125M) and mentions tools like TreeSitter and Pygments, but does not provide a requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment.",
    134           "source": "opus"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "While a replication package URL is provided, the paper itself does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section.",
    140           "source": "opus"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "The paper reports 95% confidence interval bands on Figures 5, 6, 7, and 10. For example, 'Error bands indicate a 95% confidence interval' is stated for multiple figures.",
    148           "source": "opus"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are reported. The paper uses 'significant' informally (e.g., 'a significant increase in repair success') without formal testing. Comparative claims rely on point estimates and confidence interval overlap.",
    154           "source": "opus"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "The paper reports both absolute and relative improvements with baseline context. For example, 'absolute performance increase of 2.55% and a relative improvement of roughly 7%' and 'a relative improvement of 31.5% (abs. 5.9%).' Table II provides full numeric context.",
    160           "source": "opus"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Section VII-A (Datasets) explicitly justifies dataset size choices: 'Defects4J has only around 800 bugs... each individual bug would have a very large weight. In contrast, our evaluation set for TSSB is between 80 times (RQ3) and 250 times larger (RQ4), which ensures much more robust results.'",
    166           "source": "opus"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The main results in Table II are single-run point estimates with no standard deviation, variance, or multiple-run analysis. The 95% CI bands in figures show variation across bugs, not across experimental runs with different seeds.",
    172           "source": "opus"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The paper includes six baselines: perfect ingredients, perfect file-level ingredients, perfect recall/low precision, naive ingredients, no ingredients, and large context. For Defects4J RQ2, results from 15 published APR/NPR tools are compared.",
    180           "source": "opus"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "For Defects4J analysis, baselines include recent tools like FitRepair, TARE, RAP-Gen (2022-2023). The large-context baseline provides a fair apples-to-apples comparison using the same model architecture.",
    186           "source": "opus"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "The paper systematically varies: scanner variants (All vs OOW), extraction thresholds (0.05, 0.5, 0.95), ingredient sources (none, naive, scanned, perfect, large context). Table II presents results for all combinations, showing each component's contribution.",
    192           "source": "opus"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "The scanner model is evaluated with precision, recall, and F1 (Section V-B). Repair is evaluated with exact match. The ingredient analysis uses cover, distance, and frequency metrics. Multiple perspectives on the same problem.",
    198           "source": "opus"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": false,
    203           "justification": "No human evaluation of repair outputs is performed. Evaluation is entirely automated using exact match against ground-truth fixes.",
    204           "source": "opus"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Table I explicitly documents disjoint training and evaluation splits: 510,851 training / 85,776 evaluation for RQ2; 274,776 / 85,776 for scanner (RQ3); 236,075 / 257,316 for repair (RQ4). The paper also ensures disjoint scanner and repair training sets to avoid data leakage.",
    210           "source": "opus"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down by fix ingredient count (Figures 5, 6), ingredient distance (Figures 7, 10), in-window vs out-of-window (Figure 6, Table II columns), ingredient frequency (Figure 7), and context level (Figure 3).",
    216           "source": "opus"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "The paper discusses where the approach fails: ScanFix is outperformed by large context (Section VI-B), performance degrades with multiple ingredients ('the complexity of arranging the ingredients dominates'), and the scanner has modest extraction performance.",
    222           "source": "opus"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The paper openly reports that ScanFix is outperformed by the large-context baseline (Section VI-B) and frames this as 'A case of Sutton's bitter lesson' (Section VII-B). The scanner's modest precision/recall is honestly reported.",
    228           "source": "opus"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The paper specifies 'CodeT5 small variant with 60M parameters' (ref [37]) for the repair model and 'BigCode's pre-trained StarEncoder model with roughly 125M parameters' (refs [38, 39]) for the scanner model. These are specific, identifiable model checkpoints.",
    236           "source": "opus"
    237         },
    238         "prompts_provided": {
    239           "applies": false,
    240           "answer": false,
    241           "justification": "The paper's main experiments use fine-tuned seq2seq and token-classification models, not prompting. The ChatGPT example in Section I is a motivational illustration, not a systematic experiment.",
    242           "source": "opus"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Hyperparameters are reported in detail: repair model uses learning rate 1×10⁻⁴, 4 epochs, batch size 12 accumulated over 2 steps, beam search with 5 beams (Section IV-B). Scanner uses learning rate 6×10⁻⁵, 4 epochs, batch size 30 accumulated over 3 steps (Section V-A).",
    248           "source": "opus"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding is used. The approach is a standard two-stage pipeline of fine-tuned models (scanner followed by repair model).",
    254           "source": "opus"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section III-A documents preprocessing in detail: TreeSitter-based multi-line string/comment splitting, commit-hash deduplication (reducing TSSB-3M from 3M+ to ~900K), filtering of encoding issues (3 Defects4J bugs omitted), and context window construction (18 lines before, 12 after).",
    260           "source": "opus"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Both underlying datasets (TSSB-3M and Defects4J 2.0) are publicly available. The replication package [12] provides the code and scripts used for data processing, enabling independent verification from the raw data sources.",
    268           "source": "opus"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section III-A describes data collection: TSSB-3M file-level context mined via GitHub API using commit hashes ('this step takes several weeks'), Defects4J uses 'relevant classes' bug properties. Deduplication and filtering criteria are specified.",
    274           "source": "opus"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants. Data sources are standard benchmarks (TSSB-3M, Defects4J).",
    280           "source": "opus"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The pipeline is documented: mining (GitHub API) → preprocessing (TreeSitter) → deduplication (commit hashes, reducing 3M+ to ~900K) → filtering (encoding issues) → ingredient extraction (Pygments) → dataset splitting (Table I shows exact split sizes). The deduplication step's quantitative impact is stated.",
    286           "source": "opus"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "The paper does not state the training data cutoff dates for the pre-trained CodeT5 or StarEncoder models. Without cutoff dates, it is impossible to assess whether TSSB-3M or Defects4J code appeared in pre-training data.",
    294           "source": "opus"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper mentions memorization concerns in Section VII-A ('Previous work expressed concerns of memorization issues in APR') and uses a small model to minimize this, but does not analyze whether specific test examples overlap with pre-training data for CodeT5 or StarEncoder.",
    300           "source": "opus"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "TSSB-3M contains bugs from public GitHub repositories that could be in CodeT5's and StarEncoder's pre-training data (both trained on open-source code). The paper does not address this contamination risk beyond noting memorization concerns and using a smaller model.",
    306           "source": "opus"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in this study. All experiments use automated bug datasets and model evaluations.",
    314           "source": "opus"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants in this study.",
    320           "source": "opus"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in this study.",
    326           "source": "opus"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in this study.",
    332           "source": "opus"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in this study.",
    338           "source": "opus"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in this study.",
    344           "source": "opus"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in this study.",
    350           "source": "opus"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost, latency, or API cost is reported. The paper mentions VRAM constraints ('fully exhausting our VRAM budget') but does not quantify inference time or per-example cost.",
    358           "source": "opus"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The paper mentions exceeding computational budget as a constraint ('would have exceeded our computational budget') but does not state the actual budget: no GPU hours, hardware specifications, or training time are reported.",
    364           "source": "opus"
    365         }
    366       },
    367       "experimental_rigor": {
    368         "seed_sensitivity_reported": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single training runs.",
    372           "source": "opus"
    373         },
    374         "number_of_runs_stated": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "The number of training runs is not stated. The paper reports beam search with 5 beams for candidate generation but does not state how many training runs produced the reported results.",
    378           "source": "opus"
    379         },
    380         "hyperparameter_search_budget": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "Specific hyperparameters are reported but no search budget is disclosed. There is no indication of how the learning rates, batch sizes, or number of epochs were selected.",
    384           "source": "opus"
    385         },
    386         "best_config_selection_justified": {
    387           "applies": true,
    388           "answer": true,
    389           "justification": "The paper transparently reports results for all scanner variants (All, OOW) at all thresholds (0.05, 0.5, 0.95) and all baselines in Table II. No cherry-picking — all configurations are shown.",
    390           "source": "opus"
    391         },
    392         "multiple_comparison_correction": {
    393           "applies": false,
    394           "answer": false,
    395           "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable.",
    396           "source": "opus"
    397         },
    398         "self_comparison_bias_addressed": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "The baselines (no ingredients, naive ingredients, large context) are all implemented by the authors. For Defects4J RQ2, published results from other tools are used. The paper does not acknowledge the bias of evaluating their own system against their own baseline implementations.",
    402           "source": "opus"
    403         },
    404         "compute_budget_vs_performance": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "The large-context model uses 5× as many input tokens (5120 vs 1024) and the paper discusses quadratic attention cost, but performance is not formally reported as a function of compute budget. No FLOPs or training time comparisons are provided.",
    408           "source": "opus"
    409         },
    410         "benchmark_construct_validity": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "The paper acknowledges exact match as a proxy ('we resort to exact match to assess whether a generated patch is correct or not') but does not discuss whether TSSB-3M's single-statement Python bugs actually measure general program repair capability or whether exact match is a valid construct for correctness.",
    414           "source": "opus"
    415         },
    416         "scaffold_confound_addressed": {
    417           "applies": false,
    418           "answer": false,
    419           "justification": "No agentic scaffolding is involved. The models are fine-tuned seq2seq and token-classification models, not scaffold-dependent agents.",
    420           "source": "opus"
    421         }
    422       },
    423       "data_leakage": {
    424         "temporal_leakage_addressed": {
    425           "applies": true,
    426           "answer": false,
    427           "justification": "No discussion of whether CodeT5 or StarEncoder pre-training data temporally overlaps with TSSB-3M or Defects4J bug data. The temporal relationship between pre-training data collection and benchmark creation is not addressed.",
    428           "source": "opus"
    429         },
    430         "feature_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the evaluation setup leaks information. For instance, the ground-truth fix is used to determine positive labels for the scanner training, but potential feature leakage through the input structure is not discussed.",
    434           "source": "opus"
    435         },
    436         "non_independence_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "Commit-hash deduplication removes exact duplicates, but the paper does not discuss whether train and test examples may share repositories, authors, or structurally similar code patterns that could inflate performance.",
    440           "source": "opus"
    441         },
    442         "leakage_detection_method": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "Commit-hash deduplication and disjoint train/test splits (Table I) are good practices but are not leakage detection methods in the formal sense (no canary strings, membership inference, n-gram overlap analysis, or temporal splits).",
    446           "source": "opus"
    447         }
    448       }
    449     }
    450   },
    451   "claims": [
    452     {
    453       "claim": "Identifier ingredients are prevalent: 85% of Defects4J fixes and 44% of TSSB-3M fixes require identifier ingredients.",
    454       "evidence": "Section III-C: 'in Defects4J only 15% of bugs involve no identifier ingredients' and 'in the former, 56% of the fixes do not require any identifier ingredient.'",
    455       "supported": "strong"
    456     },
    457     {
    458       "claim": "39-51% of fix identifier ingredients fall outside a typical 30-line context window.",
    459       "evidence": "Section III-C ingredient cover analysis: 'the input window's local context 61% and 49%... of ingredients for Defects4J and TSSB-3M, respectively.'",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "ScanFix augmentation yields relative improvements of up to 31% over no-ingredient baselines for out-of-window bugs.",
    464       "evidence": "Table II: OOW scanner at t=0.05 achieves 24.56% vs 18.68% no-ingredients for winout bugs, a 31.5% relative improvement.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "A large-context baseline (5120 tokens) outperforms ScanFix despite requiring no domain-specific engineering.",
    469       "evidence": "Table II: Large Context achieves 27.60% for winout bugs vs 24.56% for best ScanFix variant; 47.8% relative improvement over no-ingredients.",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Perfect ingredient augmentation achieves 65.23% exact match for winout bugs, far exceeding all other conditions.",
    474       "evidence": "Table II: 'Perfect Ingrs.' achieves 65.23% for bugs with winout ingredients vs 27.60% for large context baseline.",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Repair success decreases as fix ingredient count increases and as ingredient distance from the bug location grows.",
    479       "evidence": "Figures 5, 6, and 7 consistently show downward repair success trends across ingredient count and distance, confirmed across multiple APR tools and both datasets.",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "The scanner model achieves F1 of approximately 27% for fixall ingredients, with precision-recall tradeoffs controlled by threshold.",
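    484       "evidence": "Section V-B: At t=0.5, recall=47.66% and precision=7.05%; the abstract reports an F1 of 27%. Note that the t=0.5 figures quoted here imply an F1 of only about 12%, so the 27% presumably refers to a different threshold or scanner variant (verify against the paper).",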
    485       "supported": "moderate"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "observational"
    491   ],
    492   "key_findings": "Identifier ingredients (variable/method/class names) are required in 44-85% of bug fixes, and 39-51% of them fall outside typical neural repair model context windows, causing significant repair failures. A dedicated StarEncoder-based scanner model can extract these ingredients from file-level context with modest F1 (~27%), and augmenting the repair model's input with the extracted ingredients improves exact match by 7-31% relative over no-ingredient baselines. However, simply expanding the context window to 5120 tokens yields an even larger improvement and outperforms ScanFix, suggesting that domain-specific ingredient engineering may be rendered unnecessary by large-context models—a concrete instantiation of Sutton's bitter lesson. The gap between perfect-ingredient augmentation (65%) and ScanFix (25%) for out-of-window bugs shows how much headroom remains if extraction quality could be improved.",
    493   "red_flags": [
    494     {
    495       "flag": "Exact match proxy for repair",
    496       "detail": "TSSB-3M bugs are not executable, so exact match is used as the sole success metric. Exact match systematically underestimates real repair success and overestimates failure—a patch that is semantically equivalent but syntactically different counts as wrong. This affects all quantitative claims about improvement magnitude."
    497     },
    498     {
    499       "flag": "Single model architecture",
    500       "detail": "Both scanner and repair models use one architecture each (StarEncoder and CodeT5-small). The authors acknowledge this limitation, citing computational budget constraints. Results may not generalize to other model families."
    501     },
    502     {
    503       "flag": "No significance tests on main results",
    504       "detail": "Table II reports point estimates without significance tests. Given sample sizes in the hundreds of thousands, even small improvements may be statistically significant, but without tests it remains uncertain whether the smaller differences (e.g., 'All' vs 'OOW' variants) are meaningful."
    505     },
    506     {
    507       "flag": "File-level scanning only for TSSB-3M",
    508       "detail": "The most valuable case—project-level scanning—cannot be evaluated on TSSB-3M due to scale. The paper acknowledges that 91-93% of fix ingredients exist at project level, but ScanFix is only evaluated on the file level where coverage is 69-77%. This artificially caps what the scanner can find."
    509     },
    510     {
    511       "flag": "Pre-training contamination unresolved",
    512       "detail": "The authors choose CodeT5-small to 'minimize' memorization issues but do not state pre-training cutoffs relative to TSSB-3M's GitHub data. The concern is acknowledged but not resolved."
    513     }
    514   ],
    515   "cited_papers": [
    516     {
    517       "title": "The plastic surgery hypothesis",
    518       "relevance": "Foundational work establishing that fix ingredients exist in project code; directly motivates the paper's research question about identifier ingredients in neural repair."
    519     },
    520     {
    521       "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models (FitRepair)",
    522       "relevance": "Most directly related prior work on using identifier ingredients with LLMs; ScanFix explicitly extends and compares against this approach."
    523     },
    524     {
    525       "title": "Out of context: How important is local context in neural program repair?",
    526       "relevance": "Direct predecessor by the same authors establishing context window size effects; ScanFix is the proposed solution to the problem identified there."
    527     },
    528     {
    529       "title": "TSSB-3M: Mining single statement bugs at massive scale",
    530       "relevance": "Primary dataset used for RQ1-4; the scale enables robust statistical analysis that Defects4J alone cannot provide."
    531     },
    532     {
    533       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    534       "relevance": "Standard APR benchmark used for cross-tool analysis (RQ2) and the only dataset with full project-level context."
    535     },
    536     {
    537       "title": "RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic Program Repair",
    538       "relevance": "Most related RAG-based alternative approach; the paper contrasts ScanFix's space-efficient identifier extraction with RAG's inclusion of full code fragments."
    539     },
    540     {
    541       "title": "Where were the repair ingredients for Defects4j bugs?",
    542       "relevance": "Prior empirical analysis of ingredient origin in Defects4J; this paper extends that work to NPR context and identifier-specific ingredients."
    543     },
    544     {
    545       "title": "Can OpenAI's codex fix bugs? an evaluation on QuixBugs",
    546       "relevance": "First-author prior work establishing memorization concerns in APR evaluation, directly cited to justify model size choice in this paper."
    547     },
    548     {
    549       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    550       "relevance": "Cited to contextualize whether large-context baselines can effectively use all provided context, raising doubt about whether large windows fully solve the ingredient problem."
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 1,
    556       "justification": "Research contribution to APR methodology; not directly usable as a practitioner tool, though the replication code is available."
    557     },
    558     "surprise_contrarian": {
    559       "score": 1,
    560       "justification": "The 'bitter lesson' framing—showing that simply increasing context window outperforms a more sophisticated approach—is mildly contrarian within the APR community."
    561     },
    562     "fear_safety": {
    563       "score": 0,
    564       "justification": "No AI safety, security, or risk concerns raised."
    565     },
    566     "drama_conflict": {
    567       "score": 0,
    568       "justification": "No controversy or conflict angle."
    569     },
    570     "demo_ability": {
    571       "score": 1,
    572       "justification": "Code released on GitHub but requires training infrastructure and dataset preparation; not easily demoed."
    573     },
    574     "brand_recognition": {
    575       "score": 0,
    576       "justification": "From Free University of Bozen-Bolzano and University of Bordeaux; not well-known AI labs."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [
    581       {
    582         "hn_id": "30665928",
    583         "title": "PERCEPT: Online change-point detection using topological data analysis",
    584         "points": 8,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=30665928"
    587       },
    588       {
    589         "hn_id": "42999205",
    590         "title": "Flip Graphs with Symmetry and New Matrix Multiplication Schemes",
    591         "points": 3,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=42999205"
    594       },
    595       {
    596         "hn_id": "44256016",
    597         "title": "Can Theoretical Physics Research Benefit from Language Agents?",
    598         "points": 1,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=44256016"
    601       }
    602     ],
    603     "top_points": 8,
    604     "total_points": 12,
    605     "total_comments": 0
    606   }
    607 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs