scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31605B)
      1 {
      2   "paper": {
      3     "title": "Extracting Fix Ingredients using Language Models",
      4     "authors": [
      5       "Julian Aron Prenner",
      6       "Romain Robbes"
      7     ],
      8     "year": 2025,
      9     "venue": "2025 IEEE/ACM Second International Conference on AI Foundation Models and Software Engineering (Forge)",
     10     "arxiv_id": "2503.04214",
     11     "doi": "10.1109/Forge66646.2025.00028"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval", "observational"],
     16   "key_findings": "Identifier ingredients are prevalent in program repair (44% of TSSB-3M, 85% of Defects4J fixes) and 39-51% fall outside a typical model input window. ScanFix, combining a scanner model with a repair model, yields 7-31% relative improvement over no-ingredient baselines for out-of-window bugs. However, simply expanding the input window to 5120 tokens outperforms ScanFix, illustrating Sutton's bitter lesson. Oracle ingredients show far higher performance (51.99%), indicating that better extraction techniques could still be valuable.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper states 'Code and scripts used in this work are provided online[12]' with reference [12] pointing to a GitHub repository (https://github.com/giganticode/llm_ingredient_extraction)."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses two publicly available datasets: TSSB-3M and Defects4J 2.0. Both are standard benchmarks accessible to other researchers. The replication package includes scripts for data processing."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper specifies model architectures (CodeT5-small 60M, StarEncoder 125M) and mentions tools like TreeSitter and Pygments, but does not provide a requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While a replication package URL is provided, the paper itself does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper reports 95% confidence interval bands on Figures 5, 6, 7, and 10. For example, 'Error bands indicate a 95% confidence interval' is stated for multiple figures."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are reported. The paper uses 'significant' informally (e.g., 'a significant increase in repair success') without formal testing. Comparative claims rely on point estimates and confidence interval overlap."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports both absolute and relative improvements with baseline context. For example, 'absolute performance increase of 2.55% and a relative improvement of roughly 7%' and 'a relative improvement of 31.5% (abs. 5.9%).' Table II provides full numeric context."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section VII-A (Datasets) explicitly justifies dataset size choices: 'Defects4J has only around 800 bugs... each individual bug would have a very large weight. In contrast, our evaluation set for TSSB is between 80 times (RQ3) and 250 times larger (RQ4), which ensures much more robust results.'"
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The main results in Table II are single-run point estimates with no standard deviation, variance, or multiple-run analysis. The 95% CI bands in figures show variation across bugs, not across experimental runs with different seeds."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper includes six baselines: perfect ingredients, perfect file-level ingredients, perfect recall/low precision, naive ingredients, no ingredients, and large context. For Defects4J RQ2, results from 15 published APR/NPR tools are compared."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "For Defects4J analysis, baselines include recent tools like FitRepair, TARE, RAP-Gen (2022-2023). The large-context baseline provides a fair apples-to-apples comparison using the same model architecture."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper systematically varies: scanner variants (All vs OOW), extraction thresholds (0.05, 0.5, 0.95), ingredient sources (none, naive, scanned, perfect, large context). Table II presents results for all combinations, showing each component's contribution."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The scanner model is evaluated with precision, recall, and F1 (Section V-B). Repair is evaluated with exact match. The ingredient analysis uses cover, distance, and frequency metrics. Together, these metrics provide multiple perspectives on the same problem."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation of repair outputs is performed. Evaluation is entirely automated using exact match against ground-truth fixes."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Table I explicitly documents disjoint training and evaluation splits: 510,851 training / 85,776 evaluation for RQ2; 274,776 / 85,776 for scanner (RQ3); 236,075 / 257,316 for repair (RQ4). The paper also ensures disjoint scanner and repair training sets to avoid data leakage."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by fix ingredient count (Figures 5, 6), ingredient distance (Figures 7, 10), in-window vs out-of-window (Figure 6, Table II columns), ingredient frequency (Figure 7), and context level (Figure 3)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper discusses where the approach fails: ScanFix is outperformed by large context (Section VI-B), performance degrades with multiple ingredients ('the complexity of arranging the ingredients dominates'), and the scanner has modest extraction performance."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper openly reports that ScanFix is outperformed by the large-context baseline (Section VI-B) and frames this as 'A case of Sutton's bitter lesson' (Section VII-B). The scanner's modest precision/recall is honestly reported."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims are well-supported: 'relative improvements of up to 31%' matches Table II (31.5% for OOW winout); 'outperformed by a model with a large input window' matches Table II (39.28% vs 37.27%); 'augmenting with ground-truth fix leads to even better results' matches Table II (51.99%)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims ('augmenting repair model input with ingredients yields improvements') are supported by controlled experimental comparisons: same model architecture, same training data, varying only the ingredient augmentation. The ablation-style design (adding/removing ingredients) is adequate for these claims."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract specifies 'a dataset of 85,000 Python bugs.' Section VII-A explicitly bounds scope: single model architecture, file-level scanning only for TSSB-3M, identifier ingredients only (not literals or compound ingredients), lexical analysis only."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section VII-B discusses Sutton's bitter lesson as an alternative framing. The paper considers that large context windows might solve the problem entirely, that memorization in models could explain some performance, and that ingredient frequency in training data affects success (Section IV-D)."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper explicitly acknowledges using exact match as a proxy: 'bugs in TSSB-3M are not executable (and lack tests) we resort to exact match to assess whether a generated patch is correct or not' (Section IV-B). This clearly distinguishes the measurement proxy from actual correctness."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper specifies 'CodeT5 small variant with 60M parameters' (ref [37]) for the repair model and 'BigCode's pre-trained StarEncoder model with roughly 125M parameters' (refs [38, 39]) for the scanner model. These are specific, identifiable model checkpoints."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "The paper's main experiments use fine-tuned seq2seq and token-classification models, not prompting. The ChatGPT example in Section I is a motivational illustration, not a systematic experiment."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Hyperparameters are reported in detail: repair model uses learning rate 1×10⁻⁴, 4 epochs, batch size 12 accumulated over 2 steps, beam search with 5 beams (Section IV-B). Scanner uses learning rate 6×10⁻⁵, 4 epochs, batch size 30 accumulated over 3 steps (Section V-A)."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. The approach is a standard two-stage pipeline of fine-tuned models (scanner followed by repair model)."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section III-A documents preprocessing in detail: TreeSitter-based multi-line string/comment splitting, commit-hash deduplication (reducing TSSB-3M from 3M+ to ~900K), filtering of encoding issues (3 Defects4J bugs omitted), and context window construction (18 lines before, 12 after)."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section VII-A 'Limitations' is a substantial dedicated section covering seven specific limitations: potential software bugs, non-identifier ingredients, lexical analysis limitations, single model architecture, LLM impact, dataset constraints, and file-level scanning limitations."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The limitations are specific to this study: 'We use a single model architecture' with explanation that 'repeating experiments using a second or even third model architecture would have exceeded our computational budget'; 'we were not able to use Defects4J... due to its small size'; lexical analysis 'ignores an identifier's syntactic function.'"
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section VII-A explicitly states what was NOT tested: non-identifier ingredients ('Literals, compound ingredients... are not studied'), project-level scanning for TSSB-3M ('mining for project-level identifiers impractical'), LLM evaluation ('we decided not to focus on LLMs'), and scope/accessibility analysis."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Both underlying datasets (TSSB-3M and Defects4J 2.0) are publicly available. The replication package [12] provides the code and scripts used for data processing, enabling independent verification from the raw data sources."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section III-A describes data collection: TSSB-3M file-level context mined via GitHub API using commit hashes ('this step takes several weeks'), Defects4J uses 'relevant classes' bug properties. Deduplication and filtering criteria are specified."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data sources are standard benchmarks (TSSB-3M, Defects4J)."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The pipeline is documented: mining (GitHub API) → preprocessing (TreeSitter) → deduplication (commit hashes, reducing 3M+ to ~900K) → filtering (encoding issues) → ingredient extraction (Pygments) → dataset splitting (Table I shows exact split sizes). The deduplication step's quantitative impact is stated."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Acknowledgments section states: 'This study has received financial support from the French State in the framework of the Investments for the Future programme IdEx université de Bordeaux.'"
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: Julian Aron Prenner at Free University of Bozen-Bolzano, Romain Robbes at Univ. Bordeaux/CNRS/Bordeaux INP/LaBRI. No commercial product is being evaluated, so no product-affiliation conflict exists."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The French State's IdEx program is a general academic funding source with no financial stake in whether ScanFix outperforms baselines."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not state the training data cutoff dates for the pre-trained CodeT5 or StarEncoder models. Without cutoff dates, it is impossible to assess whether TSSB-3M or Defects4J code appeared in pre-training data."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper mentions memorization concerns in Section VII-A ('Previous work expressed concerns of memorization issues in APR') and uses a small model to minimize this, but does not analyze whether specific test examples overlap with pre-training data for CodeT5 or StarEncoder."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "TSSB-3M contains bugs from public GitHub repositories that could be in CodeT5's and StarEncoder's pre-training data (both trained on open-source code). The paper does not address this contamination risk beyond noting memorization concerns and using a smaller model."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study. All experiments use automated bug datasets and model evaluations."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No inference cost, latency, or API cost is reported. The paper mentions VRAM constraints ('fully exhausting our VRAM budget') but does not quantify inference time or per-example cost."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "The paper mentions exceeding computational budget as a constraint ('would have exceeded our computational budget') but does not state the actual budget: no GPU hours, hardware specifications, or training time are reported."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single training runs."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of training runs is not stated. The paper reports beam search with 5 beams for candidate generation but does not state how many training runs produced the reported results."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Specific hyperparameters are reported but no search budget is disclosed. There is no indication of how the learning rates, batch sizes, or number of epochs were selected."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "The paper transparently reports results for all scanner variants (All, OOW) at all thresholds (0.05, 0.5, 0.95) and all baselines in Table II. No cherry-picking — all configurations are shown."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The baselines (no ingredients, naive ingredients, large context) are all implemented by the authors. For Defects4J RQ2, published results from other tools are used. The paper does not acknowledge the bias of evaluating their own system against their own baseline implementations."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The large-context model uses 5× as many input tokens (5120 vs 1024) and the paper discusses quadratic attention cost, but performance is not formally reported as a function of compute budget. No FLOPs or training time comparisons are provided."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper acknowledges exact match as a proxy ('we resort to exact match to assess whether a generated patch is correct or not') but does not discuss whether TSSB-3M's single-statement Python bugs actually measure general program repair capability or whether exact match is a valid construct for correctness."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No agentic scaffolding is involved. The models are fine-tuned seq2seq and token-classification models, not scaffold-dependent agents."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether CodeT5 or StarEncoder pre-training data temporally overlaps with TSSB-3M or Defects4J bug data. The temporal relationship between pre-training data collection and benchmark creation is not addressed."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the evaluation setup leaks information. For instance, the ground-truth fix is used to determine positive labels for the scanner training, but potential feature leakage through the input structure is not discussed."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Commit-hash deduplication removes exact duplicates, but the paper does not discuss whether train and test examples may share repositories, authors, or structurally similar code patterns that could inflate performance."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Commit-hash deduplication and disjoint train/test splits (Table I) are good practices but are not leakage detection methods in the formal sense (no canary strings, membership inference, n-gram overlap analysis, or temporal splits)."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "Identifier ingredients are prevalent: 44% of TSSB-3M fixes and 85% of Defects4J fixes require at least one identifier ingredient.",
    368       "evidence": "Section III-C, analysis of ~900K TSSB-3M bugs and 832 Defects4J bugs. Figure 3 shows cover distributions across context levels.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "39-51% of identifier ingredients fall outside a typical 30-line input window context.",
    373       "evidence": "Section III-C, Figure 3. Window cover is 61% for Defects4J and 49% for TSSB-3M, meaning 39% and 51% respectively are uncovered.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Fix ingredient count and distance from bug location significantly affect repair success across multiple APR/NPR tools.",
    378       "evidence": "Section IV-C/D. Figure 5 shows downward trend across 15 tools on Defects4J. Figures 6 and 7 show similar patterns on TSSB-3M. In-window ingredients yield higher success than out-of-window.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "The scanner model extracts fix ingredients with 17-48% recall and 7-45% precision depending on threshold.",
    383       "evidence": "Section V-B, Figure 9. All variant at t=0.05: recall 47.66%, precision 7.05%. At t=0.95: recall 17.01%, precision 45.09%.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "ScanFix yields 7-31% relative improvement over no-ingredient baseline, especially for out-of-window ingredients.",
    388       "evidence": "Section VI-B, Table II. All bugs: 37.27% vs 34.72% (7.3% relative). Bugs with winout ingredients: 24.56% vs 18.68% (31.5% relative, OOW variant).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "A large-context baseline (5120 tokens) outperforms all ScanFix variants without any ingredient augmentation.",
    393       "evidence": "Table II: large context achieves 39.28% on all bugs vs best ScanFix 37.27%. Section VII-B frames this as Sutton's bitter lesson.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Perfect (oracle) ingredients lead to 51.99% exact match, far exceeding all other approaches, showing potential for improved extraction.",
    398       "evidence": "Table II: perfect ingredients 51.99% vs large context 39.28% vs best ScanFix 37.27%. Also, 'perfect recall, low precision' baseline at 40.29% shows repair model tolerates noisy ingredients.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Single model architecture",
    405       "detail": "All experiments use a single model family (CodeT5-small for repair, StarEncoder for scanner). The authors acknowledge this limitation and cite computational budget constraints, but results may not generalize to other architectures."
    406     },
    407     {
    408       "flag": "Exact match evaluation only",
    409       "detail": "TSSB-3M evaluation relies solely on exact match with ground-truth fixes. Semantically correct but textually different patches are counted as failures, potentially underestimating true repair rates for all approaches equally."
    410     },
    411     {
    412       "flag": "No formal significance testing",
    413       "detail": "Comparative claims between ScanFix variants and baselines are made based on point estimates in Table II without statistical significance tests. With single-run results, it is unclear whether observed differences are statistically meaningful."
    414     },
    415     {
    416       "flag": "No multi-seed evaluation",
    417       "detail": "Results appear to come from single training runs without seed variation. Henderson et al. (2018) showed that results can vary substantially across seeds; this concern is widely acknowledged in the field but is not addressed here."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "CoCoNuT: Combining context-aware neural translation models using ensemble for program repair",
    423       "authors": ["T. Lutellier", "H. V. Pham", "L. Pang", "Y. Li", "M. Wei", "L. Tan"],
    424       "year": 2020,
    425       "relevance": "Neural program repair approach using ensemble models, one of the NPR tools benchmarked in this study."
    426     },
    427     {
    428       "title": "Can OpenAI's Codex Fix Bugs? An Evaluation on QuixBugs",
    429       "authors": ["J. A. Prenner", "H. Babii", "R. Robbes"],
    430       "year": 2022,
    431       "relevance": "Early evaluation of LLMs (Codex) for automated program repair, raises memorization concerns relevant to contamination."
    432     },
    433     {
    434       "title": "Impact of Code Language Models on Automated Program Repair",
    435       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    436       "year": 2023,
    437       "relevance": "Comprehensive study of how code language models affect automated program repair performance."
    438     },
    439     {
    440       "title": "The plastic surgery hypothesis",
    441       "authors": ["E. T. Barr", "Y. Brun", "P. Devanbu", "M. Harman", "F. Sarro"],
    442       "year": 2014,
    443       "relevance": "Foundational hypothesis that correct fixes can be crafted from existing code elements, central premise of this work."
    444     },
    445     {
    446       "title": "Out of context: How important is local context in neural program repair?",
    447       "authors": ["J. A. Prenner", "R. Robbes"],
    448       "year": 2023,
    449       "arxiv_id": "2312.04986",
    450       "relevance": "Directly relevant predecessor studying how context window size affects neural program repair, informs this paper's context design."
    451     },
    452     {
    453       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    454       "authors": ["Z. Chen", "S. Kommrusch", "M. Tufano", "L.-N. Pouchet", "D. Poshyvanyk", "M. Monperrus"],
    455       "year": 2021,
    456       "relevance": "Early neural program repair using LSTM with input augmentation (field variables, method signatures), directly related to ingredient augmentation approach."
    457     },
    458     {
    459       "title": "RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic Program Repair",
    460       "authors": ["W. Wang", "Y. Wang", "S. Joty", "S. C. H. Hoi"],
    461       "year": 2023,
    462       "relevance": "Retrieval-augmented approach to program repair, alternative to ingredient scanning for incorporating project-level context."
    463     },
    464     {
    465       "title": "AutoCodeRover: Autonomous Program Improvement",
    466       "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"],
    467       "year": 2024,
    468       "relevance": "Autonomous agent for program repair using retrieval-augmented generation, relevant to agentic AI repair approaches."
    469     },
    470     {
    471       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    472       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    473       "year": 2024,
    474       "relevance": "LLM-based autonomous agent for program repair, represents the agentic approach to automated repair."
    475     },
    476     {
    477       "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models",
    478       "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"],
    479       "year": 2023,
    480       "relevance": "Uses Levenshtein-based identifier extraction and project-specific fine-tuning (FitRepair) for LLM-based repair, directly comparable approach."
    481     },
    482     {
    483       "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning",
    484       "authors": ["C. S. Xia", "L. Zhang"],
    485       "year": 2022,
    486       "relevance": "AlphaRepair: zero-shot LLM-based program repair, one of the NPR tools analyzed in the Defects4J evaluation."
    487     },
    488     {
    489       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    490       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. H. Hoi"],
    491       "year": 2021,
    492       "relevance": "Foundation model used for the repair model in this work; identifier-aware pre-training is directly relevant to the ingredient extraction task."
    493     },
    494     {
    495       "title": "Code Llama: Open Foundation Models for Code",
    496       "authors": ["B. Rozière", "J. Gehring", "F. Gloeckle"],
    497       "year": 2024,
    498       "relevance": "Large-context code LLM (16K+ tokens) representing the trend of expanding input windows that may make ingredient extraction obsolete."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 1,
    504       "justification": "Research contribution to APR methodology; not directly usable as a practitioner tool, though the replication code is available."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The 'bitter lesson' framing—showing that simply increasing the context window outperforms a more sophisticated approach—is mildly contrarian within the APR community."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No AI safety, security, or risk concerns raised."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict angle."
    517     },
    518     "demo_ability": {
    519       "score": 1,
    520       "justification": "Code released on GitHub but requires training infrastructure and dataset preparation; not easily demoed."
    521     },
    522     "brand_recognition": {
    523       "score": 0,
    524       "justification": "From Free University of Bozen-Bolzano and University of Bordeaux; not well-known AI labs."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs