scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29569B)
      1 {
      2   "paper": {
      3     "title": "A Deep Dive into Large Language Models for Automated Bug Localization and Repair",
      4     "authors": [
      5       "Soneya Binta Hossain",
      6       "Nan Jiang",
      7       "Qiang Zhou",
      8       "Xiaopeng Li",
      9       "Wen-Hao Chiang",
     10       "Yingjun Lyu",
     11       "Hoan Nguyen",
     12       "Omer Tripp"
     13     ],
     14     "year": 2024,
     15     "venue": "Proc. ACM Softw. Eng.",
     16     "arxiv_id": "2404.11595",
     17     "doi": "10.1145/3660773"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Toggle, a token-granulated bug localization and repair framework, achieves new state-of-the-art on the CodeXGLUE code refinement benchmark (25.07% exact match on Tufano Small) and outperforms existing methods on Defects4J at Top-10/30/50/100 using only a 110M-parameter model. Prompt design has a dramatic effect on bug fixing accuracy—from 16% to 57% for the same model—demonstrating that token-level localization injects strong inductive bias. An adjustment module that bridges tokenizer discrepancies between localization and fixing models consistently improves accuracy across all tested configurations.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. Toggle's implementation is not released."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper uses publicly available datasets: CodeXGLUE (Tufano Small/Medium), CodeReviewer, Defects4J, and the GitHub dataset. All are referenced with citations and are accessible."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions GPU instances and Hugging Face models but does not specify library versions or environment details."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology in detail but does not include scripts, commands, or a README for reproducing experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results in Tables 1–8 are reported as single point estimates (e.g., '25.07%') with no confidence intervals, error bars, or ± notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Claims like 'PolyCoder-2.7B reaching the highest accuracy of 25.07%' outperforming the baseline of 23.86% are made by comparing raw numbers with no statistical significance tests."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are reported with baseline context in all tables (e.g., Table 1 shows baseline accuracies alongside Toggle results, Table 4 shows improvement from Prompt 1 at 16.07% to Prompt 4 at 56.98%), allowing the reader to assess the magnitude of improvements."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for sample sizes. The number of patches per bug (210 for Defects4J) is noted as 'relatively small' compared to prior work but not formally justified."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Section 5 mentions that the authors 'repeated each experiment several times to confirm consistency' but no standard deviations, variance, or spread measures appear in any results table."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple baselines are included: NSEdit, CoText for CodeXGLUE; CodeT5, NSEdit for CodeReviewer; CURE, RewardRepair, Recoder, KNOD, Tare, AlphaRepair, TENURE for Defects4J (Tables 1, 3)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include methods from 2021–2023 (KNOD, Tare, AlphaRepair, TENURE from ICSE 2023), which are contemporary for a 2024 publication."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "RQ3 (four prompts compared), RQ4 (with/without contextual information), and RQ5 (with/without adjustment module) constitute comprehensive ablation studies isolating each component's contribution."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Exact match accuracy is used for CodeXGLUE/CodeReviewer, test-case validation for Defects4J, and Top-K metrics (Top-10, 30, 50, 100) for patch ranking."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "For Defects4J, Section 3.2.1 states: 'we manually executed all corrective patches to ensure they indeed pass all tests and effectively fix the bugs,' constituting manual verification of system outputs."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 3.1.1 states datasets are split 80/10/10 for train/validation/test. Defects4J (RQ2) serves as a completely unseen test set not used in fine-tuning."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down by dataset (Tufano Small, Medium, CodeReviewer ±comments), by model backbone (6 LLMs), and by prompt type (4 prompts) across Tables 1, 4, 6, 7, 8."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Figure 7 shows a concrete failure case where correct localization leads to incorrect fix, and Figure 2 illustrates tokenizer discrepancy failures. Section 3.5.2 discusses why the bug fixing model prematurely closes methods."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "CodeGPT underperforms other models due to Java-only pretraining (Section 3.1.3). Prompt 4 underperforms Prompt 3 on Tufano datasets despite outperforming with ground-truth locations (Table 8 vs Table 4). Location prediction accuracy drops significantly for dual-token prediction (Table 7)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims SOTA on CodeXGLUE (supported by Table 1, PolyCoder-2.7B at 25.07% vs NSEdit 23.86%) and better/comparable performance on Defects4J (supported by Table 3, Top-10 through Top-100 rankings)."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims about prompt design improving accuracy are supported by controlled experiments (RQ3) where only the prompt changes while model, data, and ground-truth locations remain fixed. The adjustment module ablation (RQ5) is similarly controlled."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Large Language Models' broadly but experiments use only 6 models from 110M to 2.7B parameters—far smaller than current frontier models. Datasets are predominantly Java. These scope limitations are not acknowledged in the title or abstract framing."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Section 5 (Threats to Validity) offers generic disclaimers: 'there's a possibility that our results may not generalize across other datasets' and 'it is conceivable that [scripts] might contain bugs.' No substantive alternative explanations for the observed results are discussed (e.g., whether improvements are due to reduced sequence length rather than inductive bias)."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures exact match accuracy and test-case pass rates, and frames results in terms of 'bug fixing accuracy'—which directly matches the measurements. No proxy gap exists between what is measured and what is claimed."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Specific model names with parameter counts are given: CodeGPT-110M (Java checkpoint), CodeParrot-110M (multi-language), CodeGen-350M/2B, PolyCoder-400M/2.7B, CodeT5-large (347M). These are identifiable on Hugging Face."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Figure 5 provides the complete structure of all four prompt formats with concrete examples showing how buggy code, shared prefix, shared suffix, and separators are arranged. The prompts are deterministically constructed from code, and the construction rules are fully specified."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No training hyperparameters are reported—no learning rate, batch size, number of epochs, optimizer, temperature for generation, or beam size. Only dataset splits (80/10/10) and number of generated patches (70/210) are mentioned."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "Toggle is a pipeline of fine-tuned models (localization → adjustment → fixing), not an agentic scaffold. No tool use, retry logic, or feedback mechanisms are involved."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 2.2 describes dataset characteristics and selection criteria. For Defects4J, 240 single-hunk bugs are selected. The GitHub dataset excludes Defects4J patches via AST comparison. Sections 2.3.1–2.3.3 detail how input prompts are constructed from buggy/fixed code pairs."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 'Threats to Validity' provides a dedicated subsection discussing generalizability, implementation correctness, and accuracy metrics."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Section 5 states generic threats: 'there's a possibility that our results may not generalize across other datasets' and 'it is conceivable that [tools] might contain bugs.' These are boilerplate disclaimers rather than threats specific to this study's methodology."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that findings may be specific to models in the 110M–2.7B range, to Java code, or to single-hunk bugs."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "While the benchmark datasets are public, the paper does not release model outputs, generated patches, predicted locations, or fine-tuned model weights. Independent verification of reported numbers is not possible."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.2 describes each dataset in detail: Tufano Small (58,350 samples) and Medium (65,465) from GitHub commits, CodeReviewer (183,881 multilingual samples), Defects4J (835 bugs from 17 projects, 240 single-hunk used), and the GitHub dataset (1,083,185 commits)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. All data comes from standard public benchmarks."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from bug localization to adjustment to bug fixing is described in detail (Sections 2.3.1–2.3.3). Figure 3 provides a visual overview. The adjustment model training pipeline is described step-by-step in Section 2.3.3 with concrete shift ranges (-3 to +3)."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding sources are mentioned despite 6 of 8 authors being affiliated with Amazon Web Services, suggesting corporate funding."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: Soneya Binta Hossain (University of Virginia), Nan Jiang (Purdue), and Qiang Zhou, Xiaopeng Li, Wen-Hao Chiang, Yingjun Lyu, Hoan Nguyen, Omer Tripp (all Amazon Web Services)."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed. With 6/8 authors from AWS, the work is implicitly corporate-funded. While the paper does not evaluate AWS products specifically, the lack of funding disclosure prevents assessment of independence."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial disclosure statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The pre-training data cutoff dates for CodeGPT, CodeParrot, CodeGen, PolyCoder, and CodeT5 are not stated. The paper does not mention when these models' training data was collected."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 2.2 states that, for the GitHub dataset, 'patches associated with the Defects4J project or those resembling any in Defects4J v1.2 or v2.0 were meticulously excluded, based on an AST comparative analysis.' Standard splits are used for other datasets."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "While Defects4J overlap with training data is addressed, no discussion of whether CodeXGLUE benchmark examples could appear in the pre-training data of the base models (CodeGPT, CodeParrot, etc.) is provided."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. All evaluation is on code benchmarks."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference cost, latency, or tokens consumed are reported. The number of generated patches (210 per bug) is noted as 'relatively small' but no wall-clock time or compute cost is given."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No GPU hours, training time, or total computational budget is stated despite fine-tuning 6 LLMs across 4 datasets with 4 prompt styles (96 experiments) plus adjustment model training."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds. All results appear to be single-run numbers despite Section 5 claiming the authors 'repeated each experiment several times'."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 5 vaguely states 'repeated each experiment several times to confirm consistency' but the exact number of runs is not stated and no variance is shown in results tables."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search budget is reported. Training hyperparameters are not even listed, let alone the search process for selecting them."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "All configurations are systematically reported: 6 models × 4 prompts in Table 4, all models on all datasets in Tables 1 and 8. CodeParrot-110M was selected for Defects4J based on 'superior performance over the similarly sized CodeGPT-110M and comparable performance to larger models' (Section 3.2.1)."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons, despite making dozens of pairwise comparisons across 6 models, 4 prompts, and 4 datasets."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors re-implement some baselines (NSEdit on CodeReviewer, Section 3.1.2) and compare their own system against these re-implementations without acknowledging or addressing author-evaluation bias."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Models range from 110M to 2.7B parameters (25× difference) but no performance-per-compute analysis is provided. Finding 1 notes larger models perform better but does not account for the additional compute required."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper uses exact match as the primary metric, noting only that it is 'commonly used' and 'a more fairer measure of accuracy than either BLEU or CodeBLEU scores' (Section 3.1.1). No deeper discussion of whether exact match captures real-world bug fixing utility."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No agentic scaffolding is involved. Toggle is a deterministic pipeline of fine-tuned models, not a scaffold-based system."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the base models' pre-training data temporally overlaps with the benchmark datasets. CodeXGLUE and Defects4J predate the models' training, creating potential temporal leakage."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage scenarios."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Section 2.2 states the GitHub dataset excluded patches 'associated with the Defects4J project or those resembling any in Defects4J v1.2 or v2.0' using AST comparative analysis, directly addressing train-test independence for the Defects4J evaluation."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "An AST comparative analysis was used to identify and exclude Defects4J-related patches from the GitHub training dataset (Section 2.2), constituting a concrete decontamination method."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Toggle achieves new state-of-the-art on CodeXGLUE code refinement benchmark, with PolyCoder-2.7B reaching 25.07% exact match on Tufano Small (vs 23.86% baseline) and 16.19% on Tufano Medium (vs 15.36% baseline).",
    374       "evidence": "Table 1, Section 3.1.3. Results shown for 6 LLMs across 4 datasets with baseline comparisons from CodeXGLUE leaderboard.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Toggle with CodeParrot-110M generalizes to unseen Defects4J bugs, fixing 41 bugs at Top-10 (vs 36 for Recoder), 58 at Top-30 (vs 51), 64 at Top-50 (vs 62), and 74 at Top-100 (vs 70).",
    379       "evidence": "Table 3, Section 3.2.2. Comparison against 7 existing APR methods using test-case validation on 240 single-hunk Defects4J bugs.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Prompt design dramatically impacts bug fixing accuracy: for CodeGPT-110M, accuracy improves from 16.07% (Prompt 1) to 56.98% (Prompt 4) with ground-truth locations.",
    384       "evidence": "Table 4, Section 3.3. Controlled experiment with 6 LLMs × 4 prompts on Tufano Small, using ground-truth locations to isolate prompt effect.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Additional contextual information (buggy line numbers, code review comments) significantly improves bug localization accuracy, e.g., starting token accuracy on Tufano Medium increases from 26.73% to 56.66% with buggy line numbers.",
    389       "evidence": "Table 5, Section 3.4. Comparison of location prediction with/without context across 3 datasets.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "The adjustment module consistently improves bug fixing accuracy across all 16 dataset-model combinations tested.",
    394       "evidence": "Table 6, Section 3.5. All 16 cells show improvement, though magnitudes range from 0.17pp to 2.02pp.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Larger LLMs yield better bug fixing accuracy after fine-tuning, with larger models consistently outperforming smaller counterparts within the same family.",
    399       "evidence": "Table 1, Section 3.1.3. Within CodeGen (350M→2B) and PolyCoder (400M→2.7B), larger models outperform on all datasets.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No code or model release",
    406       "detail": "Toggle's implementation, fine-tuned models, and experimental scripts are not released, making independent verification impossible despite claims of SOTA performance."
    407     },
    408     {
    409       "flag": "No error bars or statistical tests",
    410       "detail": "All results are single point estimates with no confidence intervals, significance tests, or variance measures. Claims of outperformance are based on raw number comparisons (e.g., 25.07% vs 23.86%) that could be within noise."
    411     },
    412     {
    413       "flag": "Missing training hyperparameters",
    414       "detail": "No learning rate, batch size, epochs, optimizer, or generation parameters (temperature, beam size) are reported despite fine-tuning 6 models across multiple configurations."
    415     },
    416     {
    417       "flag": "Undisclosed corporate funding",
    418       "detail": "Six of eight authors are affiliated with Amazon Web Services but no funding source or competing interests statement is provided."
    419     },
    420     {
    421       "flag": "Incomplete baseline comparison for Defects4J",
    422       "detail": "The Top-200 and ≥Top-500 columns in Table 3 are blank for Toggle ('-'), preventing fair comparison against methods like TENURE (129 at ≥Top-500) and AlphaRepair (110) at higher patch budgets."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Impact of Code Language Models on Automated Program Repair",
    428       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    429       "year": 2023,
    430       "relevance": "Explores LLMs for APR showing strong fixing capabilities, directly relevant to LLM-based code repair evaluation."
    431     },
    432     {
    433       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    434       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    435       "year": 2023,
    436       "relevance": "Studies LLMs for APR with prompt engineering, a key baseline and related approach."
    437     },
    438     {
    439       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    440       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    441       "year": 2022,
    442       "relevance": "AlphaRepair zero-shot APR method, a primary baseline in the Defects4J evaluation."
    443     },
    444     {
    445       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    446       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"],
    447       "year": 2023,
    448       "arxiv_id": "2203.13474",
    449       "relevance": "One of the primary LLM backbones used in Toggle's experiments."
    450     },
    451     {
    452       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    453       "authors": ["Yue Wang", "Weishi Wang", "Shafiq R. Joty", "Steven C. H. Hoi"],
    454       "year": 2021,
    455       "relevance": "Encoder backbone used for Toggle's bug localization model, core to the framework."
    456     },
    457     {
    458       "title": "A Systematic Evaluation of Large Language Models of Code",
    459       "authors": ["Frank F. Xu", "Uri Alon", "Graham Neubig", "Vincent J. Hellendoorn"],
    460       "year": 2022,
    461       "arxiv_id": "2202.13169",
    462       "relevance": "PolyCoder evaluation paper; PolyCoder is a primary LLM backbone in the experiments."
    463     },
    464     {
    465       "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
    466       "authors": ["Nan Jiang", "Thibaud Lutellier", "Yiling Lou", "Lin Tan"],
    467       "year": 2023,
    468       "relevance": "Key APR baseline in Defects4J comparison, representing edit-based approaches."
    469     },
    470     {
    471       "title": "Tare: Type-Aware Neural Program Repair",
    472       "authors": ["Qihao Zhu", "Zeyu Sun", "Wenjie Zhang", "Yingfei Xiong", "Lu Zhang"],
    473       "year": 2023,
    474       "relevance": "Recent APR baseline in Defects4J evaluation, fixing 109 bugs at ≥Top-500."
    475     },
    476     {
    477       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    478       "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"],
    479       "year": 2021,
    480       "relevance": "Primary benchmark used for evaluation, widely used for code ML tasks."
    481     },
    482     {
    483       "title": "Defects4J: A Database of existing faults to enable controlled testing studies for Java programs",
    484       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    485       "year": 2014,
    486       "relevance": "Standard bug benchmark used for generalizability evaluation with test-case validation."
    487     },
    488     {
    489       "title": "An Empirical Study of Deep Learning Models for Vulnerability Detection",
    490       "authors": ["Benjamin Steenhoek", "Md Mahbubur Rahman", "Richard Jiles", "Wei Le"],
    491       "year": 2023,
    492       "relevance": "Empirical study of DL for vulnerability detection, relevant to LLM-based code analysis evaluation."
    493     },
    494     {
    495       "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities",
    496       "authors": ["Yi Wu", "Nan Jiang", "Hung Viet Pham"],
    497       "year": 2023,
    498       "relevance": "Evaluates neural networks for vulnerability fixing, closely related to LLM-based program repair."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 1,
    504       "justification": "Toggle addresses a practical problem (automated bug fixing) but no code is released, limiting immediate use."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "Token-level vs line-level localization is a novel granularity shift, but does not fundamentally challenge conventional wisdom about LLM-based APR."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No AI safety or security concerns raised; the work focuses on improving code repair accuracy."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict; straightforward benchmark improvement paper."
    517     },
    518     "demo_ability": {
    519       "score": 0,
    520       "justification": "No code, demo, or tool released for anyone to try."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Amazon Web Services is well-known but not a top-tier AI research brand; the work is published at FSE, a respected but niche SE venue."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs