scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31903B)
      1 {
      2   "paper": {
      3     "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis",
      4     "authors": [
      5       "Fengjie Li",
      6       "Jiajun Jiang",
      7       "Jiajun Sun",
      8       "Hongyu Zhang"
      9     ],
     10     "year": 2024,
     11     "venue": "ACM Transactions on Software Engineering and Methodology",
     12     "arxiv_id": "2406.00992",
     13     "doi": "10.1145/3715004"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "key_findings": "GIANTREPAIR improves LLM-based program repair by abstracting LLM-generated patches into skeletons and instantiating them with context-aware program analysis. On Defects4J, it repairs 171 bugs in total, increasing the number of correct fixes over the underlying LLMs by an average of 27.78% (v1.2) and 23.40% (v2.0). It outperforms the best prior APR tools by 42 bugs with perfect fault localization (vs. FitRepair) and by 7 bugs with automated fault localization (vs. Tare). Different LLMs contribute complementary fixes, and data leakage analysis finds that 79% of the correct patches generated with StarCoder were not in StarCoder's training data.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper provides a GitHub URL: https://github.com/Feng-Jay/GiantRepair and states 'We have open-sourced our implementations and all experimental data to facilitate future research in this field.'"
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses the publicly available Defects4J benchmark (v1.2 and v2.0) and GrowingBugs. The authors state they have open-sourced 'all experimental data' at their GitHub repository."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper lists hardware (dual Intel Xeon 6388 CPUs, 512GB RAM, four A800 GPUs, Ubuntu 20.04.6 LTS) but does not provide dependency specifications such as requirements.txt, Dockerfile, or detailed library versions. GIANTREPAIR is implemented in Java (~22k lines) but Java version and dependency details are not listed."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper provides a GitHub URL but does not contain step-by-step reproduction instructions, scripts to replicate experiments, or a 'Reproducing Results' section. Configuration details are spread across Section IV-D."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All results are presented as point estimates (e.g., 87, 84, 171 bugs fixed). No confidence intervals or error bars are reported anywhere in the paper."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper claims GIANTREPAIR 'outperforms' baselines based solely on comparing raw counts of correctly fixed bugs. No statistical significance tests (t-tests, Mann-Whitney, etc.) are used for any comparisons."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports percentage improvements with baseline context: 'the relative improvement is up to 31.58%, with an average increase of 27.78%' (Section V-A), and provides absolute numbers (e.g., from 43 to 53 correct fixes for GPT-3.5)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification is given for why Defects4J v1.2 (255 single-function bugs) and v2.0 (228 single-function bugs) constitute sufficient sample sizes for the claims made. No power analysis is discussed."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Results appear to be from single experimental runs. No standard deviations, variance across seeds, or spread measures are reported for any experiment."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares against 22 baseline APR tools (Table II) including 4 LLM-based APRs (FitRepair, Repilot, GAMMA, AlphaRepair), 4 deep-learning APRs (Tare, CURE, Recoder, Hanabi), and others spanning 2016-2023."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The main baselines include tools from 2022-2023: Tare (ICSE 2023), FitRepair (ASE 2023), GAMMA (ASE 2023), Repilot (FSE 2023), AlphaRepair (FSE 2022). These represent the state of the art at the time of submission."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "RQ3 (Section V-C) analyzes the contribution of each skeleton abstraction rule by measuring how frequently each AST node type appears in correct fixes (Figure 4). This shows which components contribute most to the overall results."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper reports both number of correctly repaired bugs and patch precision (ratio of correct to plausible patches). Table V shows both metrics, e.g., GIANTREPAIR: 64/111 correct/plausible, 57.66% precision."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Patches are manually inspected for semantic equivalence to developer patches: 'a patch is deemed correct only if it is semantically equivalent to the developer patch, as determined through manual inspection' (Section III-C). This human evaluation of system outputs goes beyond automated test-passing."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are reported on the established Defects4J benchmark (v1.2 and v2.0) which serves as an external test set. Additionally, GrowingBugs is used as a completely separate dataset in Section VI-B to test generalization."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table IV provides per-project breakdowns for all 6 Defects4J v1.2 projects (Chart, Closure, Lang, Math, Time, Mockito). Table V also provides per-project results for the automated fault localization scenario."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The paper shows motivating examples where LLMs fail but GIANTREPAIR succeeds (Listings 1-5), but does not analyze cases where GIANTREPAIR itself fails. No error analysis or systematic discussion of failure modes is provided."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that GIANTREPAIR has lower patch precision (57.66%) than several baselines (Tare 60.00%, SimFix 67.50%, Hanabi 80.95%), noting 'the powerful code generation ability of LLMs may also increase the risk of generating incorrect patches (i.e., low patch precision) due to the issue of weak tests' (Section V-B)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims of 27.78%/23.40% improvement match Table III averages. The claim of '42 more bugs' matches Table IV (171 vs 129 for FitRepair). The claim of '7 more bugs' matches Table V (64 vs 57 for Tare). All abstract claims are supported by the results."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper makes causal claims about GIANTREPAIR improving repair performance. RQ1 provides a controlled comparison (same LLMs with vs without GIANTREPAIR) that isolates the approach's contribution. RQ3's ablation study further supports causal attribution to specific components."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title 'Hybrid Automated Program Repair' makes no language or benchmark qualifier, yet results are exclusively on Java bugs from Defects4J and GrowingBugs. The limitations section (VI-C) acknowledges 'one programming language (Java)' but the title and abstract frame the contribution broadly."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section VI-B substantively discusses data leakage as an alternative explanation, finding 23/109 StarCoder patches overlapped with training data and conducting an additional GrowingBugs experiment. Section VI-D discusses internal threats (manual review bias) and external threats (dataset generalizability)."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures 'number of correctly repaired bugs' verified by manual semantic equivalence checking against developer patches. This directly measures what is claimed (repair capability) with no proxy gap."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-D specifies: 'StarCoderBase (i.e., StarCoder-15.5B), CodeLlama-7B, Llama-2-13B, and GPT-3.5-turbo-0301.' For GPT-4 comparison: 'GPT-4-1106-preview.' All models include version or size identifiers."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper states 'we reused the prompt proposed by Xia et al. [19]' but does not include the actual prompt text anywhere in the paper. The reader must consult a separate publication to see the prompt."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-D reports: 'Top-p Nucleus Sampling with p = 0.95 and temperature = 0.8', 'at most 200 patches' per LLM per bug, 'at most generates 500 candidate patches based on one patch skeleton', and '5-hour time budget for repairing a single bug.'"
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "GIANTREPAIR is a program analysis tool that uses LLMs as patch generators, not an agentic system with scaffolding (no tool use, retry logic, memory management, or feedback loops). LLMs generate patches in a single pass."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section IV-B documents filtering: 'we removed the bugs that require cross-function modifications. Consequently, we use all 255 single-function bugs from Defects4J v1.2 and 228 single-function bugs from Defects4J v2.0.' GrowingBugs filtering is also documented (34/250 projects, 51/122 bugs)."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section VI-C 'Limitation' discusses three specific limitations. Section VI-D 'Threats to validity' discusses internal and external threats with substantive detail."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section VI-D discusses specific threats: manual patch review may introduce error; LLM training data overlap with Defects4J (addressed in VI-B with concrete analysis); evaluation limited to specific datasets. These are study-specific, not generic boilerplate."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section VI-C explicitly states: 'our experiments involved four LLMs...and one programming language (Java)'; the time for LLM patch generation is not counted; and single-patch skeleton instantiation may miss cross-patch fixes."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The authors state they 'open-sourced our implementations and all experimental data' at https://github.com/Feng-Jay/GiantRepair, and mention 'we have published our full set of correct and plausible patches' in Section VI-D."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section IV-B describes the data source: Defects4J v1.2 (391 bugs from 6 projects) and v2.0 (438 bugs from 11 projects), with explicit filtering criteria (single-function bugs only) yielding 255 and 228 bugs respectively."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data comes from the standard Defects4J and GrowingBugs benchmarks."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The full pipeline is documented: LLM generates 200 patches per bug → skeleton construction via AST differencing → patch instantiation (up to 500 per skeleton) → ranking → test validation via ExpressAPR. Filtering counts are provided (391→255, 438→228, 250→34→51 for GrowingBugs)."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information, acknowledgments section, or grant references appear anywhere in the paper."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly stated: Tianjin University (Li, Jiang, Sun) and Chongqing University (Zhang). These are academic institutions with no direct commercial interest in the evaluated LLMs."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Cannot assess funder independence since no funding information is disclosed."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial disclosure statement is present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the four LLMs (GPT-3.5-turbo-0301, StarCoder, CodeLlama, Llama-2). The paper does not mention when these models' training data was collected."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section VI-B extensively discusses data leakage: 'among the 109 correct patches generated by GIANTREPAIR_StarCoder, 23 of them were included in StarCoder's training data.' They selected StarCoder because 'it is the only one that published its training data.'"
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section VI-B addresses contamination through two methods: (1) checking StarCoder's training data for overlap, finding 86/109 patches were NOT in training data; (2) conducting an extra experiment on GrowingBugs, filtering to projects not in training data (34/250 projects, 51 bugs), where GIANTREPAIR still repaired 10 bugs."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study. It is a benchmark evaluation of automated program repair tools."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "A 5-hour time budget per bug is mentioned as a limit, but actual inference costs (API costs for GPT-3.5, wall-clock times per experiment, tokens consumed) are not reported. The authors acknowledge in Section VI-C that 'the time LLMs take to generate patches is not accounted for.'"
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Hardware is described (dual Intel Xeon 6388 CPUs, 512GB RAM, four A800 GPUs) but total compute budget (GPU hours, total API spend, aggregate wall-clock time for all experiments) is not quantified. The experiments involve 200 patches × 4 LLMs × 483 bugs plus GIANTREPAIR's processing, which is significant."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single experimental runs. LLM patch generation uses sampling (temperature=0.8) which inherently varies by seed, but this is not analyzed."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The exact number of experimental runs is not stated. It is unclear whether experiments were run once or multiple times. Only final bug counts are reported."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Configuration values (p=0.95, temperature=0.8, 500 patches per skeleton, 200 patches per LLM, 5-hour budget) are stated but no hyperparameter search budget or method is described. It is unclear how these values were chosen."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper states configurations were chosen 'by following prior work [14], [15], [27]' for some settings, and 'adopted the model default settings' for others, but does not justify why these are optimal or report results under alternative configurations."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper makes many comparative claims across 22 baselines, 4 LLMs, 2 benchmarks, and multiple projects, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors compare their system against baseline results 'reused...from the corresponding publications directly' (Section IV-D) but do not acknowledge the potential bias of different experimental conditions, evaluation criteria, or implementation quality between their setup and published baselines."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "GIANTREPAIR generates up to 500 additional candidate patches per skeleton on top of 200 LLM-generated patches, using significantly more compute than baselines. This compute disparity is not discussed or controlled for. The 4-LLM combined configuration (800 base patches) is compared against single-model tools."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper uses Defects4J without discussing whether it adequately measures real-world repair capability. The evaluated subset is limited to single-function Java bugs (bugs requiring cross-function modifications were removed); there is no discussion of whether performance on these bugs generalizes to the broader class of real-world software bugs."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "RQ1 (Table III) specifically controls for the scaffold confound by comparing the same LLMs with and without GIANTREPAIR, isolating the contribution of the approach. This controlled comparison demonstrates improvement is due to GIANTREPAIR's skeleton+instantiation pipeline, not just stronger models."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "Section VI-B addresses temporal leakage by checking whether correct patches appear in StarCoder's published training data (23/109 overlap) and by testing on GrowingBugs with projects excluded from training data."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The paper explicitly compares results under perfect fault localization (which provides information not available in practice) versus automated fault localization (Section V-B, Table V), directly addressing whether location information leakage affects conclusions."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The paper does not discuss whether Defects4J bugs from the same projects share structural similarities that could inflate results, or whether training data of the LLMs included code from the same repositories."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Section VI-B applies a concrete detection method: directly checking StarCoder's published training data against correct patches. Additionally, they filter GrowingBugs to exclude projects in training data (34/250 projects retained), serving as a decontaminated validation set."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "GIANTREPAIR improves LLM repair performance by an average of 27.78% on Defects4J v1.2 and 23.40% on Defects4J v2.0.",
    375       "evidence": "Table III shows improvements across 4 LLMs: GPT-3.5 (43→53), StarCoder (42→55), CodeLlama (40→51), Llama-2 (19→25) on v1.2; GPT-3.5 (45→53), StarCoder (44→54), CodeLlama (34→43), Llama-2 (18→24) on v2.0 (Section V-A).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "GIANTREPAIR outperforms the best state-of-the-art APR by repairing at least 42 more bugs with perfect fault localization.",
    380       "evidence": "Table IV shows GIANTREPAIR fixes 171 bugs total (87 on v1.2, 84 on v2.0) vs FitRepair's 129 (85+44). However, GIANTREPAIR uses 4 combined LLMs while FitRepair uses 4 CodeT5 models, and fault localization granularity differs (Section V-B).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "GIANTREPAIR repairs at least 7 more bugs than the best APR under automated fault localization.",
    385       "evidence": "Table V shows GIANTREPAIR fixes 64 bugs vs Tare's 57 under automated (Ochiai) fault localization on Defects4J v1.2. GIANTREPAIR's precision (57.66%) is lower than several baselines (Section V-B).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "GIANTREPAIR can repair 21 bugs that none of 22 existing APR tools can fix.",
    390       "evidence": "Figure 3 (right) shows 21 uniquely repaired bugs by GIANTREPAIR vs all 22 baselines combined on Defects4J v1.2 (Section V-B).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "The effectiveness of GIANTREPAIR is not primarily due to data leakage.",
    395       "evidence": "Section VI-B shows 86/109 (79%) of StarCoder's correct patches were NOT in training data. On GrowingBugs (filtered to exclude training data projects), GIANTREPAIR still repaired 10/51 bugs.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "GIANTREPAIR is still useful compared to GPT-4.",
    400       "evidence": "Section VI-A reports GPT-4 repaired only 1 of 10 randomly selected bugs that GIANTREPAIR fixed but other LLMs could not. However, only 10 bugs were tested with 20 patches each, which is a very small sample (Section VI-A).",
    401       "supported": "weak"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "No statistical significance testing",
    407       "detail": "All comparative claims ('outperforms', 'improves') rely on raw count differences without any significance tests. With 255 (v1.2) and 228 (v2.0) bugs evaluated (483 in total), stochastic LLM generation (temperature=0.8), and the absence of multiple runs, differences could partly reflect sampling variance."
    408     },
    409     {
    410       "flag": "No variance or reproducibility analysis",
    411       "detail": "Results appear to be from single experimental runs despite using stochastic LLM sampling (temperature=0.8, top-p=0.95). No standard deviations, seed sensitivity, or confidence intervals are reported."
    412     },
    413     {
    414       "flag": "Unequal resource comparison with baselines",
    415       "detail": "The headline comparison (171 vs 129 bugs) uses GIANTREPAIR with patches from 4 LLMs combined (800 base patches per bug) while individual baselines use fewer resources. Fault localization granularity also differs: some baselines (FitRepair, Repilot) use line-level perfect fault localization (more precise input) while GIANTREPAIR uses function-level (coarser input). The authors acknowledge this and frame it as underestimating their approach."
    416     },
    417     {
    418       "flag": "LLM inference time excluded from time budget",
    419       "detail": "The authors acknowledge in Section VI-C that 'the time LLMs take to generate patches is not accounted for in the patch generation process of GIANTREPAIR.' The 5-hour budget only covers GIANTREPAIR's processing, making cost comparisons with baselines that include their full pipeline time unfair."
    420     },
    421     {
    422       "flag": "Weak GPT-4 comparison",
    423       "detail": "The GPT-4 comparison (Section VI-A) tests only 10 bugs with 20 patches each, which is too small to draw meaningful conclusions. The claim that GIANTREPAIR 'is still useful even compared with the latest LLM' overstates what this evidence supports."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Automated program repair in the era of large pre-trained language models",
    429       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    430       "year": 2023,
    431       "relevance": "Foundational study exploring LLM application to automated program repair, whose prompt design is reused in this paper."
    432     },
    433     {
    434       "title": "The plastic surgery hypothesis in the era of large language models",
    435       "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"],
    436       "year": 2023,
    437       "relevance": "FitRepair, the most closely related prior work, proposes fine-tuning methods for LLM-based APR and is the primary comparison target."
    438     },
    439     {
    440       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    441       "authors": ["C. Xia", "L. Zhang"],
    442       "year": 2022,
    443       "relevance": "AlphaRepair uses CodeBERT for zero-shot program repair via masked prediction, a key baseline for LLM-based APR."
    444     },
    445     {
    446       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    447       "authors": ["Y. Wei", "C. S. Xia", "L. Zhang"],
    448       "year": 2023,
    449       "relevance": "Repilot fuses LLMs with completion engines for APR, demonstrating hybrid LLM+analysis approaches related to GIANTREPAIR's design."
    450     },
    451     {
    452       "title": "GAMMA: Revisiting template-based automated program repair via mask prediction",
    453       "authors": ["Q. Zhang", "C. Fang", "T. Zhang", "B. Yu", "W. Sun", "Z. Chen"],
    454       "year": 2023,
    455       "relevance": "Template-based APR using masked LLM prediction, a key baseline combining LLMs with structured repair patterns."
    456     },
    457     {
    458       "title": "Impact of code language models on automated program repair",
    459       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    460       "year": 2023,
    461       "relevance": "Systematic evaluation of code language models for APR, establishing methodology for LLM-APR comparison."
    462     },
    463     {
    464       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    465       "authors": ["C. S. Xia", "L. Zhang"],
    466       "year": 2023,
    467       "arxiv_id": "2304.00385",
    468       "relevance": "Explores conversational LLM-based program repair with ChatGPT, demonstrating cost-effectiveness of LLM-based APR."
    469     },
    470     {
    471       "title": "RepairLlama: Efficient representations and fine-tuned adapters for program repair",
    472       "authors": ["A. Silva", "S. Fang", "M. Monperrus"],
    473       "year": 2023,
    474       "arxiv_id": "2312.15698",
    475       "relevance": "Fine-tuned LLM adapters for program repair, relevant to the question of how to best leverage LLMs for APR."
    476     },
    477     {
    478       "title": "Tare: Type-aware neural program repair",
    479       "authors": ["Q. Zhu", "Z. Sun", "W. Zhang", "Y. Xiong", "L. Zhang"],
    480       "year": 2023,
    481       "relevance": "Type-aware neural APR that uses program analysis constraints, the best-performing baseline under automated fault localization."
    482     },
    483     {
    484       "title": "Starcoder: may the source be with you!",
    485       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    486       "year": 2023,
    487       "arxiv_id": "2305.06161",
    488       "relevance": "Open-source code LLM used as one of four base models in GIANTREPAIR, notable for publishing its training data, which enables the paper's data leakage analysis."
    489     },
    490     {
    491       "title": "Code llama: Open foundation models for code",
    492       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    493       "year": 2023,
    494       "arxiv_id": "2308.12950",
    495       "relevance": "Code-specific LLM fine-tuned from Llama-2, used as one of four base models in GIANTREPAIR experiments."
    496     },
    497     {
    498       "title": "A survey of large language models for code: Evolution, benchmarking, and future trends",
    499       "authors": ["Z. Zheng", "K. Ning", "Y. Wang"],
    500       "year": 2023,
    501       "arxiv_id": "2311.10372",
    502       "relevance": "Comprehensive survey of LLMs for code tasks including program repair, providing context for the field this paper contributes to."
    503     }
    504   ],
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 2,
    508       "justification": "Open-source Java tool with a concrete pipeline that practitioners could integrate into APR workflows, though limited to Java and Defects4J-style bugs."
    509     },
    510     "surprise_contrarian": {
    511       "score": 1,
    512       "justification": "The insight that 'incorrect' LLM patches still provide useful structural guidance is moderately interesting but not deeply counterintuitive."
    513     },
    514     "fear_safety": {
    515       "score": 0,
    516       "justification": "No security or safety concerns; the paper addresses automated bug fixing, which is constructive."
    517     },
    518     "drama_conflict": {
    519       "score": 0,
    520       "justification": "No controversy; standard academic improvement-over-baselines paper."
    521     },
    522     "demo_ability": {
    523       "score": 2,
    524       "justification": "GitHub repository provided with code and data, though running it requires Java, Defects4J setup, and LLM access."
    525     },
    526     "brand_recognition": {
    527       "score": 1,
    528       "justification": "Uses GPT-3.5 and Llama-2 (recognizable models), but the work comes from an academic lab without major brand recognition."
    529     }
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs