scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33234B)
      1 {
      2   "paper": {
      3     "title": "Fixing 7,400 Bugs for 1$: Cheap Crash-Site Program Repair",
      4     "authors": [
      5       "Han Zheng",
      6       "Ilia Shumailov",
      7       "Tianqi Fan",
      8       "Aiden Hall",
      9       "Mathias Payer"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2505.13103",
     14     "doi": "10.48550/arXiv.2505.13103"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "WILLIAMT, a template-guided crash-site repair tool, fixes 46.1% of 358 ARVO memory corruption bugs at $0.0026/bug (vs $0.93/bug for CodeRover-S). The combined WILLIAMT+CodeRover-S pipeline improves the fixing rate by 29.6% while reducing token cost by 45.9% compared to CodeRover-S alone. Non-reasoning models (Claude 3.5 Haiku) outperform reasoning models at the crash-site analysis task, and local models (Gemma3:27B) achieve 96.4% of GPT-4o's performance. Manual review reveals only 56 of 165 plausible patches (34%) are truly correct across diverse inputs.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper explicitly states 'We promise to fully release WILLIAMT upon paper acceptance to support open science' (Section 1, repeated in Section 7). A promise of future release does not count as released code."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The evaluation uses ARVO, a publicly available benchmark (reference [30]). The paper states ARVO 'reconstructs and curates a reproducible dataset of OSS bugs specifically tailored for APR evaluation' and all bugs originate from OSS-Fuzz."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Hardware is mentioned ('Ubuntu 22.04 server equipped with AMD EPYC 7302P and 64GB RAM', 'RTX 4090 GPU', 'Mac Mini M4') but no software dependency specifications, requirements.txt, Dockerfile, or library versions are provided. The ARVO Docker images provide reproducibility for the benchmark, but the WILLIAMT tool's own environment is not specified."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Code is not yet released, making reproduction impossible."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results are reported as single point estimates (e.g., '54.5%', '46.1%', '$0.0026'). No confidence intervals, error bars, or uncertainty measures appear in any table or figure."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Comparative claims like 'WILLIAMT reduces token cost by 45.9%' and 'increases the bug-fixing rate to 73.5%' are based solely on comparing raw numbers without any statistical significance test."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are given with baseline context throughout: '45.9% cost reduction', '29.6% improvement in fixing rate', '357 times more bugs per dollar', '99.7% token cost reduction while preserving over 86.7% performance' (Section 5.1-5.3). Absolute numbers and percentages are provided for both systems."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The 358-bug sample is defined by selection criteria ('all HOF, SOF, UAF and GOF bugs that can be compiled within 15 minutes') but there is no justification for why this sample size is adequate for the claims being made, nor any power analysis."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All results appear to be from single runs. WILLIAMT is described as 'one-shot' (one trial, one patch), and no standard deviation, variance, or spread measures are reported across any experimental condition."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Three SoTA baselines are compared: CodeRover-S [60], Agentless [54], and VulMaster [64]. Results are presented in Figure 4 and the Venn diagram in Figure 5c."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All baselines are from 2024: CodeRover-S (arXiv 2411.03346, 2024), Agentless (ISSTA 2024), and VulMaster (ICSE 2024). These represent the current state of the art in LLM-based APR."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "WILLIAMT has two main components (regex-based context retrieval and template-guided patch generation) but no ablation study removes individual components to measure their contribution. The multi-LLM comparison (Figure 6) varies the backend but not the system architecture."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are reported: plausible fix rate, token cost (USD), execution time, CPU usage, and a detailed fix classification (Plausible, Multiple, Compiles, No Patch, No Code). Appendix A additionally reports actual fix ratio after manual review."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Appendix A (Figure 9) presents manual evaluation of all 165 plausible patches from WILLIAMT-GPT-4o: automated validation of execution consistency, then manual review of robustness across diverse inputs. 56 of 165 are found truly correct."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "All 358 bugs are used for evaluation. There is no separation into development and test sets, and no discussion of whether prompts or templates were tuned on any subset of the data."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by fix outcome (Plausible/Multiple/Compiles/No Patch/No Code in Figures 4 and 6) and by bug type (HBO 52.3%, UAF 11.3%, SBO 8.9%, GBO 4.6% in Figure 11). The Venn diagram (Figure 5c) shows overlap across tools."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 discusses three failure modes: incorrect crash site analysis (LLM returns inaccurate variables), semantically disruptive patch insertion (breaking control flow), and the imprecise plausible metric. Appendix A provides a detailed failure funnel (Figure 9): 165→95→56 through successive filtering."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Reasoning models (DeepSeek-R1, o3-mini) are shown to cost 5.7-5.9× more than Claude35-Haiku 'without providing notable improvements over non-reasoning LLMs' (Section 5.3). Gemma3:1b fixes only 10 bugs. VulMaster 'resolves only 5 bugs in total.' The manual review shows only 56/165 plausible patches are truly correct."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract claims the combined pipeline achieves '73.5% (+29.6%)' fixing rate, but 195 (CodeRover-S) + 60 additional = 255/358 = 71.2%, not 73.5%. More critically, the Conclusion (Section 7) swaps the two key numbers: 'reduces token usage by 29.6% and improves the fixing rate by 45.9%', contradicting the abstract's '45.9% cost reduction' and '29.6% fixing rate improvement.' These internal inconsistencies undermine the abstract claims."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The main causal claims ('crash-site repair reduces cost while maintaining effectiveness') are supported by controlled comparison: same benchmark (358 ARVO bugs), same LLM (gpt-4o-2024-08-06) for the head-to-head, with WILLIAMT tested against CodeRover-S, Agentless, and VulMaster under equivalent conditions."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Fixing 7,400 Bugs for 1$' is not supported anywhere in the paper body — the number 7,400 never appears in the text. The evaluation covers 358 bugs of 4 types (HBO, GBO, SBO, UAF) on one benchmark (ARVO). At the reported cost of $0.0026/bug, $1 covers ~385 bugs, not 7,400. The claim in the title is an unsupported extrapolation."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Section 6 discusses failure modes of WILLIAMT but does not consider alternative explanations for its performance. For example, there is no discussion of whether ARVO bugs are particularly amenable to crash-site repair, whether the specific bug types selected bias results, or whether LLMs may have seen the fixes in training."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Appendix A (Table 1, Figure 9) extensively distinguishes between the 'plausible fix' proxy metric and actual correctness. They show the plausible metric 'does not ensure that the program's functionality is preserved' and demonstrate that only 56/165 plausible fixes survive manual review. This is an unusually thorough proxy-outcome analysis."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Only GPT-4o is specified with a snapshot date ('gpt-4o-2024-08-06'). All other models use marketing names without API versions or snapshot dates: 'DeepSeek (V3, R1)', 'Claude (3.5-Haiku, 3.7-Sonnet)', 'o3-mini', 'Gemma3 (27B, 12B, 4B, 1B)'. Per the schema, marketing names without snapshot dates do not count."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figure 15 (Appendix C) provides the full prompt text used for crash site analysis, including the system instruction, variable format specifications, examples, and the <issue> tag structure. The patch templates are fully specified in Figures 12 and 13."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for any of the models tested. These settings significantly affect output and are not mentioned."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The WILLIAMT workflow is described in detail: Figure 1 shows the pipeline, Section 4 describes regex-based context retrieval and template-guided patch generation, Appendix B provides the vulnerability templates, and Appendix C details the regex parsing and prompt preparation."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Bug selection criteria are stated: 'all HOF, SOF, UAF and GOF bugs (358 bugs) that can be compiled within 15 minutes following the recommended practice' from ARVO. Appendix C documents the regex-based preprocessing of AddressSanitizer reports including stack trace parsing, bug type classification, and code context extraction."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 ('Discussion') serves as a dedicated limitations section, discussing three categories of limitations: incorrect crash site analysis, semantically disruptive patch insertion, and imprecise plausible metric. Each is discussed substantively."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The threats are specific to this study: 'the code snippet provided to the LLM may lack the correct variable due to limited context' (specific to their context window), '\"No Patch\" failures account for approximately 37% of all bugs, with many caused by such semantic violations' (quantified failure mode), and the plausible metric limitation with concrete data (only 56/165 are truly correct)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly scopes to crash-site repair (not root cause), four memory corruption types (HBO, GBO, SBO, UAF), and acknowledges crash-site fixes are temporary: 'giving developers time to implement a permanent solution' (Section 3). Section 6 states the plausible metric 'may overestimate the number of truly correct fixes.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw data is released — no patch outputs, LLM responses, per-bug results, or detailed logs. Only aggregate results are presented. The ARVO benchmark is public but the experimental outputs are not."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is described: ARVO benchmark with 5,000+ bugs, filtered to 4 bug types (HBO, GBO, SBO, UAF) that compile within 15 minutes, yielding 358 bugs. Selection follows 'the recommended practice' from [60]."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The data source is a standard public benchmark (ARVO)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The pipeline from ARVO (5,000+ bugs) to 358 evaluation bugs lacks intermediate counts. How many HBO, GBO, SBO, UAF bugs exist in ARVO before the 15-minute compile filter? How many were excluded by the compile filter? These intermediate steps are not documented."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source, acknowledgments section, or grant information is disclosed anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Han Zheng and Mathias Payer at EPFL; Ilia Shumailov at Google DeepMind; Tianqi Fan at Google; Aiden Hall at Google."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Three of five authors are affiliated with Google/DeepMind. The evaluation relies on Google infrastructure (OSS-Fuzz, ClusterFuzz) and on ARVO, a benchmark derived from OSS-Fuzz bugs. Google has a stake in demonstrating effective APR for OSS-Fuzz bugs. No funding disclosure makes independence impossible to assess."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement, no patent disclosures, and no financial interests declaration appears in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff date is stated for any of the LLMs used (GPT-4o, DeepSeek, Claude, Gemma3). The ARVO bugs come from publicly accessible OSS-Fuzz reports and open-source git histories that could appear in training data."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of whether the LLMs may have seen ARVO bug reports, crash traces, or ground-truth patches during training. The OSS-Fuzz reports and open-source commit histories containing the fixes are publicly available and likely in LLM training corpora."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "ARVO bugs originate from OSS-Fuzz, with ground-truth fixes in public git repositories. These fixes were committed before most models' training cutoffs, creating significant contamination risk. This is not discussed or addressed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Detailed per-bug inference costs are reported: $0.0026/bug for WILLIAMT vs $0.93/bug for CodeRover-S (Figure 5a). Figure 7 breaks down cost across all frontier LLMs, with DeepSeek-V3 at <0.03 cents/bug and Claude35-Haiku at ~0.09 cents/bug."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Hardware is specified (AMD EPYC 7302P, 64GB RAM, RTX 4090, Mac Mini M4). Execution times are reported: WILLIAMT <1 minute vs CodeRover-S ~43.5 minutes. Figure 8 provides detailed time breakdown for preprocessing and each LLM backend. Total cost for all 358 bugs: <$0.68 for WILLIAMT."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No seed sensitivity analysis. WILLIAMT uses a one-shot design and all results appear to be single-run. LLM output stochasticity is not addressed."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "WILLIAMT is explicitly described as one-shot: 'WILLIAMT performs a single trial and applies only one patch throughout the entire fixing pipeline' (Section 5.1). CodeRover-S is stated to use 'up to three trials per bug' with 'up to 18 attempts.'"
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No discussion of how the prompt template, context window size (2 lines before/after), or patch templates were developed or tuned. No search budget reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No explanation of how the specific prompt design, template structure, or context window size were selected. The paper presents one configuration without discussing alternatives tried."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors compare their own WILLIAMT against SoTA tools. While they import CodeRover-S results from its original paper for fairness, they do not acknowledge the general bias of evaluating one's own system or discuss how this might affect template/prompt design choices."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Performance is explicitly analyzed as a function of cost: Figures 5a-5b compare cost and time between WILLIAMT and CodeRover-S, Figure 7 shows cost vs performance across LLM backends, and Section 5.3 discusses cost-performance tradeoffs for reasoning vs non-reasoning models."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper describes ARVO as a reliable benchmark and notes it 'ensures that all bugs are both ground-truth and reproducible' but does not question whether fixing 358 ARVO bugs of 4 types generalizes to real-world APR capability. No discussion of whether ARVO's specific bug distribution represents real-world vulnerability patterns."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "When comparing LLM backends (Figure 6), all models use the same WILLIAMT scaffold, isolating the model variable. The SoTA comparison (Figure 4) compares complete systems as intended — the scaffold IS the thing being tested."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Not discussed. ARVO bugs have ground-truth fixes committed to public repositories before the LLMs' training cutoffs. Models may have learned the exact patches from training data."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. The crash-site analysis prompt (Figure 15) provides sanitizer output and source context, which is the intended input. However, no analysis of whether the LLM might be retrieving memorized fixes rather than reasoning about the crash site."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. Multiple bugs may come from the same project or share similar code patterns. No analysis of whether results are driven by a few projects or are independent across bugs."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference tests, temporal splits, or decontamination procedures."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "WILLIAMT saves 97.7% CPU usage and 99.7% token cost while preserving over 86.7% of CodeRover-S performance.",
    371       "evidence": "Section 5.1, Figures 5a-5b: WILLIAMT costs $0.0026/bug vs CodeRover-S $0.93/bug. WILLIAMT fixes 165/358 bugs (46.1%) vs CodeRover-S 195/358 (54.5%), giving 165/195 = 84.6% relative performance, slightly below the claimed 'over 86.7%'. Execution time: 58.7s vs 2610.0s.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Combined WILLIAMT+CodeRover-S pipeline reduces token cost by 45.9% and improves fixing rate by 29.6% compared to CodeRover-S alone.",
    376       "evidence": "Section 5.2, Figure 5c Venn diagram. The pipeline achieves 60 additional plausible fixes. However, the Conclusion (Section 7) swaps these numbers ('reduces token usage by 29.6% and improves the fixing rate by 45.9%'), and the abstract claim of 73.5% combined rate (263/358) doesn't align with 195+60=255/358=71.2%.",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "Claude 3.5 Haiku achieves the highest fixing rate at 47.5% among all tested models.",
    381       "evidence": "Section 5.3, Figure 6: Claude35-haiku fixes 170 bugs (47.5%), followed by Claude37-sonnet (166, 46.4%) and GPT-4o (165, 46.1%). Single run without error bars.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Reasoning models do not improve fixing performance despite substantially higher costs.",
    386       "evidence": "Section 5.3, Figure 7: DeepSeek-R1 costs 0.70 cents/bug and o3-mini 0.72 cents/bug (5.7-5.9× more than Claude35-Haiku at 0.09 cents) with 159 and 153 fixes respectively, compared to Claude35-Haiku's 170. Reasoning models generate much more output tokens without better results.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Gemma3:27B running locally achieves 96.4% of GPT-4o's performance.",
    391       "evidence": "Section 5.3, Figure 6: Gemma3:27B fixes 163 bugs vs GPT-4o's 165. 163/165 = 98.8% (paper claims 96.4%, possibly calculated differently). Single benchmark, one run, no error bars.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Only 56 of 165 plausible WILLIAMT patches (34%) are truly correct across diverse inputs.",
    396       "evidence": "Appendix A, Figure 9: Detailed manual review pipeline shows 165 plausible → 95 no early exit → 56 correct for broader inputs. This honest assessment significantly deflates the headline plausible fix rate.",
    397       "supported": "strong"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Unexplained title number",
    403       "detail": "The title claims 'Fixing 7,400 Bugs for 1$' but the number 7,400 never appears in the paper body. The evaluation covers only 358 bugs. At the reported cost of $0.0026/bug, $1 covers ~385 bugs. Even with the cheapest model (DeepSeek-V3 at ~$0.0003/bug), $1 covers ~3,333 bugs. The title number appears to be unsupported."
    404     },
    405     {
    406       "flag": "Internal numerical inconsistency",
    407       "detail": "The Conclusion (Section 7) states 'reduces token usage by 29.6% and improves the fixing rate by 45.9%', directly swapping the two key numbers from the abstract and Section 5.2 ('reduces token cost by 45.9%' and '29.6% improvement in fixing rate'). The abstract claims 73.5% combined fixing rate but 195+60=255/358=71.2%."
    408     },
    409     {
    410       "flag": "No error bars or multiple runs",
    411       "detail": "All results are single-run point estimates. LLM outputs are stochastic, and the one-shot design means a single different LLM response could change any individual bug's outcome. No uncertainty quantification is provided."
    412     },
    413     {
    414       "flag": "Contamination risk unaddressed",
    415       "detail": "The ARVO benchmark draws from OSS-Fuzz, with ground-truth fixes committed to public GitHub repositories. All LLMs tested likely encountered these bug reports and patches during training. The paper does not discuss this contamination risk at all."
    416     },
    417     {
    418       "flag": "Google affiliation conflict of interest",
    419       "detail": "Three of five authors are affiliated with Google/DeepMind. The evaluation relies on Google infrastructure (OSS-Fuzz, ClusterFuzz) and on ARVO, a benchmark derived from OSS-Fuzz bugs. Google's Gemma3 model is shown to perform well. No funding disclosure or conflict-of-interest statement is provided."
    420     },
    421     {
    422       "flag": "Plausible metric inflates success",
    423       "detail": "The headline results use the 'plausible fix' metric (PoC doesn't crash). The paper's own manual review (Appendix A) shows only 56/165 (34%) of plausible patches are truly correct, meaning the actual fixing rate is closer to 15.6% (56/358), not the reported 46.1%. This affects all compared tools equally, but the authors deserve credit for reporting the gap."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "AutoCodeRover: Autonomous program improvement",
    429       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    430       "year": 2024,
    431       "relevance": "Primary APR baseline using LLM agents; demonstrates agent-based program repair on real bugs."
    432     },
    433     {
    434       "title": "Fixing security vulnerabilities with AI in OSS-Fuzz",
    435       "authors": ["Yuntong Zhang", "Jiawei Wang", "Dominic Berzin", "Martin Mirchev", "Dongge Liu", "Abhishek Arya", "Oliver Chang", "Abhik Roychoudhury"],
    436       "year": 2024,
    437       "arxiv_id": "2411.03346",
    438       "relevance": "CodeRover-S, the top-performing baseline; evaluates AI-based vulnerability repair on the same ARVO benchmark."
    439     },
    440     {
    441       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    442       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    443       "year": 2024,
    444       "relevance": "ChatRepair, an agentless (conversation-driven) APR approach; demonstrates conversational LLM-based program repair with cost analysis."
    445     },
    446     {
    447       "title": "Out of sight, out of mind: Better automatic vulnerability repair by broadening input ranges and sources",
    448       "authors": ["Xin Zhou", "Kisub Kim", "Bowen Xu", "DongGyun Han", "David Lo"],
    449       "year": 2024,
    450       "relevance": "VulMaster baseline; APR tool for vulnerability repair that broadens input context."
    451     },
    452     {
    453       "title": "ARVO: Atlas of reproducible vulnerabilities for open source software",
    454       "authors": ["Xiang Mei", "Pulkit Singh Singaria", "Jordi Del Castillo"],
    455       "year": 2024,
    456       "arxiv_id": "2408.02153",
    457       "relevance": "The primary evaluation benchmark providing 5,000+ ground-truth reproducible memory corruption bugs from OSS-Fuzz."
    458     },
    459     {
    460       "title": "Template-guided program repair in the era of large language models",
    461       "authors": ["Kai Huang", "Jian Zhang", "Xiangxin Meng", "Yang Liu"],
    462       "year": 2025,
    463       "relevance": "Template-guided APR using LLMs; directly related approach to WILLIAMT's template-based method."
    464     },
    465     {
    466       "title": "Code repair with LLMs gives an exploration-exploitation tradeoff",
    467       "authors": ["Hao Tang", "Keya Hu", "Jin Zhou", "Si Cheng Zhong", "Wei-Long Zheng", "Xujie Si", "Kevin Ellis"],
    468       "year": 2024,
    469       "relevance": "Analyzes the exploration-exploitation tradeoff in LLM-based code repair, relevant to understanding multi-attempt vs one-shot strategies."
    470     },
    471     {
    472       "title": "An empirical study on fine-tuning large language models of code for automated program repair",
    473       "authors": ["Kai Huang", "Xiangxin Meng", "Jian Zhang", "Yang Liu", "Wenjie Wang", "Shuhao Li", "Yuqing Zhang"],
    474       "year": 2023,
    475       "relevance": "Empirical evaluation of fine-tuning LLMs for APR; defines the Plausible/Multiple/Compiles/No Patch repair classification used here."
    476     },
    477     {
    478       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    479       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    480       "year": 2022,
    481       "relevance": "Zero-shot LLM-based APR demonstrating that LLMs can repair code without task-specific training."
    482     },
    483     {
    484       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    485       "authors": ["Daya Guo", "Dejian Yang"],
    486       "year": 2025,
    487       "arxiv_id": "2501.12948",
    488       "relevance": "Reasoning LLM used as a baseline; shown to produce excessive output tokens without improving repair performance."
    489     },
    490     {
    491       "title": "Sequencer: Sequence-to-sequence learning for end-to-end program repair",
    492       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano"],
    493       "year": 2019,
    494       "relevance": "Early neural program repair approach using sequence-to-sequence learning."
    495     },
    496     {
    497       "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation",
    498       "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota"],
    499       "year": 2019,
    500       "relevance": "Neural machine translation approach to learning bug-fixing patches from real-world data."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 3,
    506       "justification": "Directly addresses a real developer pain point (fuzzer bug backlog), designed for local deployment on consumer hardware (Mac Mini M4), and shows concrete cost savings."
    507     },
    508     "surprise_contrarian": {
    509       "score": 2,
    510       "justification": "Challenges the assumption that root-cause analysis and expensive frontier models are necessary for effective program repair; shows crash-site patches can be useful."
    511     },
    512     "fear_safety": {
    513       "score": 1,
    514       "justification": "Addresses security vulnerabilities (memory corruption) but proposes mitigations rather than raising new concerns."
    515     },
    516     "drama_conflict": {
    517       "score": 1,
    518       "justification": "Implicitly argues expensive agent-based APR approaches are overkill for many bugs, but framed constructively as complementary rather than adversarial."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "Code is not released; promised only upon acceptance. No live demo or installable tool available."
    523     },
    524     "brand_recognition": {
    525       "score": 2,
    526       "justification": "Three authors from Google/Google DeepMind, two from EPFL — recognizable institutions but not about a flagship Google product."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs