scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (34601B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Empirical Evaluation of Large Language Models in Automated Program Repair",
      6     "authors": [
      7       "Jiajun Sun",
      8       "Fengjie Li",
      9       "Xinzhu Qi",
     10       "Hongyu Zhang",
     11       "Jiajun Jiang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2506.13186",
     16     "doi": "10.48550/arXiv.2506.13186"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims about CodeLlama outperforming LLaMA (Tables IV, V), diminishing returns from model size (Tables IV, V), early correct patches (Figure 4), and prompt design impact (Table VI) are all supported by corresponding experimental results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'fine-tuning on code-related tasks significantly enhances LLMs' repair capabilities' (Finding 2) by comparing CodeLlama-7B vs LLaMA-2-13B. This conflates fine-tuning with parameter count differences (7B vs 13B) and potentially different base model training. The prompt ablation studies (RQ4) are better controlled but still lack formal causal identification.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Empirical Evaluation of Large Language Models in Automated Program Repair' implies broad coverage of LLMs, but the study tests only 4 open-source models (7B-33B) with no proprietary models (GPT-4, Claude). The abstract claims about 'modern, large-scale LLMs' are not well-bounded to the tested subset.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section V-A discusses data leakage as an alternative explanation for model performance. RQ3 investigates bug length as a confounding factor for cross-language performance differences. The analysis of incorrect bug analysis (Figure 7) provides alternative explanations for the negative effect on stronger models.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures repair rate and precision directly on benchmark bugs and does not overclaim these as broader metrics like 'software quality' or 'developer productivity.' Claims match the measurement granularity.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section V-B 'Limitation' and Section V-C 'Threats to Validity' (with Internal and External subsections) provide substantive discussion of study limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats include: only 4 models tested with parameter sizes up to 33B while larger models exist; manual verification by two authors as a potential internal validity concern; datasets may not represent real-world bug complexity; and limited programming language coverage.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section V-B explicitly states: 'the capabilities of even larger or more recent models remain unexplored', 'real-world bugs may be more complex than those in the datasets', and 'additional programming languages not included in this study may pose unique challenges.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Tianjin University, University of Electronic Science and Technology of China, and Chongqing University. They evaluate third-party open-source models, not their own products.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed. The authors evaluate open-source models from Meta, BigCode, and DeepSeek — no apparent conflict, but the lack of any funding disclosure is a gap.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "APR is defined in Sections I–II; 'repair rate' and 'precision' are defined in Section III-E; 'enterprise-grade' vs 'algorithmic assignment' bugs are distinguished with concrete examples.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section I explicitly lists five contributions including systematic evaluation of four LLMs across six benchmarks, analysis of over 600,000 patches, and prompt engineering insights.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II-C specifically contrasts this work with Xia et al., Fan et al., and Xiang et al., explaining how this study extends prior evaluations beyond limited benchmarks and earlier-generation models.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper states 'we release all generated patches, evaluation scripts, prompt templates, and dataset configurations at our homepage' but provides no URL or repository link anywhere in the paper text. A vague reference to a 'homepage' without an actual link does not count as released.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All six evaluation benchmarks (Defects4J, BugsCpp, IntroClass, IntroClass-Java, ConDefects-Java, ConDefects-Py) are publicly available datasets with citations. The authors also claim to release generated patches, though no URL is provided.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specifications, dependency lists, Docker files, or hardware details are provided. The paper does not describe what GPU hardware was used or any software environment details.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided. While the paper describes the experimental methodology, there are no commands, scripts, or README-level instructions that would allow replication.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables IV, V, and VI are reported as point estimates (e.g., '15.7%' repair rate) with no confidence intervals, error bars, or uncertainty measures.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper makes numerous comparative claims (e.g., 'DeepSeek-Coder achieves the best repair performance') based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests).",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'CodeLlama's repair count increases by 206.7%', 'LLaMA's correct repairs rising from 1 to 32 (a 3100% increase)', and absolute repair rates across all conditions, providing sufficient magnitude context.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper gives no justification for why these specific benchmark sizes are adequate for the claims made, and includes no power analysis or discussion of whether, e.g., the 106 bugs in BugsCpp are sufficient for reliable conclusions.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs with no assessment of result stability across seeds or repeated experiments.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four LLMs are compared against each other across all datasets, and multiple prompt settings (zero-shot, one-shot, two-shot, analysis-augmented) serve as baseline comparisons.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The evaluated models (CodeLlama 2023, LLaMA-2 2023, StarCoder 2023, DeepSeek-Coder 2024) are contemporary and widely used in APR research. However, notable omissions include proprietary models like GPT-4 and Claude which had been used in recent APR work.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ4 systematically ablates prompt components: zero-shot vs one-shot (removing the example), and the impact of adding bug analysis. These controlled comparisons isolate the effect of prompt design choices.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are reported: C/P (correct/plausible count), Repair Rate (RRate), and Precision. These capture different aspects of repair effectiveness.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Section III-E states 'All plausible patches are then manually inspected by the first two authors. If a patch is semantically equivalent to the ground truth patch, it is classified as a correct patch.' This two-stage validation (automated test suite + manual inspection) is a strength.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The authors use established benchmarks without any tuning. The reduction from 200 to 30 patches was informed by preliminary results on Defects4J/BugsCpp and then applied to separate algorithmic datasets, maintaining separation.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by dataset (Tables IV, V, VI), programming language (C vs Java vs Python comparisons), model, prompt setting, bug length (Figure 5), and repair action types (Figure 6).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 2 shows a specific failure case on BugsCpp with analysis of why LLMs failed (bug length). Figure 7 shows how incorrect bug analysis misleads models. Section IV-A2 discusses compilation failures (20%) and incorrect location modifications (76%) on DeepSeek-Coder.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Several negative results are reported: all LLMs fail badly on BugsCpp (3.5% average RRate), bug analysis integration hurts DeepSeek-Coder (drops 46.6% on ConDefects-Java), and LLaMA shows very poor performance across most benchmarks.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table II specifies CodeLlama-7B, LLaMA-2-13B, StarCoderBase (15.5B), and DeepSeek-Coder-33B-instruct with providers and years. These are specific enough to identify exact model weights for open-source models.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 1 shows the full prompt structure including the actual guidance text ('// Provide a fix for the buggy function', 'You are a code analysis tool...'), the GCD example with actual code, and the template structure. Table III summarizes all four prompt configurations used.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No generation hyperparameters are reported — temperature, top-p, top-k, max tokens, and other sampling parameters are not mentioned anywhere in the paper, despite generating over 600,000 patches.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The approach is direct LLM inference with prompts — no tool use, retry logic, feedback loops, or multi-step reasoning pipelines.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III-B documents dataset filtering: selecting single-function bugs from Defects4J (255+228), BugsCpp (106), IntroClass (297 each), and ConDefects (563 each after filtering for cross-language assignment overlap and random selection of one submission per language).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The paper claims to release 'all generated patches, evaluation scripts, prompt templates, and dataset configurations' but provides no URL. Without an accessible link, the raw data cannot be verified.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section III-B describes dataset selection in detail: sources, years, languages, number of bugs, and filtering criteria (single-function bugs, cross-language overlap for ConDefects, random selection of one submission per assignment).",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, BugsCpp, IntroClass, ConDefects).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented: benchmark selection → single-function bug filtering (with counts: 255, 228, 106, 297, 297, 563, 563) → patch generation (200 or 30 per bug) → deduplication → test suite validation → manual inspection of plausible patches → classification as correct or not.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the four evaluated models. This is critical since Defects4J (2014) and IntroClass (2015) had been public for years before these models were trained.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Section V-A acknowledges the risk that 'some of the buggy or fixed code used in selected benchmarks may partially or fully exist in the model's training data' but performs no actual analysis of train/test overlap (no membership inference, n-gram checks, or temporal analysis).",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Defects4J (2014), IntroClass (2015), and the IntroClass-derived IntroClass-Java benchmark had been publicly available on GitHub for years before the 2023-2024 models were trained. The paper's only mitigation is using 'multiple datasets that differ in collection time' without concretely analyzing contamination risk for specific benchmarks.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. This is a benchmark evaluation study using LLMs on bug datasets.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants. Manual patch verification by authors does not constitute a human subjects study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Despite generating over 600,000 patches and discussing computational cost as a key theme (RQ2), no actual inference costs (time, GPU hours, tokens, or monetary cost) are reported.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, GPU type, GPU hours, or hardware specifications are mentioned. The paper discusses reducing patches from 200 to 30 for efficiency but never quantifies the actual compute consumed.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of random seeds, seed sensitivity, or results across multiple seeds. All results appear to be from single experimental runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": true,
    380           "justification": "The paper explicitly states the number of patches generated per bug: 200 for Defects4J and BugsCpp (Section III-E), 30 for subsequent experiments (Section IV-B). Each patch generation constitutes a run.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. Generation parameters (temperature, etc.) are not even reported, let alone any search over them.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "All four prompt configurations are reported with full results in Table VI. No cherry-picking of configurations — the paper systematically compares all settings and reports both improvements and degradations.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper makes dozens of comparative claims across models, datasets, and prompt settings without any statistical testing.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": true,
    404           "justification": "The authors evaluate existing open-source LLMs rather than their own system, which mitigates the primary self-comparison bias concern. They do not re-implement baselines — they use the same generation pipeline for all models.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "While the paper discusses model parameter size vs performance (diminishing returns theme), it does not report actual compute costs. No GPU hours, inference time, or matched-compute comparisons are provided.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses six benchmarks without discussing whether they actually measure real-world repair capability. No analysis of construct validity — e.g., whether fixing single-function bugs from student assignments generalizes to real software maintenance.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is used. All models receive the same direct prompt-based generation pipeline, so there is no scaffold confound.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "Defects4J (2014) and IntroClass (2015) have been publicly available for nearly a decade before the 2023-2024 models were trained. The paper acknowledges diversity in 'collection time' but does not specifically analyze temporal leakage for older benchmarks.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": true,
    436           "justification": "Section III-E states 'we provided method-level perfect fault localization, eliminating potential confounding factors introduced by localization inaccuracies.' This is explicitly acknowledged as a deliberate design choice to isolate repair capability, not a leaked feature.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training corpora for the evaluated models include code from the same projects as the benchmarks (e.g., Apache Commons projects in Defects4J are widely available on GitHub).",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention methods are used. Section V-A discusses leakage risk conceptually ('it is plausible that some of the buggy or fixed code...may exist in the model's training data') but applies no detection techniques (no canary strings, membership inference, or n-gram overlap analysis).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Fine-tuned CodeLlama-7B consistently outperforms general-purpose LLaMA-2-13B across all benchmarks despite having fewer parameters",
    457       "evidence": "Table IV shows CodeLlama fixes 40/34 bugs on Defects4J v1.2/v2.0 vs LLaMA's 19/18; Table V shows similar pattern across all four algorithmic datasets",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "The majority of correct patches appear within the first 30 generated candidates (95.77% for StarCoder on IntroClass-Java), enabling cost-efficient APR",
    462       "evidence": "Figure 4(a) and (b) show cumulative correct patch rankings across models and datasets; 89.6% within top 30 for StarCoder on IntroClass datasets",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "LLMs perform roughly 10x better on algorithmic assignment bugs than enterprise-grade project bugs (DeepSeek: 45.45% vs 5.66%)",
    467       "evidence": "Direct comparison of Tables IV and V; average RRate drops from 15.1% on Defects4J to 3.5% on BugsCpp, while IntroClass rates reach 19.9–45.5%",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "One-shot prompting with a repair example improves repair performance by up to 206.7% over zero-shot for weaker models",
    472       "evidence": "Table VI directly compares zero-shot vs one-shot for all four LLMs on ConDefects-Java and ConDefects-Py with specific counts",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "Bug analysis in prompts helps weaker models but hurts stronger ones due to propagation of incorrect diagnostics",
    477       "evidence": "Table VI shows DeepSeek drops from 127 to 63 correct fixes with bug analysis on ConDefects-Java while LLaMA increases from 7 to 32; Figure 7 illustrates an incorrect analysis causing wrong patch",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Each evaluated LLM generates unique correct patches unattainable by the others, suggesting ensemble approaches would improve coverage",
    482       "evidence": "Figure 3 shows Venn diagrams of unique fixes per model on Defects4J; even LLaMA-2-13B generates 1 unique fix not recovered by any other model on v2.0",
    483       "supported": "strong"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "Evaluating four open-source LLMs (7B–33B parameters) on 2,309 bugs across six benchmarks in Java, C/C++, and Python, the study finds that fine-tuned code-specialized models consistently outperform larger general-purpose models—CodeLlama-7B beats LLaMA-2-13B on every benchmark. The vast majority of correct patches (>89%) appear within the first 30 generated candidates, enabling significant computational savings without sacrificing repair effectiveness. LLMs perform roughly 10x better on algorithmic assignment bugs than enterprise-grade project bugs, and one-shot prompting provides consistent gains over zero-shot. A counterintuitive finding is that incorporating LLM-generated bug analysis into prompts substantially helps weaker models but degrades performance of stronger ones due to propagation of incorrect diagnostic reasoning.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance tests",
    493       "detail": "All comparative claims across models and prompt conditions are based on point estimates with no hypothesis testing, confidence intervals, or variance reporting across runs."
    494     },
    495     {
    496       "flag": "No hyperparameters reported",
    497       "detail": "Temperature, top-p, and other sampling parameters for patch generation are never specified, making exact replication impossible despite the large-scale nature of the study."
    498     },
    499     {
    500       "flag": "No release URL provided",
    501       "detail": "The paper claims to release all patches, evaluation scripts, and configurations at 'our homepage' but provides no URL, making the claimed open release unverifiable."
    502     },
    503     {
    504       "flag": "No proprietary model comparison",
    505       "detail": "GPT-4, Claude, and Gemini are excluded from comparison; only open-source models from 2023–2024 are evaluated, substantially limiting the study's relevance to the current LLM landscape."
    506     },
    507     {
    508       "flag": "Funding not disclosed",
    509       "detail": "No funding source is mentioned anywhere in the paper, which is unusual for a multi-institutional academic study of this scale."
    510     },
    511     {
    512       "flag": "Data leakage acknowledged but not quantified",
    513       "detail": "Section V-A acknowledges benchmark code may be in training data but offers no decontamination analysis or quantification of potential contamination impact."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Automated program repair in the era of large pre-trained language models",
    519       "relevance": "Key prior work by Xia et al. evaluating LLMs for APR on Defects4J, ManyBugs, and QuixBugs; this paper extends that evaluation to larger models, more languages, and deeper analysis"
    520     },
    521     {
    522       "title": "ConDefects: A new dataset to address the data leakage concern for LLM-based fault localization and program repair",
    523       "relevance": "Two of the six evaluation benchmarks (ConDefects-Java, ConDefects-Py) come from this dataset specifically designed to mitigate LLM training data leakage"
    524     },
    525     {
    526       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    527       "relevance": "Primary enterprise-grade bug benchmark; de facto standard for APR evaluation"
    528     },
    529     {
    530       "title": "Code llama: Open foundation models for code",
    531       "relevance": "One of the four evaluated LLMs; shown to be the strongest small model, beating the larger general-purpose LLaMA despite fewer parameters"
    532     },
    533     {
    534       "title": "DeepSeek-Coder: When the large language model meets programming",
    535       "relevance": "Best-performing LLM in the evaluation; 33B parameter code-specialized model used as the upper-bound reference"
    536     },
    537     {
    538       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    539       "relevance": "Related LLM-based APR work using conversational models; provides context for cost-effectiveness discussion"
    540     },
    541     {
    542       "title": "An empirical study on fine-tuning large language models of code for automated program repair",
    543       "relevance": "Prior empirical study on fine-tuning smaller LLMs (CodeBERT, CodeT5) for APR; this paper extends to larger, more modern models with broader benchmark coverage"
    544     },
    545     {
    546       "title": "BugsCpp: A highly usable real world defect benchmark for C/C++",
    547       "relevance": "One of the two enterprise-grade bug benchmarks used; provides C/C++ evaluation dimension beyond the Java-dominant Defects4J"
    548     }
    549   ],
    550   "engagement_factors": {
    551     "practical_relevance": {
    552       "score": 2,
    553       "justification": "Provides actionable guidance on prompt design and model selection for APR practitioners, plus the finding that 30 patches suffice instead of 200."
    554     },
    555     "surprise_contrarian": {
    556       "score": 1,
    557       "justification": "The finding that smaller models generate unique patches not found by larger models is mildly surprising, but most results confirm expected trends."
    558     },
    559     "fear_safety": {
    560       "score": 0,
    561       "justification": "No AI safety or security concerns raised; this is a purely technical evaluation of code repair capabilities."
    562     },
    563     "drama_conflict": {
    564       "score": 0,
    565       "justification": "No controversy or provocative claims — a straightforward empirical comparison."
    566     },
    567     "demo_ability": {
    568       "score": 0,
    569       "justification": "Claims to release code and patches but provides no URL; nothing is demonstrably available to try."
    570     },
    571     "brand_recognition": {
    572       "score": 1,
    573       "justification": "Evaluates recognizable models (LLaMA, DeepSeek-Coder) but the research group and venue are not high-profile."
    574     }
    575   },
    576   "hn_data": {
    577     "threads": [
    578       {
    579         "hn_id": "44507887",
    580         "title": "Empirical Evaluation of Large Language Models in Automated Program Repair",
    581         "points": 5,
    582         "comments": 0,
    583         "url": "https://news.ycombinator.com/item?id=44507887"
    584       },
    585       {
    586         "hn_id": "40876136",
    587         "title": "LLMMatDesign – Gen AI for Materials",
    588         "points": 4,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=40876136"
    591       },
    592       {
    593         "hn_id": "44663723",
    594         "title": "Prompt Injection 2.0: Hybrid AI Threats – Paper and Open Source Testing Toolkit",
    595         "points": 3,
    596         "comments": 1,
    597         "url": "https://news.ycombinator.com/item?id=44663723"
    598       },
    599       {
    600         "hn_id": "43293373",
    601         "title": "RingFormer: Rethinking Recurrent Transformer with Adaptive Level Signals",
    602         "points": 3,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=43293373"
    605       },
    606       {
    607         "hn_id": "44943311",
    608         "title": "NaN-propagation: a novel method for sparsity detection in black-box computationa",
    609         "points": 3,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=44943311"
    612       },
    613       {
    614         "hn_id": "44962664",
    615         "title": "Chain-of-Agents",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=44962664"
    619       },
    620       {
    621         "hn_id": "43914672",
    622         "title": "Questions to Fall in Love with ChatGPT: An Experimental Study",
    623         "points": 2,
    624         "comments": 0,
    625         "url": "https://news.ycombinator.com/item?id=43914672"
    626       }
    627     ],
    628     "top_points": 5,
    629     "total_points": 22,
    630     "total_comments": 1
    631   }
    632 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs