scan-v5.json (19919B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Defects4C: Benchmarking Large Language Model Repair Capability with C/C++ Bugs", 6 "authors": [ 7 "Jian Wang", 8 "Xiaofei Xie", 9 "Qiang Hu", 10 "Shangqing Liu", 11 "Jiongchi Yu", 12 "Jiaolong Kong", 13 "Yi Li" 14 ], 15 "year": 2025, 16 "venue": "International Conference on Automated Software Engineering", 17 "arxiv_id": "2510.11059", 18 "doi": "10.1109/ASE63991.2025.00029" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "All abstract claims are verified in the paper body: 9M bug-relevant commits, 248 buggy functions, 102 vulnerable functions, and evaluation of 24 LLMs are all confirmed in Tables III–VII.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "The fine-tuning causal claim is supported by before/after comparisons using the same models (Table VII), a reasonable design for this type of claim; the paper appropriately hedges that improvements are limited.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": true, 37 "justification": "Conclusions are consistently bounded to C/C++ single-function bugs from the top 500 GitHub repositories; the paper explicitly states the scope and does not over-generalize to all program repair settings.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "The large performance gap between Defects4C and Defects4J is attributed solely to benchmark difficulty, without discussing alternative explanations such as prompt design differences, test harness discrepancies, or different evaluation protocols across the two benchmarks.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper uses 
test-passing (pass@k) as the sole measure of 'repair capability' but does not discuss limitations of this proxy, such as overfitting to weak test suites or semantically incorrect but test-passing patches; the distinction between 'plausible' and 'correct' is mentioned only in passing.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": true, 57 "justification": "Section VII 'Threat to Validity' is a dedicated section covering multiple distinct threats beyond a single sentence.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Specific threats include: single-function commit restriction excluding cross-file bugs, top-500 project selection bias, annotation subjectivity with measured Cohen's Kappa values (0.48→0.70→0.88), and training data quality in fine-tuning experiments.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": true, 69 "justification": "The paper explicitly states it covers only single-function commits, only the top 500 GitHub C/C++ repositories by stars, and only commits from January 2015 to December 2023; multi-function and cross-file bugs are explicitly excluded.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "Section IX Acknowledgements discloses funding from NRF Singapore, Cyber Security Agency, CyberSG R&D Programme Office, and Singapore Ministry of Education (RG12/23).", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "All four author affiliations are listed on the first page: Singapore Management University, Tianjin University, Nanjing University, and Nanyang Technological University.", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": 
true, 89 "justification": "All funders are Singapore government agencies (NRF, Cyber Security Agency, MOE) with no commercial interest in which LLMs perform best on C/C++ bug repair.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "The paper contains no competing interests statement or declaration of financial interests such as patents, equity, or consulting relationships.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "APR, single-round repair, conversation-based repair, and line/hunk/function bug granularities are all explicitly defined in Sections I, II, and V; 'plausible' vs 'correct' patches are also distinguished.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper states three explicit contributions in a bulleted list: the Defects4C benchmark dataset, the CLI/API tooling, and the empirical study of 24 LLMs on C/C++ repair.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table I systematically compares Defects4C against 13 existing C/C++ benchmarks on defect count, project diversity, and source type; Table II empirically motivates the gap via actual LLM performance comparisons.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "benchmark-creation": { 122 "construct_design": { 123 "construct_validity_argued": { 124 "applies": true, 125 "answer": true, 126 "justification": "Table II demonstrates that contest-style benchmarks yield artificially high LLM scores (GPT-4 at 74.6%) while real-world bugs yield low scores (9.0%), arguing that test-paired real-world bugs better measure actual repair capability.", 127 "source": "haiku" 128 }, 129 "difficulty_distribution_characterized": { 130 "applies": 
true, 131 "answer": false, 132 "justification": "Table III categorizes bugs by error type (Signature, Sanitizer, Memory Error, Logic) and counts, but there are no explicit difficulty tiers, no difficulty scores, and no analysis of whether categories differ in expected hardness prior to evaluation.", 133 "source": "haiku" 134 }, 135 "ceiling_floor_effects_checked": { 136 "applies": true, 137 "answer": false, 138 "justification": "GPT-4 achieves only 5/248 repairs (2%) in conversation-based mode, suggesting strong floor effects, but the paper treats this as evidence of benchmark challenge rather than a measurement validity concern; no explicit floor/ceiling analysis is performed.", 139 "source": "haiku" 140 }, 141 "human_baseline_included": { 142 "applies": true, 143 "answer": false, 144 "justification": "No human expert performance baseline is provided; the paper only reports LLM performance, leaving no reference for whether the benchmark items are solvable at all and at what expected rate.", 145 "source": "haiku" 146 }, 147 "scoring_rubric_justified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Pass@k is justified by citing EvalPlus [19] and Chen et al. 
[18] as established metrics; the unit-test-matching algorithm (Section III-B) that defines which tests count is formally specified with a pass/fail differential criterion.", 151 "source": "haiku" 152 } 153 }, 154 "robustness": { 155 "contamination_resistance_designed": { 156 "applies": true, 157 "answer": false, 158 "justification": "For fine-tuning, decontamination is performed via UniXcoder cosine similarity filtering, but the evaluation benchmark itself has no contamination-resistance mechanism (no temporal holdouts, canary strings, or dynamic generation) and contamination is only acknowledged as a threat in Section VII.", 159 "source": "haiku" 160 }, 161 "temporal_robustness_discussed": { 162 "applies": true, 163 "answer": false, 164 "justification": "The paper does not discuss whether the benchmark will remain discriminative as LLMs improve, nor is there a versioning or update plan; temporal coverage of commits (2015–2023) is noted but benchmark longevity is not addressed.", 165 "source": "haiku" 166 }, 167 "failure_modes_discussed": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section VII explicitly identifies single-function restriction as a coverage limitation, and RQ3 (Section VI-C, Table VIII) categorizes four failure patterns (long/multi-hunk patches, deletion-centric fixes, missing external context, insufficient test feedback).", 171 "source": "haiku" 172 }, 173 "baseline_implementations_provided": { 174 "applies": true, 175 "answer": true, 176 "justification": "Full results for 24 LLMs are reported in Tables IV–VII with exact experimental configurations; the CLI/HTTP API with Docker isolation is publicly released, enabling reproduction of the reported numbers.", 177 "source": "haiku" 178 } 179 }, 180 "documentation": { 181 "dataset_documentation_complete": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section III provides a comprehensive pipeline description covering raw collection (38M commits), six filtering 
criteria, the unit-test matching algorithm, and the three-round human annotation protocol with inter-annotator Kappa scores.", 185 "source": "haiku" 186 }, 187 "licensing_and_access_clear": { 188 "applies": true, 189 "answer": false, 190 "justification": "The dataset is described as 'publicly released' with a website link, but no specific license (e.g., MIT, CC-BY, Apache) is stated anywhere in the paper, leaving reuse rights unclear.", 191 "source": "haiku" 192 }, 193 "intended_use_specified": { 194 "applies": true, 195 "answer": true, 196 "justification": "The paper explicitly distinguishes Defects4C_bgcommit as suitable for fine-tuning/pretraining (with caveats about false positives) and Defects4C_bug/vul as suitable for rigorous evaluation, with a clear explanation of why each subset serves its intended role.", 197 "source": "haiku" 198 } 199 } 200 } 201 }, 202 "claims": [ 203 { 204 "claim": "State-of-the-art LLMs can fix only 10.88% of general C/C++ bugs and 6.86% of vulnerabilities in Defects4C using conversation-based repair, far below their performance on Java benchmarks.", 205 "evidence": "Table IV shows best conversation-based results of 27/248 bugs (10.88%) and 7/102 vulnerabilities (6.86%); Table VI shows Defects4J single-round line repair rates of 71.3% vs single-digit rates on Defects4C.", 206 "supported": "strong" 207 }, 208 { 209 "claim": "Increasing LLM model size does not consistently improve C/C++ repair performance; CodeLlama-Python improves from 7B to 13B but degrades at 34B.", 210 "evidence": "Table V shows CodeLlama-Python pass@100 at T=0.8: 22.5 (7B) → 32.2 (13B) → 29.8 (34B); similar non-monotonic patterns for WizardCoder and CodeLlama-Instruct.", 211 "supported": "strong" 212 }, 213 { 214 "claim": "Fine-tuning LLMs with Defects4C_bgcommit improves C/C++ repair performance by an average of 84.89% relative across 21 of 28 settings.", 215 "evidence": "Table VII shows consistent pass@k improvements for fine-tuned CodeLlama-7B-Base (from 0% to 
0.41% pass@1 greedy) and CodeLlama-7B-Instruct (2.45%→4.08%); 84.89% average relative improvement is stated in the text.", 216 "supported": "moderate" 217 }, 218 { 219 "claim": "Contest/interview-style C/C++ benchmarks produce artificially high LLM scores, making them poor proxies for real-world repair capability.", 220 "evidence": "Table II shows GPT-3.5 at 94.0% on CodeFlaws and 59.0% on DebugBench (both contest-style) versus 8.5% on Defects4C (real-world).", 221 "supported": "strong" 222 }, 223 { 224 "claim": "Temperature 0.8 generally produces better single-round repair results than temperature 0.2 across most LLMs.", 225 "evidence": "Table V shows consistently higher pass@100 at T=0.8 vs T=0.2 for most models (e.g., GPT-3.5: 38.9 vs 19.5; CodeLlama-Instruct-7B: 45.7 vs 24.9).", 226 "supported": "strong" 227 }, 228 { 229 "claim": "Long/multi-hunk patches and insufficient test feedback are the dominant failure patterns for LLMs repairing C/C++ vulnerabilities.", 230 "evidence": "Table VIII shows 52.0% of failures attributed to long/multi-hunk patches and 9.8% to insufficient test feedback; fine-tuning does not reduce these patterns.", 231 "supported": "moderate" 232 } 233 ], 234 "methodology_tags": [ 235 "benchmark-eval", 236 "observational" 237 ], 238 "key_findings": "Defects4C introduces a real-world C/C++ benchmark of 9M bug-related commits, 248 confirmed bugs, and 102 vulnerabilities, revealing that state-of-the-art LLMs fix only 10.88% and 6.86% of these respectively — far below their performance on the Java-based Defects4J benchmark. Larger model size does not consistently improve repair performance, with models producing verbose overgenerated outputs that exceed token limits. Fine-tuning with Defects4C training data yields an average 84.89% relative improvement but still achieves only ~5% pass@1, and fails to address the dominant failure modes of long multi-hunk patches and insufficient test feedback. 
These findings demonstrate that real-world C/C++ program repair remains a substantially harder and less-solved problem than commonly reported LLM benchmarks suggest.", 239 "red_flags": [ 240 { 241 "flag": "No human baseline", 242 "detail": "The paper provides no human expert performance data, making it impossible to assess whether benchmark items are achievable in principle or whether floor effects represent a fundamental benchmark problem rather than an LLM limitation." 243 }, 244 { 245 "flag": "GPT-4 budget-constrained evaluation", 246 "detail": "GPT-4 was limited to only 2 repair attempts (vs 10 for other models) due to cost constraints, making cross-model comparisons in Table IV unfair; the paper acknowledges this but still includes GPT-4 in rankings." 247 }, 248 { 249 "flag": "No contamination resistance in benchmark", 250 "detail": "The benchmark contains GitHub code from 2015-2023 that may appear in LLM pre-training corpora; the paper dismisses contamination risk by arguing low scores indicate minimal memorization, which is circular reasoning." 251 }, 252 { 253 "flag": "No license specified", 254 "detail": "The dataset is described as publicly released but no specific license is stated, creating legal ambiguity for downstream research use and reproduction." 255 }, 256 { 257 "flag": "Single-function only", 258 "detail": "Restricting to single-function commits excludes multi-function and cross-file bugs that may represent a large fraction of real-world defects, limiting ecological validity of the benchmark." 259 }, 260 { 261 "flag": "Floor effects not analyzed", 262 "detail": "With GPT-4 achieving 2% repair rate on conversation-based mode, the benchmark may be too hard to discriminate among LLMs; this is treated as a feature but could mask measurement noise at the floor." 
263 } 264 ], 265 "cited_papers": [ 266 { 267 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 268 "relevance": "Primary comparison benchmark; Defects4C is explicitly designed as the C/C++ analog to Defects4J" 269 }, 270 { 271 "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT", 272 "relevance": "Conversation-based repair method evaluated on Defects4J; Defects4C re-evaluates this approach on C/C++" 273 }, 274 { 275 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 276 "relevance": "Source of pass@k metric adopted for single-round repair evaluation" 277 }, 278 { 279 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)", 280 "relevance": "Evaluation methodology reference for pass@k and greedy decoding protocol" 281 }, 282 { 283 "title": "BugsC++: A Highly Usable Real World Defect Benchmark for C/C++", 284 "relevance": "Most recent prior work on C/C++ benchmark; Defects4C addresses its limitations (false positives from keyword matching)" 285 }, 286 { 287 "title": "The ManyBugs and IntroClass Benchmarks for Automated Repair of C Programs", 288 "relevance": "Earlier real-world C benchmark; Defects4C addresses its low usability and outdated C standard support" 289 }, 290 { 291 "title": "Magicoder: Source Code is All You Need", 292 "relevance": "Decontamination methodology adopted for fine-tuning dataset preparation" 293 }, 294 { 295 "title": "Neural Transfer Learning for Repairing Security Vulnerabilities in C Code (VRepair)", 296 "relevance": "Keyword-based bug commit filtering methodology adopted from this work" 297 } 298 ], 299 "engagement_factors": { 300 "practical_relevance": { 301 "score": 2, 302 "justification": "Security researchers and APR practitioners working on C/C++ can directly use the benchmark and CLI tooling, though the extremely low 
LLM success rates may limit immediate practical application." 303 }, 304 "surprise_contrarian": { 305 "score": 2, 306 "justification": "The finding that larger models (34B) perform worse than smaller ones (13B) on C/C++ repair, and that LLMs achieve under 11% even with conversation, challenges the prevailing narrative of LLM coding capability." 307 }, 308 "fear_safety": { 309 "score": 2, 310 "justification": "C/C++ accounts for over 50% of disclosed open-source vulnerabilities, and the finding that LLMs can fix only 6.86% of vulnerabilities raises concerns about automated security patching claims." 311 }, 312 "drama_conflict": { 313 "score": 1, 314 "justification": "The paper implicitly challenges overly optimistic LLM-for-APR claims but frames this constructively as a benchmark gap rather than as a critique of specific prior work." 315 }, 316 "demo_ability": { 317 "score": 2, 318 "justification": "The publicly released CLI and HTTP API with Docker-based verification allow researchers to immediately test their own models on the benchmark." 319 }, 320 "brand_recognition": { 321 "score": 1, 322 "justification": "Authors are from reputable Asian universities (SMU, NTU, Nanjing) but no famous industry lab or flagship model name is behind the work." 
323 } 324 }, 325 "hn_data": { 326 "threads": [ 327 { 328 "hn_id": "28970112", 329 "title": "Stipula: DSL that assists lawyers in programming legal contracts", 330 "points": 3, 331 "comments": 0, 332 "url": "https://news.ycombinator.com/item?id=28970112" 333 }, 334 { 335 "hn_id": "41866043", 336 "title": "Unboxing Virgil ADTs for Fun and Profit", 337 "points": 2, 338 "comments": 2, 339 "url": "https://news.ycombinator.com/item?id=41866043" 340 }, 341 { 342 "hn_id": "37980301", 343 "title": "Confidential Consortium Framework: Secure Multiparty Applications", 344 "points": 2, 345 "comments": 1, 346 "url": "https://news.ycombinator.com/item?id=37980301" 347 } 348 ], 349 "top_points": 3, 350 "total_points": 7, 351 "total_comments": 3 352 } 353 }