scan-v4.json (20185B)
1 { 2 "scan_version": 4, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Defects4C: Benchmarking Large Language Model Repair Capability with C/C++ Bugs", 6 "authors": [ 7 "Jian Wang", 8 "Xiaofei Xie", 9 "Qiang Hu", 10 "Shangqing Liu", 11 "Jiongchi Yu" 12 ], 13 "year": 2025, 14 "venue": "International Conference on Automated Software Engineering", 15 "arxiv_id": "2510.11059", 16 "doi": "10.1109/ASE63991.2025.00029" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims LLMs have significant limitations on C/C++ repair compared to Java — supported by Tables V, VI. The abstract claims the benchmark enables rigorous evaluation — supported by the construction methodology in Section III. All claims are substantiated in the results.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Main causal claims include 'fine-tuning benefits repair capability' (supported by controlled before/after comparison in Table VII) and 'increasing diversity of model outputs leads to better repair capability' (supported by temperature comparison). These use controlled single-variable manipulation.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "The title specifies 'C/C++ Bugs' and the paper consistently frames results within the C/C++ domain. Claims about LLM limitations are bounded to 'C/C++ program repair' and compared specifically against Defects4J (Java).", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section VII discusses specific alternative explanations: model contamination could inflate results (but results are low, suggesting minimal effect), annotation subjectivity, training data quality for fine-tuning, and selection bias from focusing on popular projects.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "Section II explicitly distinguishes between 'plausible' patches (pass all test cases) and 'correct' patches (effectively resolve the underlying bug). The paper reports pass@k and successful repairs, and the measurements match the granularity of claims.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section VII 'Threat to Validity' is a dedicated section with five paragraphs discussing specific threats to the study's validity.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Section VII discusses study-specific threats: focus on single-function commits excludes multi-function bugs, temporal/contamination bias from popular projects, inter-annotator agreement measured via Cohen's Kappa (0.48→0.70→0.88), and training data quality affecting fine-tuning.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states: 'our focus on single-function commits... excludes multi-function or cross-file defects, such as those involving both a function implementation and its declaration.' They also note plans to 'extend the dataset to include multi-function and cross-file bugs in future releases.'", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Section IX (Acknowledgements) discloses funding from the National Research Foundation Singapore, Cyber Security Agency, CyberSG R&D Programme Office, and Singapore Ministry of Education Academic Research Fund Tier 1.", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are listed: Singapore Management University, Tianjin University, Nanjing University, and Nanyang Technological University. They evaluate third-party LLMs, not their own products.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Funding comes from Singapore government agencies (NRF, CSA, MOE) which are independent research funders with no financial stake in whether LLMs perform well or poorly on C/C++ repair.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined: 'plausible' vs. 'correct' repair (intro), Line/Hunk/Function bug categories (Section IV), single-round vs. conversation-based repair (Section V.A), and pass@k metric (following EvalPlus).", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section I concludes with a bulleted contribution list explicitly claiming: a new executable C/C++ defect benchmark (Defects4C) and a large-scale empirical study of 24 LLMs on C/C++ repair.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section II provides a comparative Table I of 14 prior C/C++ benchmarks and explains specific deficiencies (toy sources, low diversity, poor usability) that Defects4C addresses, situating the contribution directly against existing work.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "benchmark-creation": { 120 "construct_design": { 121 "construct_validity_argued": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper argues construct validity via Table II, showing that interview/contest benchmarks inflate LLM performance (GPT-4: 74.6%) compared to real-world bugs (9.0%), making the case that Defects4C better measures genuine C/C++ repair capability.", 125 "source": "haiku" 126 }, 127 "difficulty_distribution_characterized": { 128 "applies": true, 129 "answer": false, 130 "justification": "Bug types (Signature, Sanitizer, Memory, Logic) and granularity (Line/Hunk/Function) are catalogued in Table III, but difficulty is not explicitly measured or tiered; the distinction between granularity categories and difficulty levels is not made.", 131 "source": "haiku" 132 }, 133 "ceiling_floor_effects_checked": { 134 "applies": true, 135 "answer": false, 136 "justification": "The benchmark shows severe floor effects (best LLM repairs only 10.88% in conversation mode), but this is framed as 'challenging' rather than explicitly analyzed as a floor effect limiting discriminative power between models.", 137 "source": "haiku" 138 }, 139 "human_baseline_included": { 140 "applies": true, 141 "answer": false, 142 "justification": "No human performance baseline is reported; the paper only evaluates automated LLM-based approaches, leaving open the question of how this benchmark compares to human-level repair capability.", 143 "source": "haiku" 144 }, 145 "scoring_rubric_justified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Pass@k is used following EvalPlus (Liu et al., 2023) and Chen et al. (2021), and repair count is used for conversation-based repair following Xia & Zhang (2024); the metrics are justified by reference to established community practice.", 149 "source": "haiku" 150 } 151 }, 152 "robustness": { 153 "contamination_resistance_designed": { 154 "applies": true, 155 "answer": false, 156 "justification": "No structural contamination resistance is built into the evaluation benchmark; the paper acknowledges contamination risk in Section VII but dismisses it post-hoc by noting LLMs underperform, rather than designing temporal splits or canary strings.", 157 "source": "haiku" 158 }, 159 "temporal_robustness_discussed": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper mentions future work extending to multi-function bugs but does not discuss how long Defects4C will remain useful, whether newer LLMs trained on post-2023 data might outperform, or any maintenance plan.", 163 "source": "haiku" 164 }, 165 "failure_modes_discussed": { 166 "applies": true, 167 "answer": false, 168 "justification": "Section VII discusses dataset limitations (single-function scope, false positives), but benchmark failure modes—such as incomplete test suites masking incorrect patches, or gaming the benchmark via test-passing without true repair—are not systematically discussed.", 169 "source": "haiku" 170 }, 171 "baseline_implementations_provided": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper describes a stateless HTTP and CLI interface with endpoints for patch extraction and isolated Docker-based verification, enabling others to reproduce evaluation numbers; additional details are on the project website.", 175 "source": "haiku" 176 } 177 }, 178 "documentation": { 179 "dataset_documentation_complete": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section III documents collection sources (GitHub BigQuery, CVEProject), filtering criteria, the unit test matching algorithm, and the three-round human annotation process with inter-annotator agreement metrics; the project website provides additional detail.", 183 "source": "haiku" 184 }, 185 "licensing_and_access_clear": { 186 "applies": true, 187 "answer": false, 188 "justification": "The benchmark is described as 'publicly released' and accessible via a Google Sites URL, but no license for the benchmark itself is specified in the paper; source repo redistribution licenses are only mentioned as a collection criterion.", 189 "source": "haiku" 190 }, 191 "intended_use_specified": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section IV explicitly specifies that Defects4C_bgcommit is for fine-tuning/pretraining while Defects4C_bug and Defects4C_vul are for rigorous evaluation, and the Usage subsection describes the CLI/HTTP interface for large-scale automated evaluation.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "LLM-based APR techniques can only fix 10.88% of bugs in Defects4C_bug and 6.86% in Defects4C_vul in conversation-based repair.", 203 "evidence": "Table IV shows best results: 27/248 bugs and 7/102 vulnerabilities repaired by best-performing model (GPT-3.5 Turbo and Phind-CodeLlama respectively).", 204 "supported": "strong" 205 }, 206 { 207 "claim": "Defects4C is significantly harder than existing C/C++ contest/interview-style benchmarks.", 208 "evidence": "Table II shows GPT-3.5 achieves 59% on DebugBench and 94% on CodeFlaws, versus 8.5% on Defects4C under the same settings.", 209 "supported": "strong" 210 }, 211 { 212 "claim": "Larger model size does not consistently improve APR performance on Defects4C.", 213 "evidence": "Table V shows CodeLlama-Python pass@100 improves from 22.5 (7B) to 32.2 (13B) then drops to 29.8 (34B); similar patterns in WizardCoder and CodeLlama-Instruct.", 214 "supported": "strong" 215 }, 216 { 217 "claim": "Fine-tuning with Defects4C_bgcommit improves repair performance by an average relative improvement of 84.89%.", 218 "evidence": "Table VII shows improvements in 21/28 cases; CodeLlama-7B-Instruct improves from 2.45 to 4.08 pass@1 under greedy decoding after fine-tuning.", 219 "supported": "moderate" 220 }, 221 { 222 "claim": "Long/multi-hunk patches and missing external context are the dominant failure patterns for LLMs on Defects4C_vul.", 223 "evidence": "Table VIII shows 52% of failures are long/multi-hunk patches and 28.4% are missing external context for CodeLlama-7B-Instruct.", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "C/C++ program repair remains significantly more challenging for LLMs than Java repair (Defects4J).", 228 "evidence": "Table VI shows Defects4J repair rates of 29.8–71.3% vs. Defects4C rates of 0–13.6% for the same models and repair strategies.", 229 "supported": "strong" 230 } 231 ], 232 "methodology_tags": [ 233 "benchmark-eval", 234 "benchmark-creation" 235 ], 236 "key_findings": "Defects4C fills a gap in C/C++ program repair research by providing 350 human-verified bugs and vulnerabilities from 41 real-world projects, paired with executable test cases and a usable CLI/API. State-of-the-art LLMs repair only 10.88% of general bugs and 6.86% of vulnerabilities in conversation-based mode, far below their performance on Java (Defects4J) and existing contest-style C/C++ benchmarks. Larger models do not consistently outperform smaller ones, with verbosity and token-limit issues degrading larger model output. Fine-tuning on the 9M-commit training corpus yields modest average improvement (~85% relative) but absolute pass@1 rates remain below 5%, indicating that C/C++-specific advances are needed.", 237 "red_flags": [ 238 { 239 "flag": "No human baseline", 240 "detail": "The benchmark evaluates only automated LLM approaches; no human performance baseline is reported, making it impossible to interpret whether the benchmark's difficulty is appropriate or whether it is unsolvable even for humans." 241 }, 242 { 243 "flag": "Floor effects not addressed", 244 "detail": "Most models repair fewer than 5% of bugs at pass@1, yet ceiling/floor effects are not analyzed. At these rates the benchmark has limited power to discriminate between models." 245 }, 246 { 247 "flag": "GPT-4 budget artificially constrained", 248 "detail": "GPT-4 is limited to 2 repair attempts in conversation-based evaluation due to cost, while other models get 10 attempts. This makes the cross-model comparison in Table IV misleading for GPT-4." 249 }, 250 { 251 "flag": "Contamination dismissed without rigorous analysis", 252 "detail": "Contamination risk is acknowledged in Section VII but dismissed by ex-post reasoning ('LLMs underperform, so contamination must be minimal') rather than through designed temporal holdouts or canary strings." 253 }, 254 { 255 "flag": "Benchmark license unspecified", 256 "detail": "The paper provides no license for the benchmark itself; reuse terms are unclear despite the benchmark being described as publicly available." 257 }, 258 { 259 "flag": "Test-passing conflated with correctness", 260 "detail": "Pass@k on the provided test suite is used as a correctness proxy throughout, but incomplete test suites could pass incorrect patches; the plausible-vs-correct distinction raised in the intro is not revisited in results." 261 } 262 ], 263 "cited_papers": [ 264 { 265 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 266 "relevance": "The primary inspiration and comparison benchmark for Defects4C; the Java APR benchmark that this work is explicitly designed to replicate and extend for C/C++." 267 }, 268 { 269 "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT", 270 "relevance": "Provides the conversation-based APR methodology and Defects4J baseline numbers used directly for comparison in Table VI." 271 }, 272 { 273 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)", 274 "relevance": "Source of the pass@k evaluation metric and evaluation methodology used throughout the empirical study." 275 }, 276 { 277 "title": "BugsC++: A Highly Usable Real World Defect Benchmark for C/C++", 278 "relevance": "The most recent prior C++ benchmark; directly compared and critiqued for including non-genuine bugs in commit messages." 279 }, 280 { 281 "title": "The ManyBugs and IntroClass Benchmarks for Automated Repair of C Programs", 282 "relevance": "Key prior C APR benchmark compared against Defects4C; critiqued for low usability and limited diversity." 283 }, 284 { 285 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 286 "relevance": "Introduces pass@k metric and establishes baseline methodology for LLM code evaluation used in this paper." 287 }, 288 { 289 "title": "Magicoder: Source Code Is All You Need", 290 "relevance": "Provides the decontamination methodology using UniXcoder similarity filtering applied to the fine-tuning dataset." 291 }, 292 { 293 "title": "UniXcoder: Unified Cross-Modal Pre-Training for Code Representation", 294 "relevance": "The model used for similarity-based decontamination of the fine-tuning split from the evaluation set." 295 }, 296 { 297 "title": "Neural Transfer Learning for Repairing Security Vulnerabilities in C Code (VRepair)", 298 "relevance": "Provides the keyword-based commit filtering heuristic adapted for collecting bug-related commits from GitHub." 299 }, 300 { 301 "title": "DBGBench: Where Is the Bug and How Is It Fixed? An Experiment with Practitioners", 302 "relevance": "Existing real-world C bug benchmark compared against Defects4C; critiqued for covering only 2 projects." 303 } 304 ], 305 "engagement_factors": { 306 "practical_relevance": { 307 "score": 2, 308 "justification": "The benchmark and CLI tools are directly usable by APR researchers evaluating LLMs on C/C++ code repair." 309 }, 310 "surprise_contrarian": { 311 "score": 1, 312 "justification": "Confirms the expected difficulty gap between contest-style and real-world bugs; the magnitude of the gap (94% to 8.5% for GPT-3.5) is somewhat surprising." 313 }, 314 "fear_safety": { 315 "score": 1, 316 "justification": "Highlights that LLMs cannot reliably fix real-world C/C++ vulnerabilities (only 6.86% success), relevant to security but not presenting a novel threat." 317 }, 318 "drama_conflict": { 319 "score": 0, 320 "justification": "Straightforward benchmark paper with no controversy or dramatic claims." 321 }, 322 "demo_ability": { 323 "score": 2, 324 "justification": "Released benchmark with CLI tools and HTTP API endpoints for automated evaluation, though requires setup." 325 }, 326 "brand_recognition": { 327 "score": 1, 328 "justification": "Authors from Singapore Management University and NTU; evaluates well-known models (GPT-4, CodeLlama) but not from a major AI lab." 329 } 330 }, 331 "hn_data": { 332 "threads": [ 333 { 334 "hn_id": "28970112", 335 "title": "Stipula: DSL that assists lawyers in programming legal contracts", 336 "points": 3, 337 "comments": 0, 338 "url": "https://news.ycombinator.com/item?id=28970112" 339 }, 340 { 341 "hn_id": "41866043", 342 "title": "Unboxing Virgil ADTs for Fun and Profit", 343 "points": 2, 344 "comments": 2, 345 "url": "https://news.ycombinator.com/item?id=41866043" 346 }, 347 { 348 "hn_id": "37980301", 349 "title": "Confidential Consortium Framework: Secure Multiparty Applications", 350 "points": 2, 351 "comments": 1, 352 "url": "https://news.ycombinator.com/item?id=37980301" 353 } 354 ], 355 "top_points": 3, 356 "total_points": 7, 357 "total_comments": 3 358 } 359 }