scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (20185B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Defects4C: Benchmarking Large Language Model Repair Capability with C/C++ Bugs",
      6     "authors": [
      7       "Jian Wang",
      8       "Xiaofei Xie",
      9       "Qiang Hu",
     10       "Shangqing Liu",
     11       "Jiongchi Yu"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Automated Software Engineering",
     15     "arxiv_id": "2510.11059",
     16     "doi": "10.1109/ASE63991.2025.00029"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims LLMs have significant limitations on C/C++ repair compared to Java — supported by Tables V, VI. The abstract claims the benchmark enables rigorous evaluation — supported by the construction methodology in Section III. All claims are substantiated in the results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Main causal claims include 'fine-tuning benefits repair capability' (supported by controlled before/after comparison in Table VII) and 'increasing diversity of model outputs leads to better repair capability' (supported by temperature comparison). These use controlled single-variable manipulation.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The title specifies 'C/C++ Bugs' and the paper consistently frames results within the C/C++ domain. Claims about LLM limitations are bounded to 'C/C++ program repair' and compared specifically against Defects4J (Java).",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section VII discusses specific alternative explanations: model contamination could inflate results (but results are low, suggesting minimal effect), annotation subjectivity, training data quality for fine-tuning, and selection bias from focusing on popular projects.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Section II explicitly distinguishes between 'plausible' patches (pass all test cases) and 'correct' patches (effectively resolve the underlying bug). The paper reports pass@k and successful repairs, and the measurements match the granularity of claims.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VII 'Threat to Validity' is a dedicated section with five paragraphs discussing specific threats to the study's validity.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section VII discusses study-specific threats: focus on single-function commits excludes multi-function bugs, temporal/contamination bias from popular projects, inter-annotator agreement measured via Cohen's Kappa (0.48→0.70→0.88), and training data quality affecting fine-tuning.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states: 'our focus on single-function commits... excludes multi-function or cross-file defects, such as those involving both a function implementation and its declaration.' They also note plans to 'extend the dataset to include multi-function and cross-file bugs in future releases.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section IX (Acknowledgements) discloses funding from the National Research Foundation Singapore, Cyber Security Agency, CyberSG R&D Programme Office, and Singapore Ministry of Education Academic Research Fund Tier 1.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed: Singapore Management University, Tianjin University, Nanjing University, and Nanyang Technological University. They evaluate third-party LLMs, not their own products.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funding comes from Singapore government agencies (NRF, CSA, MOE) which are independent research funders with no financial stake in whether LLMs perform well or poorly on C/C++ repair.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'plausible' vs. 'correct' repair (intro), Line/Hunk/Function bug categories (Section IV), single-round vs. conversation-based repair (Section V.A), and pass@k metric (following EvalPlus).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section I concludes with a bulleted contribution list explicitly claiming: a new executable C/C++ defect benchmark (Defects4C) and a large-scale empirical study of 24 LLMs on C/C++ repair.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II provides a comparative Table I of 14 prior C/C++ benchmarks and explains specific deficiencies (toy sources, low diversity, poor usability) that Defects4C addresses, situating the contribution directly against existing work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues construct validity via Table II, showing that interview/contest benchmarks inflate LLM performance (GPT-4: 74.6%) compared to real-world bugs (9.0%), making the case that Defects4C better measures genuine C/C++ repair capability.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Bug types (Signature, Sanitizer, Memory, Logic) and granularity (Line/Hunk/Function) are catalogued in Table III, but difficulty is not explicitly measured or tiered; the distinction between granularity categories and difficulty levels is not made.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The benchmark shows severe floor effects (best LLM repairs only 10.88% in conversation mode), but this is framed as 'challenging' rather than explicitly analyzed as a floor effect limiting discriminative power between models.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human performance baseline is reported; the paper only evaluates automated LLM-based approaches, leaving open the question of how this benchmark compares to human-level repair capability.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Pass@k is used following EvalPlus (Liu et al., 2023) and Chen et al. (2021), and repair count is used for conversation-based repair following Xia & Zhang (2024); the metrics are justified by reference to established community practice.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No structural contamination resistance is built into the evaluation benchmark; the paper acknowledges contamination risk in Section VII but dismisses it post-hoc by noting LLMs underperform, rather than designing temporal splits or canary strings.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper mentions future work extending to multi-function bugs but does not discuss how long Defects4C will remain useful, whether newer LLMs trained on post-2023 data might outperform, or any maintenance plan.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Section VII discusses dataset limitations (single-function scope, false positives), but benchmark failure modes—such as incomplete test suites masking incorrect patches, or gaming the benchmark via test-passing without true repair—are not systematically discussed.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper describes a stateless HTTP and CLI interface with endpoints for patch extraction and isolated Docker-based verification, enabling others to reproduce evaluation numbers; additional details are on the project website.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Section III documents collection sources (GitHub BigQuery, CVEProject), filtering criteria, the unit test matching algorithm, and the three-round human annotation process with inter-annotator agreement metrics; the project website provides additional detail.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The benchmark is described as 'publicly released' and accessible via a Google Sites URL, but no license for the benchmark itself is specified in the paper; source repo redistribution licenses are only mentioned as a collection criterion.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section IV explicitly specifies that Defects4C_bgcommit is for fine-tuning/pretraining while Defects4C_bug and Defects4C_vul are for rigorous evaluation, and the Usage subsection describes the CLI/HTTP interface for large-scale automated evaluation.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "LLM-based APR techniques can only fix 10.88% of bugs in Defects4C_bug and 6.86% in Defects4C_vul in conversation-based repair.",
    203       "evidence": "Table IV shows the best results: 27/248 bugs and 7/102 vulnerabilities repaired by the best-performing models (GPT-3.5 Turbo and Phind-CodeLlama, respectively).",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Defects4C is significantly harder than existing C/C++ contest/interview-style benchmarks.",
    208       "evidence": "Table II shows GPT-3.5 achieves 59% on DebugBench and 94% on CodeFlaws, versus 8.5% on Defects4C under the same settings.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Larger model size does not consistently improve APR performance on Defects4C.",
    213       "evidence": "Table V shows CodeLlama-Python pass@100 improves from 22.5 (7B) to 32.2 (13B) then drops to 29.8 (34B); similar patterns in WizardCoder and CodeLlama-Instruct.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Fine-tuning with Defects4C_bgcommit improves repair performance by an average relative improvement of 84.89%.",
    218       "evidence": "Table VII shows improvements in 21/28 cases; CodeLlama-7B-Instruct improves from 2.45 to 4.08 pass@1 under greedy decoding after fine-tuning.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Long/multi-hunk patches and missing external context are the dominant failure patterns for LLMs on Defects4C_vul.",
    223       "evidence": "Table VIII shows 52% of failures are long/multi-hunk patches and 28.4% are missing external context for CodeLlama-7B-Instruct.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "C/C++ program repair remains significantly more challenging for LLMs than Java repair (Defects4J).",
    228       "evidence": "Table VI shows Defects4J repair rates of 29.8–71.3% vs. Defects4C rates of 0–13.6% for the same models and repair strategies.",
    229       "supported": "strong"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "benchmark-creation"
    235   ],
    236   "key_findings": "Defects4C fills a gap in C/C++ program repair research by providing 350 human-verified bugs and vulnerabilities from 41 real-world projects, paired with executable test cases and a usable CLI/API. State-of-the-art LLMs repair only 10.88% of general bugs and 6.86% of vulnerabilities in conversation-based mode, far below their performance on Java (Defects4J) and existing contest-style C/C++ benchmarks. Larger models do not consistently outperform smaller ones, with verbosity and token-limit issues degrading larger model output. Fine-tuning on the 9M-commit training corpus yields a large relative improvement (~85% on average), but the gain is modest in absolute terms: pass@1 rates remain below 5%, indicating that C/C++-specific advances are needed.",
    237   "red_flags": [
    238     {
    239       "flag": "No human baseline",
    240       "detail": "The benchmark evaluates only automated LLM approaches; no human performance baseline is reported, making it impossible to interpret whether the benchmark's difficulty is appropriate or whether it is unsolvable even for humans."
    241     },
    242     {
    243       "flag": "Floor effects not addressed",
    244       "detail": "Most models repair fewer than 5% of bugs at pass@1, yet ceiling/floor effects are not analyzed. At these rates the benchmark has limited power to discriminate between models."
    245     },
    246     {
    247       "flag": "GPT-4 budget artificially constrained",
    248       "detail": "GPT-4 is limited to 2 repair attempts in conversation-based evaluation due to cost, while other models get 10 attempts. This makes the cross-model comparison in Table IV misleading for GPT-4."
    249     },
    250     {
    251       "flag": "Contamination dismissed without rigorous analysis",
    252       "detail": "Contamination risk is acknowledged in Section VII but dismissed by ex-post reasoning ('LLMs underperform, so contamination must be minimal') rather than through designed temporal holdouts or canary strings."
    253     },
    254     {
    255       "flag": "Benchmark license unspecified",
    256       "detail": "The paper provides no license for the benchmark itself; reuse terms are unclear despite the benchmark being described as publicly available."
    257     },
    258     {
    259       "flag": "Test-passing conflated with correctness",
    260       "detail": "Pass@k on the provided test suite is used as a correctness proxy throughout, but incomplete test suites could pass incorrect patches; the plausible-vs-correct distinction raised in the intro is not revisited in results."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    266       "relevance": "The primary inspiration and comparison benchmark for Defects4C; the Java APR benchmark that this work is explicitly designed to replicate and extend for C/C++."
    267     },
    268     {
    269       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    270       "relevance": "Provides the conversation-based APR methodology and Defects4J baseline numbers used directly for comparison in Table VI."
    271     },
    272     {
    273       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus)",
    274       "relevance": "Source of the pass@k evaluation metric and evaluation methodology used throughout the empirical study."
    275     },
    276     {
    277       "title": "BugsC++: A Highly Usable Real World Defect Benchmark for C/C++",
    278       "relevance": "The most recent prior C++ benchmark; directly compared and critiqued for including non-genuine bugs in commit messages."
    279     },
    280     {
    281       "title": "The ManyBugs and IntroClass Benchmarks for Automated Repair of C Programs",
    282       "relevance": "Key prior C APR benchmark compared against Defects4C; critiqued for low usability and limited diversity."
    283     },
    284     {
    285       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    286       "relevance": "Introduces pass@k metric and establishes baseline methodology for LLM code evaluation used in this paper."
    287     },
    288     {
    289       "title": "Magicoder: Source Code Is All You Need",
    290       "relevance": "Provides the decontamination methodology using UniXcoder similarity filtering applied to the fine-tuning dataset."
    291     },
    292     {
    293       "title": "UniXcoder: Unified Cross-Modal Pre-Training for Code Representation",
    294       "relevance": "The model used for similarity-based decontamination of the fine-tuning split from the evaluation set."
    295     },
    296     {
    297       "title": "Neural Transfer Learning for Repairing Security Vulnerabilities in C Code (VRepair)",
    298       "relevance": "Provides the keyword-based commit filtering heuristic adapted for collecting bug-related commits from GitHub."
    299     },
    300     {
    301       "title": "DBGBench: Where Is the Bug and How Is It Fixed? An Experiment with Practitioners",
    302       "relevance": "Existing real-world C bug benchmark compared against Defects4C; critiqued for covering only 2 projects."
    303     }
    304   ],
    305   "engagement_factors": {
    306     "practical_relevance": {
    307       "score": 2,
    308       "justification": "The benchmark and CLI tools are directly usable by APR researchers evaluating LLMs on C/C++ code repair."
    309     },
    310     "surprise_contrarian": {
    311       "score": 1,
    312       "justification": "Confirms the expected difficulty gap between contest-style and real-world bugs; the magnitude of the gap (94% to 8.5% for GPT-3.5) is somewhat surprising."
    313     },
    314     "fear_safety": {
    315       "score": 1,
    316       "justification": "Highlights that LLMs cannot reliably fix real-world C/C++ vulnerabilities (only 6.86% success), relevant to security but not presenting a novel threat."
    317     },
    318     "drama_conflict": {
    319       "score": 0,
    320       "justification": "Straightforward benchmark paper with no controversy or dramatic claims."
    321     },
    322     "demo_ability": {
    323       "score": 2,
    324       "justification": "Released benchmark with CLI tools and HTTP API endpoints for automated evaluation, though it requires setup."
    325     },
    326     "brand_recognition": {
    327       "score": 1,
    328       "justification": "Authors are from Singapore Management University and NTU rather than a major AI lab; the paper evaluates well-known models (GPT-4, CodeLlama)."
    329     }
    330   },
    331   "hn_data": {
    332     "threads": [
    333       {
    334         "hn_id": "28970112",
    335         "title": "Stipula: DSL that assists lawyers in programming legal contracts",
    336         "points": 3,
    337         "comments": 0,
    338         "url": "https://news.ycombinator.com/item?id=28970112"
    339       },
    340       {
    341         "hn_id": "41866043",
    342         "title": "Unboxing Virgil ADTs for Fun and Profit",
    343         "points": 2,
    344         "comments": 2,
    345         "url": "https://news.ycombinator.com/item?id=41866043"
    346       },
    347       {
    348         "hn_id": "37980301",
    349         "title": "Confidential Consortium Framework: Secure Multiparty Applications",
    350         "points": 2,
    351         "comments": 1,
    352         "url": "https://news.ycombinator.com/item?id=37980301"
    353       }
    354     ],
    355     "top_points": 3,
    356     "total_points": 7,
    357     "total_comments": 3
    358   }
    359 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs