scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31277B)
      1 {
      2   "paper": {
      3     "title": "Defects4C: Benchmarking Large Language Model Repair Capability with C/C++ Bugs",
      4     "authors": [
      5       "Jian Wang",
      6       "Xiaofei Xie",
      7       "Qiang Hu",
      8       "Shangqing Liu",
      9       "Jiongchi Yu",
     10       "Jiaolong Kong",
     11       "Yi Li"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Automated Software Engineering",
     15     "arxiv_id": "2510.11059",
     16     "doi": "10.1109/ASE63991.2025.00029"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "LLM-based automated program repair techniques fix only 10.88% of general bugs and 6.86% of vulnerabilities in the new Defects4C C/C++ benchmark, dramatically lower than their performance on Java's Defects4J. Fine-tuning on the provided 9M bug-relevant commits yields an average relative improvement of 84.89% in pass@k, but absolute performance remains poor (best pass@1 of 4.92%). Larger models do not consistently outperform smaller variants, and the dominant failure patterns are long/multi-hunk patches (52%) and missing external context (28.4%).",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper states the benchmark is 'publicly released' with a CLI and HTTP interface, accessible at https://sites.google.com/view/defects4c. They describe specific API endpoints (/extract_anchor_patch, /fix_with_patch, /reproduce, /error_dig)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper states the dataset including 9M bug-relevant commits, 248 buggy functions, and 102 vulnerable functions is publicly released at their project website. They also mention the dataset is 'accessible at the website.'"
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions '8X A100-SXM4-80GB GPUs' and Docker containers for patch verification, but does not provide a requirements.txt, Dockerfile for the experiments, or specific library versions. Detailed settings are deferred to the project website."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "While the paper describes the evaluation workflow, prompt design, and hyperparameters, it does not include step-by-step reproduction instructions for the LLM experiments. Detailed instructions are deferred to the project website."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Tables IV, V, VI, and VII report only point estimates for pass@k and repair counts. No confidence intervals or error bars are provided on any results."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper makes numerous comparative claims ('LLMs perform better in repairing Defects4C_bug than Defects4C_vul', 'increasing the diversity of model outputs leads to better program repair capability') but no statistical significance tests are performed."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports pass@k scores with sufficient baseline context (e.g., Defects4C_bug 10.88% vs Defects4J 29.8%-71.3%), and relative improvement from fine-tuning ('average relative improvement of 84.89%'). Raw numbers with baselines allow assessment of effect magnitude."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The benchmark contains 248 bugs and 102 vulnerabilities. No justification is provided for why these sample sizes are sufficient for the claims being made, nor is any power analysis discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Pass@k values are reported as single point estimates without any measure of result stability."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper evaluates 24 LLMs as baselines against each other and compares Defects4C results against Defects4J results from prior work (Table VI). Multiple model families and sizes are compared."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The evaluation includes GPT-4, GPT-3.5-Turbo, DeepSeek-Coder, CodeLlama, Gemma, WizardCoder, Magicoder, Mixtral, and Phind-CodeLlama — all contemporary models at the time of the study."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No ablation study is performed on the benchmark construction pipeline components (e.g., the effect of keyword filtering, unit test matching algorithm, human annotation). The evaluation examines different strategies (single-round, conversation-based, fine-tuning) but does not ablate their own system."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper uses pass@1, pass@10, pass@100 for single-round evaluation, and number of successful repairs plus average tries for conversation-based repair."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Human annotation is used for dataset construction (validating that commits are genuine bugs), but the evaluation of LLM repair outputs is entirely automated through test case execution. No human evaluation of generated patches is performed."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "For fine-tuning experiments (RQ2), the training data (Defects4C_bgcommit) is explicitly decontaminated against the evaluation sets (Defects4C_bug and Defects4C_vul) using UniXcoder embeddings with a 0.95 cosine similarity threshold."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table IV provides per-category breakdowns across Signature, Sanitizer, Memory Error, and Logic Organization categories. Table VI further breaks down by fix granularity (Function, Hunk, Line)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "RQ3 (Section VI.C) systematically analyzes failure patterns: long/multi-hunk patches, deletion-centric fixes, missing external context, and insufficient test feedback. Table VIII quantifies their distribution."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports multiple negative findings: larger models do not consistently outperform smaller ones (CodeLlama-Python 13B→34B performance drops), fine-tuning improvements are limited (best pass@1 of 4.92%), and open-source models 'perform poorly even in conversation-based repair.'"
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims LLMs have significant limitations on C/C++ repair compared to Java — supported by Tables V, VI. The abstract claims the benchmark enables rigorous evaluation — supported by the construction methodology in Section III. All claims are substantiated in the results."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Main causal claims include 'fine-tuning benefits repair capability' (supported by controlled before/after comparison in Table VII) and 'increasing diversity of model outputs leads to better repair capability' (supported by temperature comparison). These use controlled single-variable manipulation."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The title specifies 'C/C++ Bugs' and the paper consistently frames results within the C/C++ domain. Claims about LLM limitations are bounded to 'C/C++ program repair' and compared specifically against Defects4J (Java)."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section VII discusses specific alternative explanations: model contamination could inflate results (but results are low, suggesting minimal effect), annotation subjectivity, training data quality for fine-tuning, and selection bias from focusing on popular projects."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section II explicitly distinguishes between 'plausible' patches (pass all test cases) and 'correct' patches (effectively resolve the underlying bug). The paper reports pass@k and successful repairs, and the measurements match the granularity of claims."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Models are listed as 'GPT-4', 'GPT-35-Turbo', 'CodeLlama-7B', etc. without API snapshot dates or version identifiers. For closed-source models (GPT-4, GPT-3.5-Turbo), no version string (e.g., 'gpt-4-0613') is provided."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Figure 2 provides the complete prompt templates for single function, single hunk, and single line bug types, with concrete examples (parts 4 and 5). The fill values are deterministic from the released dataset (buggy functions and error messages)."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section V.C reports temperature (0.2, 0.8, 1.0), max token length (2048), conversation parameters (m=10/2, n=3), LoRA rank (8), learning rate (2e-5), batch size (16), epochs (3), and max input length (2048)."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The conversation-based repair scaffold is described in Section V.A: iterative invocation, compiler-based verification, error feedback incorporation into prompts, with hyperparameters m (attempts) and n (conversation length). The HTTP verification endpoints are also described in Section IV."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section III documents the full pipeline with specific filtering criteria and counts at each stage: 38M commits → 9M (availability validation with 7 criteria) → 76K (single-function filtering) → 3,785 (unit test matching) → 248+102 (human annotation). Each filtering step has explicit criteria."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section VII 'Threat to Validity' is a dedicated section with five paragraphs discussing specific threats to the study's validity."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section VII discusses study-specific threats: focus on single-function commits excludes multi-function bugs, temporal/contamination bias from popular projects, inter-annotator agreement measured via Cohen's Kappa (0.48→0.70→0.88), and training data quality affecting fine-tuning."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper explicitly states: 'our focus on single-function commits... excludes multi-function or cross-file defects, such as those involving both a function implementation and its declaration.' They also note plans to 'extend the dataset to include multi-function and cross-file bugs in future releases.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The dataset is publicly released at the project website, including bug-relevant commits, buggy functions, and test cases. The CLI provides access to the underlying data."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section III.A describes collection in detail: BigQuery extraction from 110,441 candidate repos (top 500 by stars, 200+ stars, non-fork, open-source, 2015-2023), keyword-based filtering (fix, solve, repair, bug, etc.), and CVE extraction from CVEProject repository."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Repository selection criteria are explicit (top 500 C/C++ repos by GitHub stars, 200+ stars, open-source with redistributable licenses). For human annotators, qualifications are stated: 'at least 5 years of programming experience and over 3 years in software testing or program analysis.'"
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Figure 1 shows the complete pipeline with counts at each stage. Section III details each transformation: 38M→9M (7 filtering criteria), 9M→76K (single-function + test suite), 76K→3,785 (unit test matching), 3,785→248 (three-round human annotation). No unexplained jumps."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Section IX (Acknowledgements) discloses funding from the National Research Foundation Singapore, Cyber Security Agency, CyberSG R&D Programme Office, and Singapore Ministry of Education Academic Research Fund Tier 1."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed: Singapore Management University, Tianjin University, Nanjing University, and Nanyang Technological University. They evaluate third-party LLMs, not their own products."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funding comes from Singapore government agencies (NRF, CSA, MOE) which are independent research funders with no financial stake in whether LLMs perform well or poorly on C/C++ repair."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 24 LLMs evaluated. This is important since the benchmark commits span 2015-2023 and could overlap with model training data."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "Section VII acknowledges 'there is a possibility that similar code patterns may appear in the pre-training corpora of LLMs' but does not perform any analysis (canary strings, membership inference, or temporal splits) to assess actual overlap. They only argue low performance suggests minimal contamination."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The benchmark consists of commits from popular GitHub repositories (top 500 by stars, 2015-2023) which are almost certainly in LLM training data. The paper acknowledges this risk in Section VII but does not formally address it beyond noting that low performance suggests contamination effects are 'minimal.'"
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved as study subjects. Human annotators are part of the research team performing dataset construction, not study participants."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The study mines public repositories and evaluates LLMs on code benchmarks."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants as study subjects. Annotator qualifications are described but this is for dataset construction, not a human subjects study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants as study subjects."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants as study subjects."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants as study subjects."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants as study subjects."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, or per-example costs are reported. The paper mentions budget constraints limited GPT-4 to 2 repair attempts but does not quantify the actual costs."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper mentions '8X A100-SXM4-80GB GPUs' as hardware but does not report total GPU hours, wall-clock time, or total compute budget for the experiments."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No seed sensitivity analysis is reported. Results are presented as single estimates without variation across random seeds."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of samples generated per problem for pass@k estimation is not stated. The paper follows EvalPlus methodology but does not specify the exact number of samples n used."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is performed. The paper uses fixed settings (temperatures 0.2, 0.8, 1.0; token limit 2048) adopted from prior work without reporting a search budget or justifying why these specific values were chosen."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "The paper reports results at multiple temperature settings (0.2, 0.8, greedy) rather than cherry-picking a single best configuration. Results are presented comprehensively across all settings in Table V."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction does not apply."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate third-party LLMs on their own benchmark. They do not discuss the potential bias of benchmark constructors choosing bugs that may systematically favor or disfavor certain model capabilities."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "GPT-4 is limited to 2 repair attempts while other models get 10, creating an unfair comparison. The paper acknowledges 'GPT-4 could achieve higher repair accuracy with more repair attempts' but does not systematically control for compute budget across models."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Table II compares performance on existing C/C++ benchmarks (DebugBench, CodeFlaws) vs Defects4C to demonstrate the difficulty gap between interview/contest-style and real-world bugs. The paper discusses what properties make a benchmark meaningful for APR evaluation."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "For conversation-based repair, the same scaffold (their verification system with identical endpoints) is used across all models. For single-round repair, no scaffolding is involved, so all models are compared directly."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The benchmark commits span 2015-2023. Any evaluated model whose training cutoff falls after a commit's date may have seen that commit and its fix. No temporal analysis is performed to assess this overlap."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The evaluation provides error messages from test cases to guide repair. While this is by design, the paper does not discuss whether the amount of information in error messages varies across bugs and could bias results."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "For fine-tuning experiments (RQ2), the paper applies decontamination using UniXcoder embeddings, filtering out training samples with cosine similarity >0.95 to evaluation samples. This explicitly addresses non-independence between train and test data."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": true,
    366         "justification": "For fine-tuning, they use UniXcoder to embed code snippets and filter out samples with cosine similarity >0.95 against evaluation data — a concrete decontamination pipeline. However, no leakage detection is applied for pre-trained model evaluation."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LLM-based APR techniques fix only 10.88% of bugs in Defects4C_bug and 6.86% in Defects4C_vul, significantly lower than Defects4J success rates (29.8%-71.3%).",
    373       "evidence": "Tables IV, V, and VI show conversation-based repair results: best model repairs 27/248 bugs and 7/102 vulnerabilities. Table VI compares against Defects4J results from Xia et al. showing substantially higher rates on Java.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Increasing model size does not necessarily lead to better repair accuracy on C/C++ bugs.",
    378       "evidence": "Table V shows CodeLlama-Python pass@100 improves from 7B (22.5) to 13B (32.2) but drops at 34B (29.8). Similar trends for WizardCoder-15B/33B and CodeLlama-Instruct sizes.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Fine-tuning with Defects4C_bgcommit improves repair performance, with higher pass@k in 21 of 28 cases and an average relative improvement of 84.89%.",
    383       "evidence": "Table VII compares pre-trained vs fine-tuned results across 4 models. Improvements are seen in most configurations but absolute performance remains low (best pass@1 of 4.92%).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Multi-line bugs and bugs requiring external context constitute the largest proportion of LLM failures on Defects4C_vul.",
    388       "evidence": "Table VIII shows long/multi-hunk patches account for 52.0% and missing external context for 28.4% of failures for CodeLlama-7B-Instruct. Fine-tuning reduces deletion-centric (-2.9%) and external context (-1.9%) failures but not the other types.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "LLMs with higher temperature (0.8) usually outperform those with lower temperature (0.2) on the C/C++ APR task.",
    393       "evidence": "Table V shows systematically higher pass@k values at temperature 0.8 compared to 0.2 across most models (e.g., GPT-3.5 pass@100: 38.9 at T=0.8 vs 19.5 at T=0.2).",
    394       "supported": "strong"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Unequal compute budget across models",
    400       "detail": "GPT-4 is limited to 2 conversation repair attempts due to budget constraints while all other models receive 10. This creates a systematically unfair comparison that the authors acknowledge but do not correct for."
    401     },
    402     {
    403       "flag": "No uncertainty quantification",
    404       "detail": "All results are reported as point estimates without confidence intervals, standard deviations, or error bars. The number of samples used for pass@k estimation is not stated, making it impossible to assess the reliability of reported values."
    405     },
    406     {
    407       "flag": "No training cutoff dates for any model",
    408       "detail": "None of the 24 evaluated LLMs have their training cutoff dates stated. Since the benchmark commits come from top-500 GitHub repositories (2015-2023), which are almost certainly in LLM training data, this is a significant contamination concern."
    409     },
    410     {
    411       "flag": "Token limit confound",
    412       "detail": "The 2048-token limit caused 19% of CodeLlama-Instruct-34B outputs to be incomplete. This is an experimental design limitation — the observed 'larger models perform worse' finding may partly be an artifact of output truncation rather than a genuine capability difference."
    413     },
    414     {
    415       "flag": "No statistical tests for comparative claims",
    416       "detail": "The paper makes numerous comparative claims (temperature effects, model size effects, fine-tuning benefits) without any statistical significance testing, despite sufficient data to support such tests."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Automated program repair in the era of large pre-trained language models",
    422       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    423       "year": 2023,
    424       "relevance": "Foundational study on LLM-based automated program repair that this work builds upon and extends to C/C++."
    425     },
    426     {
    427       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    428       "authors": ["C. S. Xia", "L. Zhang"],
    429       "year": 2024,
    430       "relevance": "Introduced conversation-based program repair methodology that this paper adapts for C/C++ evaluation."
    431     },
    432     {
    433       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    434       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    435       "year": 2023,
    436       "relevance": "EvalPlus methodology used in this paper for rigorous pass@k evaluation of LLM code generation."
    437     },
    438     {
    439       "title": "Evaluating large language models trained on code",
    440       "authors": ["M. Chen", "J. Tworek", "W. Zaremba"],
    441       "year": 2021,
    442       "relevance": "Introduced the unbiased pass@k estimation method used throughout this paper's evaluation."
    443     },
    444     {
    445       "title": "Exploring the potential of ChatGPT in automated code refinement: An empirical study",
    446       "authors": ["Q. Guo", "J. Cao", "X. Xie", "S. Liu"],
    447       "year": 2024,
    448       "relevance": "Related empirical study on LLM capabilities for code refinement and repair."
    449     },
    450     {
    451       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    452       "authors": ["R. Just", "D. Jalali", "M. D. Ernst"],
    453       "year": 2014,
    454       "relevance": "The seminal Java defect benchmark that Defects4C is modeled after and compared against."
    455     },
    456     {
    457       "title": "BugsC++: A highly usable real world defect benchmark for C/C++",
    458       "authors": ["G. An", "M. Kwon", "K. Choi", "J. Yi", "S. Yoo"],
    459       "year": 2023,
    460       "relevance": "Most recent prior C/C++ benchmark that Defects4C aims to improve upon."
    461     },
    462     {
    463       "title": "DebugBench: Evaluating debugging capability of large language models",
    464       "authors": ["R. Tian", "Y. Ye", "Y. Qin"],
    465       "year": 2024,
    466       "relevance": "LLM debugging benchmark used as a comparison point showing the gap between contest-style and real-world bugs."
    467     },
    468     {
    469       "title": "Magicoder: Source code is all you need",
    470       "authors": ["Y. Wei", "Z. Wang", "J. Liu", "Y. Ding", "L. Zhang"],
    471       "year": 2023,
    472       "arxiv_id": "2312.02120",
    473       "relevance": "Decontamination methodology adopted for fine-tuning data preparation in this paper."
    474     },
    475     {
    476       "title": "LoRA: Low-rank adaptation of large language models",
    477       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    478       "year": 2021,
    479       "arxiv_id": "2106.09685",
    480       "relevance": "Parameter-efficient fine-tuning method used in the paper's RQ2 experiments."
    481     },
    482     {
    483       "title": "ContrastRepair: Enhancing conversation-based automated program repair via contrastive test case pairs",
    484       "authors": ["J. Kong", "M. Cheng", "X. Xie", "S. Liu"],
    485       "year": 2024,
    486       "arxiv_id": "2403.01971",
    487       "relevance": "Conversation-based APR technique from the same research group evaluating repair with test feedback."
    488     },
    489     {
    490       "title": "UniXcoder: Unified cross-modal pre-training for code representation",
    491       "authors": ["D. Guo", "S. Lu", "N. Duan"],
    492       "year": 2022,
    493       "arxiv_id": "2203.03850",
    494       "relevance": "Code embedding model used for decontamination in fine-tuning data preparation."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "The benchmark and CLI tools are directly usable by APR researchers evaluating LLMs on C/C++ code repair."
    501     },
    502     "surprise_contrarian": {
    503       "score": 1,
    504       "justification": "Confirms the expected difficulty gap between contest-style and real-world bugs; the magnitude of the gap (94% to 8.5% for GPT-3.5) is somewhat surprising."
    505     },
    506     "fear_safety": {
    507       "score": 1,
    508       "justification": "Highlights that LLMs cannot reliably fix real-world C/C++ vulnerabilities (only 6.86% success), relevant to security but not presenting a novel threat."
    509     },
    510     "drama_conflict": {
    511       "score": 0,
    512       "justification": "Straightforward benchmark paper with no controversy or dramatic claims."
    513     },
    514     "demo_ability": {
    515       "score": 2,
    516       "justification": "Released benchmark with CLI tools and HTTP API endpoints for automated evaluation, though requires setup."
    517     },
    518     "brand_recognition": {
    519       "score": 1,
    520       "justification": "Authors from Singapore Management University and NTU; evaluates well-known models (GPT-4, CodeLlama) but not from a major AI lab."
    521     }
    522   }
    523 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs