scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34135B)
      1 {
      2   "paper": {
      3     "title": "Synthetic Code Surgery: Repairing Bugs and Vulnerabilities with LLMs and Synthetic Data",
      4     "authors": [
      5       "David de-Fitero-Dominguez",
      6       "Antonio Garcia-Cabot",
      7       "Eva Garcia-Lopez"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2505.07372",
     12     "doi": "10.48550/arXiv.2505.07372"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Quality-filtered synthetic data (threshold 8.5, retaining ~70% of 30K generated samples) significantly outperforms both unfiltered synthetic data and real-world commit data (CommitPackFT) for automated program repair fine-tuning, with ANOVA confirming statistically significant differences (p<0.0001). The best configurations (vulrep_synt_85, vulrep_synt_85_commitpack) achieved 17.18-17.26% Top@1 and 22.47-23.00% Top@5 Perfect Prediction on the VulRepair test set; the Top@5 results surpass VulMaster (20.0%) and the original VulRepair model (16.8%) despite using sampling with 5 candidates rather than beam search with 50 beams. Cross-model LLM evaluation across 6 generator/evaluator models provides a consensus-based quality filtering mechanism that demonstrably improves downstream training effectiveness.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No source code repository URL, GitHub link, or archive is provided anywhere in the paper. No mention of code availability."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The novel synthetic dataset (~30K samples, ~21K filtered) is not released. While VulRepair and CommitPackFT are publicly available existing datasets, the paper's main contribution (the synthetic training data) has no download link or repository."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions vLLM for inference and the Outlines library for structured output, and lists model names, but provides no requirements.txt, Dockerfile, conda environment file, or detailed environment setup section with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a conceptual level but not at the level needed to reproduce without significant guesswork."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Standard deviations are reported for all configurations in Table 10 (e.g., vulrep_synt_85 Top@1: 0.0036 std dev). While not confidence intervals, ± std dev over 50 runs provides uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Comprehensive statistical testing: ANOVA (F=1735.92, p<0.0001 for Top@1; F=646.27, p<0.0001 for Top@5) followed by Tukey's HSD post-hoc pairwise comparisons (Section 5.2). Shapiro-Wilk normality test and Levene's homogeneity of variances test also applied."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Tukey's HSD tables (Tables 15-16) report mean differences between all configuration pairs (e.g., vulrep_synt_85 vs vulrep: 0.0549, approximately 5.5 percentage points improvement). Raw PP% values with baseline context are provided throughout."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Each experiment is repeated 50 times but no justification is given for why 50 iterations were chosen. No power analysis or discussion of whether 50 runs is sufficient for the claimed statistical precision."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Standard deviations across 50 runs are reported in Table 10 for all configurations under both Top@1 and Top@5 settings (e.g., vulrep Top@1 std dev: 0.0039)."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines: internal configurations vulrep (VulRepair training data alone) and vulrep_commitpack (with real-world commits), plus external baselines VulMaster (20.0%), the VulRepair model (16.8%), and the authors' prior Mistral-based work (22.04%)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "VulMaster (Zhou et al., 2024) and the authors' own prior work (de-Fitero-Dominguez et al., 2024) are recent. VulRepair (2022) is the standard benchmark in this space. All are appropriate comparisons."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Five training configurations systematically vary components: baseline (vulrep), +filtered synthetic, +CommitPackFT, +both, +unfiltered synthetic. This isolates the effect of synthetic data, quality filtering, and combination with real-world data."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Results reported under both Top@1 (single prediction, temperature 0.4) and Top@5 (five predictions, temperature 0.8) evaluation settings, analogous to Pass@1 and Pass@5."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of generated patches. All evaluation is automated via Perfect Prediction (exact match). Human evaluation of patch quality would be relevant since exact match misses semantically correct but syntactically different patches."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results reported on the VulRepair test set, explicitly separated from training data. Section 4.3 states: 'we removed any overlapping samples between the VulRepair training and test datasets to prevent overfitting.' The test dataset was 'left unchanged.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Per-language and per-bug-type breakdowns are provided for synthetic data quality scores (Tables 5-9), but the actual repair evaluation (PP%) in Table 10 is only reported in aggregate per training configuration. No per-vulnerability-type or per-CWE breakdown of repair performance."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No qualitative analysis of failed patches. The paper notes that concurrency issues and arithmetic errors receive lower quality scores during generation, but does not analyze specific cases where the repair model failed to generate correct patches."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The unfiltered synthetic dataset (vulrep_synt_full) performed significantly worse than the filtered version despite having ~42% more synthetic samples (29,646 vs 20,832). Adding CommitPackFT to filtered synthetic data hurt Top@5 performance (22.47% vs 23.00%, p<0.0001). These demonstrate that more data can be counterproductive."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported: 'statistically significant improvements' confirmed by ANOVA (p<0.0001); 'quality-filtered synthetic dataset outperforming...in certain scenarios' confirmed in Tables 10, 15-16; 'surpassed existing systems despite using a less computationally intensive decoding strategy' confirmed for Top@5 comparisons with appropriate hedging."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper claims synthetic data 'improves' repair performance. The experimental design — same base model (Qwen 2.5 Coder 7B), same fine-tuning procedure, same evaluation, varying only training data — is a controlled single-variable manipulation adequate for this causal claim. Statistical testing (ANOVA + Tukey HSD) confirms significance."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'Synthetic Code Surgery: Repairing Bugs and Vulnerabilities' and conclusion ('potentially transforming approaches to data scarcity across software engineering tasks') frame the contribution much more broadly than what was tested. Evaluation is only on C/C++ vulnerability repair (VulRepair). While Section 4.4 notes 'the performance metrics...specifically reflect the effectiveness of the models in repairing the C/C++ vulnerabilities,' this scoping is not maintained in the title, abstract, or conclusion."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The Discussion considers multiple alternative factors: model size vs. architecture/training data differences explaining quality variation, subjective weighting schemes potentially affecting results differently, methodological differences in decoding strategies complicating external comparisons, and the possibility that synthetic data captures similar knowledge to real-world commits."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper consistently frames results in terms of 'Perfect Prediction rates' and 'PP%' — the metric is well-defined (exact match with ground truth) and claims stay at this granularity. The abstract says 'improvements in Perfect Prediction rates' rather than claiming broader 'code quality' or 'developer productivity.'"
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model identifiers provided: Llama-3.1-Nemotron-70B-Instruct-HF, Meta-Llama-3.1-70B-Instruct, Qwen2.5-72B-Instruct, Mistral-Small-Instruct-2409, gemma-2-27b-it, Qwen2.5-32B-Instruct, and Qwen 2.5 Coder 7B. These are precise HuggingFace model identifiers."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper describes prompt construction conceptually ('instruct the model to generate examples demonstrating specific bugs') and shows input/output format templates with placeholders in Section 4.3, but never provides the actual full prompt text used for generation or evaluation. The reader cannot reconstruct the exact prompts sent to models."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Comprehensive hyperparameters reported: generation temperature 0.7, evaluation temperature 0.2, Top@1 temperature 0.4, Top@5 temperature 0.8. LoRA: rank 16, alpha 32, dropout 0.1. Training: learning rate 3e-4, cosine scheduler, batch size 256, max sequence length 4096, 3 epochs."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The approach is straightforward fine-tuning followed by direct inference — no tools, feedback loops, retry logic, or agent workflows."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.3 documents preprocessing: deduplication of training data, removal of train/test overlaps with VulRepair, and detailed formatting with special tokens (<inst>, <desc>, <file>, <lines>, <le>, <sep>). Section 4.2 documents the quality filtering pipeline with specific threshold (8.5) and sample counts at each stage."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section exists. Limitations are discussed inline within the Discussion (Section 6) and Conclusion (Section 7), but there is no labeled subsection specifically for limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats are discussed: 'the weights assigned to different evaluation criteria...involve some subjective judgment' (Section 4.2), evaluation limited to C/C++ VulRepair despite multi-language generation, differential model capabilities across languages and bug types creating potential training bias, and methodological differences in decoding strategies complicating external comparisons."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 4.4 explicitly states: 'the performance metrics reported in our experiments specifically reflect the effectiveness of the models in repairing the C/C++ vulnerabilities characteristic of this benchmark.' Section 4.5: 'our primary goal is not to outperform existing systems, but rather to investigate the feasibility of synthetic data generation.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Neither the synthetic dataset (30K generated, 21K filtered) nor the experimental results (50 runs per configuration) are available for download. Only aggregated statistics are reported in tables."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The synthetic data generation process is thoroughly described in Section 4.1: 6 LLMs generating 5,000 samples each, random selection of language and bug type, XML-structured output, automated validation. Quality evaluation protocol detailed in Section 4.2 with scoring criteria, weights, and threshold."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks (VulRepair, CommitPackFT) and LLM-generated synthetic data."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Full pipeline documented with counts: 6 models × 5,000 samples → 29,646 valid samples after format validation (Table 3 shows per-model validity rates) → quality scoring (mean 8.58, std 0.86) → filtering at 8.5 threshold → 20,832 samples (70% retention). Breakdowns by language, bug type, and model provided."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgements section lists two funding sources: project TIFON (PLEC2023-010251) and project RACO-NLP (SBPLY/23/180225/000063), both through Spanish research program calls."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All authors are affiliated with Universidad de Alcalá, Departamento de Ciencias de la Computación. No affiliation with any of the model providers (Meta, Alibaba/Qwen, Google/Gemma, Mistral, NVIDIA) whose models are evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funders are Spanish government research programs (I+D+i and Castilla-La Mancha regional funding), with no financial stake in any particular outcome regarding LLM performance or synthetic data effectiveness."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The training data cutoff for Qwen 2.5 Coder 7B (the fine-tuned base model) is not stated. The model was released in 2024 and could have been pre-trained on data including VulRepair examples."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 4.3: 'we removed any overlapping samples between the VulRepair training and test datasets to prevent overfitting.' The paper also references VulMaster's correction of data leakage in the original VulRepair evaluation. However, this addresses only fine-tuning data overlap, not pre-training contamination."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "VulRepair (derived from CVEFixes 2021 and BigVul 2020) was publicly available well before Qwen 2.5 Coder's training. The base model may have seen VulRepair test examples during pre-training. This contamination risk is not discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. It is a computational experiment involving LLM fine-tuning and automated benchmark evaluation."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study generates and evaluates synthetic code data using LLMs."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or tokens consumed reported. The paper mentions using vLLM for efficiency and sampling with 5 candidates (vs 50-beam search), but does not quantify actual cost per example or total inference time."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No GPU hours, training time, hardware specifications, or total compute budget stated. Generating ~30K samples with 6 large models (27B-72B parameters) and fine-tuning 5 configurations of a 7B model represents substantial compute that is not quantified."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Each experiment repeated 50 times with results reported as mean ± std dev (Table 10). The stochasticity arises from sampling-based decoding (temperatures 0.4 and 0.8), effectively testing seed sensitivity."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 4.5 explicitly states: 'we repeat each experiment 50 times to ensure that the results remain stable and are not affected by chance.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 4.4 mentions 'Our internal testing confirmed that these settings provided a stable baseline, as minor adjustments to these parameters did not result in significant performance variations,' implying some search was done but the budget (configurations tried, compute spent) is not reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "All 5 training configurations are reported with full results (Table 10) and statistically compared pairwise (Tables 15-16). No selective reporting — all configurations shown including underperforming ones."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Tukey's HSD test is used for post-hoc pairwise comparisons, which explicitly controls for family-wise error rate. Section 5.2.3: 'This test systematically evaluates all possible comparisons while controlling for the family-wise error rate.'"
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement all training configurations themselves and evaluate against their own baseline. While internal comparisons use the same model/procedure (reducing bias), they do not explicitly acknowledge or discuss author-evaluation bias, particularly for the external comparisons."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No performance-vs-compute analysis. The paper notes that sampling with 5 candidates is less expensive than beam search with 50, but does not quantify the compute difference or report performance as a function of compute budget."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses Perfect Prediction (exact match) without discussing whether it validly measures repair capability. Semantically correct patches with different syntax would be counted as failures. No discussion of VulRepair's construct validity as a benchmark for vulnerability repair."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The approach is direct fine-tuning and inference without agentic workflows or tool use."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "VulRepair data (CVEFixes 2021, BigVul 2020) predates Qwen 2.5 Coder's release (2024). The base model may have been pre-trained on data that includes VulRepair examples or their source code. This temporal leakage risk is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The input format provides error descriptions, vulnerable line numbers, and full source code. In real-world use, a developer may not have precise vulnerability line numbers or structured error descriptions. This potential feature leakage from the evaluation setup is not discussed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether VulRepair test examples are independent from each other (e.g., from the same repositories, same CVEs, or structurally similar vulnerabilities that could inflate performance)."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "While train/test overlap was removed by deduplication for the fine-tuning data, no concrete leakage detection method (canary strings, membership inference, n-gram overlap analysis) was applied to assess pre-training contamination of the base model."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Quality-filtered synthetic data (threshold 8.5) significantly outperforms unfiltered synthetic data for APR fine-tuning, with ~2 percentage point improvement in both Top@1 and Top@5.",
    369       "evidence": "Table 10: vulrep_synt_85 achieves 17.18% Top@1 vs vulrep_synt_full 15.21%; 23.00% Top@5 vs 21.13%. Tukey HSD confirms significance (mean diff 0.0197, p<0.0001 for Top@1). Section 5.2.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Filtered synthetic data outperforms real-world commit data (CommitPackFT) as a training augmentation strategy.",
    374       "evidence": "Table 10: vulrep_synt_85 (17.18% Top@1, 23.00% Top@5) vs vulrep_commitpack (16.14% Top@1, 21.79% Top@5). Tukey HSD: mean diff 0.0104 (Top@1, p<0.001) and 0.0121 (Top@5, p<0.001). Tables 15-16.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The best configurations surpass VulMaster (20.0%) and VulRepair (16.8%) on the VulRepair test set despite using sampling with 5 candidates rather than beam search with 50.",
    379       "evidence": "Table 10 and Figure 6: vulrep_synt_85 achieves 23.00% Top@5, exceeding VulMaster 20.0% and VulRepair 16.8%. Authors acknowledge the methodological difference in decoding strategy (Section 5.1, 6).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Cross-model LLM evaluation provides a robust quality assessment framework that mitigates individual model biases.",
    384       "evidence": "Table 4 shows evaluator variation: Mistral gives highest average (9.27), Nemotron gives lowest (8.09). The cross-model approach averages across these biases. Section 4.2.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Data quality is more important than quantity for training effective APR systems.",
    389       "evidence": "Filtered dataset (20,832 samples, ~70% of original) significantly outperforms unfiltered (29,646 samples) across all metrics and evaluation settings. Tukey HSD p<0.0001 for all comparisons. Tables 10, 15-16.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "High-quality synthetic data promotes greater diversity in patch generation, particularly visible in Top@5 results.",
    394       "evidence": "In Top@5, vulrep_synt_85 (23.00%) significantly outperforms vulrep_synt_85_commitpack (22.47%, p<0.0001), while in Top@1 they are not significantly different (p=0.806). Section 6 interprets this as synthetic data promoting greater solution diversity.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No code or data released",
    401       "detail": "Neither the synthetic dataset (~21K filtered samples), the generation/evaluation code, nor the fine-tuned models are released. This severely limits reproducibility despite the paper's emphasis on methodological rigor."
    402     },
    403     {
    404       "flag": "Self-evaluation circularity",
    405       "detail": "LLMs both generate and evaluate the synthetic data with no external ground truth validation. The quality scores are based on LLM judgment of LLM output. While cross-model evaluation mitigates individual bias, there is no independent validation that high LLM-assigned scores correlate with actual training utility."
    406     },
    407     {
    408       "flag": "Evaluation scope narrower than framing",
    409       "detail": "Synthetic data spans 12 languages and 13 bug types, but evaluation is only on C/C++ vulnerabilities (VulRepair). The title and conclusion frame contributions broadly ('repairing bugs and vulnerabilities,' 'transforming approaches to data scarcity across software engineering tasks') without this qualification."
    410     },
    411     {
    412       "flag": "Non-comparable external baselines",
    413       "detail": "External comparisons with VulMaster and VulRepair use fundamentally different decoding strategies (beam search with 50 beams vs sampling with 5 candidates). While the authors acknowledge this, the comparison is still prominently featured as evidence of superiority."
    414     },
    415     {
    416       "flag": "Pre-training contamination unaddressed",
    417       "detail": "Qwen 2.5 Coder 7B (released 2024) may have been pre-trained on data including VulRepair examples (published 2020-2022), and the paper does not discuss this risk. Since all configurations use the same base model, relative comparisons remain valid, but absolute PP% numbers may be inflated."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "VulRepair: A T5-based automated software vulnerability repair",
    423       "authors": ["M. Fu", "C. Tantithamthavorn", "T. Le", "V. Nguyen", "D. Phung"],
    424       "year": 2022,
    425       "doi": "10.1145/3540250.3549098",
    426       "relevance": "Primary evaluation benchmark; fine-tunes CodeT5 for vulnerability repair and provides the dataset used in this paper's experiments."
    427     },
    428     {
    429       "title": "Multi-LLM Collaboration + Data-Centric Innovation = 2x Better Vulnerability Repair",
    430       "authors": ["X. Zhou", "K. Kim", "B. Xu", "D. Han", "D. Lo"],
    431       "year": 2024,
    432       "arxiv_id": "2401.15459",
    433       "relevance": "Introduces VulMaster, a key comparison baseline; the same work also identified and corrected data leakage in the original VulRepair evaluation."
    434     },
    435     {
    436       "title": "Enhanced automated code vulnerability repair using large language models",
    437       "authors": ["D. de-Fitero-Dominguez", "E. Garcia-Lopez", "A. Garcia-Cabot", "J.-J. Martinez-Herraiz"],
    438       "year": 2024,
    439       "doi": "10.1016/j.engappai.2024.109291",
    440       "relevance": "Authors' prior work on LLM-based APR using Mistral; it provides the data formatting approach extended in this paper and serves as an external comparison benchmark (22.04% Top@5)."
    441     },
    442     {
    443       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    444       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    445       "year": 2023,
    446       "doi": "10.1109/ICSE48619.2023.00129",
    447       "relevance": "Comprehensive study of LLMs for APR, directly relevant to understanding the landscape of learning-based program repair approaches."
    448     },
    449     {
    450       "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair",
    451       "authors": ["Q. Zhang", "C. Fang", "Y. Xie", "Y. Ma", "W. Sun", "Y. Yang", "Z. Chen"],
    452       "year": 2024,
    453       "arxiv_id": "2405.01466",
    454       "relevance": "Survey of LLM-based APR systems, provides taxonomy of APR approaches and identifies key challenges in the field."
    455     },
    456     {
    457       "title": "Conversational Automated Program Repair",
    458       "authors": ["C. S. Xia", "L. Zhang"],
    459       "year": 2023,
    460       "arxiv_id": "2301.13246",
    461       "relevance": "ChatRepair uses conversational feedback loops for zero-shot APR, representing a different paradigm (prompt-based) compared to fine-tuning approaches."
    462     },
    463     {
    464       "title": "ContrastRepair: Enhancing Conversation-Based Automated Program Repair via Contrastive Test Case Pairs",
    465       "authors": ["J. Kong", "M. Cheng", "X. Xie", "S. Liu", "X. Du", "Q. Guo"],
    466       "year": 2024,
    467       "arxiv_id": "2403.01971",
    468       "relevance": "Integrates positive feedback from passing tests into conversational APR, complementing ChatRepair's approach to LLM-based repair."
    469     },
    470     {
    471       "title": "OctoPack: Instruction Tuning Code Large Language Models",
    472       "authors": ["N. Muennighoff", "Q. Liu", "A. Zebaze", "Q. Zheng", "B. Hui", "T. Y. Zhuo", "S. Singh", "X. Tang", "L. von Werra", "S. Longpre"],
    473       "year": 2023,
    474       "arxiv_id": "2308.07124",
    475       "relevance": "CommitPackFT dataset used as a baseline for real-world code change data in this paper's experiments."
    476     },
    477     {
    478       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    479       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang", "L. Wang", "W. Chen"],
    480       "year": 2021,
    481       "arxiv_id": "2106.09685",
    482       "relevance": "LoRA is the fine-tuning method used for all experiments in this paper."
    483     },
    484     {
    485       "title": "Large Language Model for Vulnerability Detection and Repair: Literature Review and the Road Ahead",
    486       "authors": ["X. Zhou", "S. Cao", "X. Sun", "D. Lo"],
    487       "year": 2024,
    488       "arxiv_id": "2404.02525",
    489       "relevance": "Survey of LLMs for vulnerability detection and repair, directly relevant to the vulnerability repair task studied in this paper."
    490     },
    491     {
    492       "title": "Evaluating Language Models as Synthetic Data Generators",
    493       "authors": ["S. Kim", "J. Suk", "X. Yue", "V. Viswanathan", "S. Lee", "Y. Wang", "K. Gashteovski", "C. Lawrence", "S. Welleck", "G. Neubig"],
    494       "year": 2024,
    495       "arxiv_id": "2412.03679",
    496       "relevance": "Evaluates LLMs as synthetic data generators, directly related to the core methodology of using LLMs to generate training data."
    497     },
    498     {
    499       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    500       "authors": ["Y. Wang", "Y. Kordi", "S. Mishra", "A. Liu", "N. A. Smith", "D. Khashabi", "H. Hajishirzi"],
    501       "year": 2023,
    502       "arxiv_id": "2212.10560",
    503       "relevance": "Establishes the self-bootstrapping paradigm where LLMs generate their own training data, conceptually related to this paper's synthetic data approach."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "The synthetic data generation approach is practically useful for APR researchers, but requires running 6 large models (27B-72B parameters) and substantial infrastructure."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "Quality-filtered synthetic data outperforming real-world commits is mildly surprising, but the quality-over-quantity finding aligns with recent trends in ML data curation."
    514     },
    515     "fear_safety": {
    516       "score": 0,
    517       "justification": "No AI safety or security risk angle; the paper is about improving automated bug fixing."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or dramatic claims — straightforward empirical methodology paper."
    522     },
    523     "demo_ability": {
    524       "score": 0,
    525       "justification": "No code, demo, or synthetic dataset released for others to try."
    526     },
    527     "brand_recognition": {
    528       "score": 0,
    529       "justification": "Authors from Universidad de Alcalá; no famous lab or product association."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs