scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25634B)
      1 {
      2   "paper": {
      3     "title": "Automated Repair of AI Code with Large Language Models and Formal Verification",
      4     "authors": [
      5       "Yiannis Charalambous",
      6       "Edoardo Manino",
      7       "Lucas C. Cordeiro"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2405.08848"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper explicitly states 'This report is to be considered as the official documentation of the public software repository at https://github.com/emanino/plain_c_nn_benchmark' and also references a staging repository at https://github.com/Yiannis128/plain_c_nn_benchmark."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The NeuroCodeBench dataset is publicly available (part of SV-COMP since November 2023), and the expanded dataset is generated via the released code pipeline. The GitHub repositories contain the benchmark and mutation pipeline."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions hardware specs (Intel Xeon E5-2620 v4, 198 GB RAM) and ESBMC version (v7.4.0), ESBMC-AI version (0.5.0rc4), but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions sufficient to recreate the full environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper describes its methodology but does not provide step-by-step reproduction instructions. There is no README with commands to run, no 'Reproducing Results' section, and no explicit scripts to replicate the main experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (percentages in tables and figures) without confidence intervals or error bars. Box plots show distributions but no formal confidence intervals are given for the main repair rate claims."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims (e.g., 'persona prompts yield a higher percentage', 'Forward History performed best') based solely on comparing raw percentages without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper reports raw percentages (e.g., ~18% for single-iteration repair, ~25% for iterative repair) but without baseline context or standardized effect size measures."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Each experiment uses 100 random samples from the dataset but there is no justification for why 100 was chosen and no power analysis. Given that repair rates are often below 10%, 100 samples may be insufficient for reliable comparisons."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Results appear to be from single runs per configuration. There is no mention of multiple experimental runs, standard deviations, or variance across repeated experiments. The box plots show distributions of scores within a single run, not across multiple runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper includes the 'old ESBMC-AI prompt' from prior work (Charalambous et al. 2023) as a baseline, and compares simple prompts, persona prompts, and various configurations against each other."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is the authors' own prior prompt (ESBMC-AI from 2023). No comparison is made against other contemporary automated program repair tools or approaches from the literature (e.g., other LLM-based APR systems)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper systematically varies prompt components: source code format (contextual vs. one-line), verifier feedback (none, before, after), prompt style (simple, persona, old), roles (6 different roles), backtick presence, and history format (LSO, Forward, Reverse). These function as ablations showing which components matter."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Four metrics of increasing difficulty are used: syntax (C/C++ detection score), relevance (string match), compilation rate, and verification rate (Section 4.2)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The paper evaluates repair quality entirely through automated metrics (C/C++ detection, string matching, compilation, ESBMC verification). No human evaluation of patch quality, correctness beyond verification, or semantic appropriateness is included."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper samples 100 random examples from the dataset for experiments but does not describe a formal train/test split or held-out test set. The same 100 samples appear to be used across experiments."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by prompt template, source code strategy (contextual vs. one-line), verifier feedback type, persona role, temperature, and message history format. Figure 4 breaks down ESBMC results by neural network category."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses why contextual patches fail (LLM skips code portions), why compilation rates are low, why verifier feedback hurts performance, and identifies the 'Reverse History' approach as performing poorly. The 'Lessons Learned' sections discuss failure modes."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results are reported: verifier feedback makes performance worse (Section 4.2), contextual strategy yields very low compilation rates (~1%), Reverse History performs worst, the 'Dog' persona performs comparably to expert roles, and the overall best repair rate is still only ~25%."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims are relatively modest: expanding NeuroCodeBench to 81k programs (supported in Section 3), verifying memory safety with ESBMC (supported in Section 3.3), and comparing prompt engineering techniques (supported in Sections 4 and 5). No unsupported claims in the abstract."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims like 'persona prompts yield a higher percentage of C-like LLM outputs' and 'providing feedback from a verifier about the nature of the vulnerability makes the performance worse' without controlling for confounds or running significance tests. With only 100 samples and no repeated runs, observed differences could be noise."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper tests only GPT-3.5-Turbo on AI C code (neural network implementations) but makes broad claims about 'large language models' and 'prompt engineering techniques' in general. The title says 'Large Language Models' (plural) though only one is tested. The conclusion acknowledges the need to test other LLMs, but the body text often generalizes beyond what the single-model evidence supports."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not systematically discuss alternative explanations. For example, the finding that verifier feedback hurts performance could have multiple explanations (prompt length, confusion from technical output, etc.) but only speculation is offered. The anomalous 'Dog' persona result is not explored."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 4.2 specifies 'GPT-3.5-Turbo was used as the LLM of choice, specifically gpt-3.5-turbo-0125' — this is a specific model version with a snapshot identifier."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Full prompt templates are provided in Listings 1-7 and the modified templates 9-2 and 11-2. While they use placeholders ({source}, {esbmc}, {role}), the fill values are described: specific roles are listed, and source/ESBMC formats are defined. The actual prompt text structure is fully provided."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Temperature is stated as 1.0 (default) for Section 4 experiments, and temperatures 0.0, 0.4, 0.7, 1.0, 1.3 are explored in Section 5. The paper states 'no defaults were changed' for the initial experiments."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The iterative repair scaffold is described in detail in Section 5.1 with Algorithm 1, including the loop structure, message history formats (LSO, Forward, Reverse), the verification feedback loop, and the maximum attempts (5). Figures 5, 9, and 10 provide visual diagrams."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3 describes the full pipeline: building base samples from NeuroCodeBench (505 samples), generating mutation patches with Mull, expanding to 81,129 samples, and classifying with ESBMC. The data augmentation process is documented with numbers at each stage."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The 'Lessons Learned' subsections (3.4, 4.5, 5.4) discuss some findings but are not structured as limitations discussions. Section 6 (Conclusions and Future Work) briefly mentions testing other LLMs but does not substantively discuss limitations."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No specific threats to validity are discussed. The paper does not address internal validity (e.g., random sampling of 100 examples, single LLM), construct validity (e.g., whether ESBMC verification truly captures repair quality), or external validity (e.g., generalizability beyond AI C code)."
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper does not explicitly state what the results do NOT show. While it mentions plans to test other LLMs in the future, it does not provide explicit scope boundaries like 'these results apply only to GPT-3.5-Turbo on memory safety bugs in neural network C code.'"
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "While the code repositories are linked, the actual experimental results (the 14,400 LLM outputs, ESBMC verification results, etc.) are not made available as raw data files for independent verification."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 3 thoroughly describes how the dataset was created: starting from NeuroCodeBench, using Mull for mutation, expanding from 505 to 81,129 samples, and classifying with ESBMC. The 100 random sample selection for experiments is mentioned."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were involved. The study uses automated benchmarks and LLM APIs."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The full pipeline from NeuroCodeBench through mutation with Mull to ESBMC classification is documented in Section 3 with Figure 3 providing a visual overview. Numbers at each stage are provided (505 base samples, 81,129 after mutation, breakdown by category in Figure 4a)."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding information or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Author affiliations are clearly listed: University of Manchester and Federal University of Amazonas. The paper evaluates GPT-3.5-Turbo and ESBMC; the authors are ESBMC developers (Cordeiro is a known ESBMC author), which is relevant to interpreting the results, but the affiliations themselves are disclosed."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding is disclosed, so independence cannot be assessed. Additionally, the authors are the developers of ESBMC and ESBMC-AI, the tools being evaluated, which represents a potential conflict of interest that is not acknowledged."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement is present. The authors evaluate their own tool (ESBMC-AI) without disclosing whether they have financial interests related to it."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper does not state the training data cutoff for GPT-3.5-Turbo. It argues that the mutated code is 'out-of-distribution' because NeuroCodeBench was released in late 2023, but does not state the actual training cutoff date."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section 4 explicitly discusses this: 'the original NeuroCodeBench is in the public domain, it was released in late 2023, and it is thus unlikely to have been included in the training set of most state-of-the-art large language models. In addition, our automated mutation technique further ensures that our AI code dataset looks very different from any piece of software the current large language models have been trained on.'"
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "The paper argues that contamination risk is low because (1) NeuroCodeBench was released in late 2023 and (2) the mutation process creates code that is different from anything in training sets. This is a reasonable argument even if not formally verified."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants were involved in this study."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants were involved in this study."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants were involved in this study."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants were involved in this study."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants were involved in this study."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants were involved in this study."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants were involved in this study."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper runs 14,400 GPT-3.5-Turbo API calls for Section 4 and additional calls for the iterative experiments in Section 5, but never reports the API cost, tokens consumed, or wall-clock time for the LLM inference."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "Hardware specifications are given (Intel Xeon CPUs, 32 GB / 198 GB RAM) but no total compute budget is stated: no GPU hours, no total API spend, no total experiment runtime."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "ESBMC can efficiently verify memory safety properties in neural network C code, contrasting with prior claims that software verifiers struggle with such code.",
    290       "evidence": "Section 3.3 and Figure 4 show ESBMC classified a sizable portion of the 81,129 mutated programs as safe or unsafe within a 300-second timeout, with verification times often in the range of seconds to minutes.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "The best single-iteration prompt engineering approach achieves approximately 18% successful repair rate using the one-line source code strategy with persona prompts.",
    295       "evidence": "Section 4.3 and Figure 8b report that the best one-line prompt (template 11, role 'Automated Code Repair Tool', counterexample feedback) achieved ~18% verified repairs. Tables 2-3 provide compilation and verification rates across all configurations.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Iterative automated program repair increases the successful repair rate from ~18% to ~25% compared to single-iteration approaches.",
    300       "evidence": "Section 5.4 states 'the best results of non-iterative APR were ~18%, while the iterative approach explored in Section 5 increased the successful repairs to ~25%.' Figure 11 shows per-attempt success rates.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Providing verifier feedback in the prompt worsens LLM repair performance.",
    305       "evidence": "Figures 6b, 7b show that adding ESBMC output lowers C/C++ detection scores and source code match for the contextual strategy. Section 4.5 states 'providing feedback from a verifier about the nature of the vulnerability makes the performance worse.'",
    306       "supported": "weak"
    307     },
    308     {
    309       "claim": "Forward History message format is the best approach for iterative APR, and lower temperature yields higher repair accuracy.",
    310       "evidence": "Figures 11 and 12 show Forward History outperforming LSO and Reverse History, with temperature 0.0 yielding the best results for Forward History. Section 5.4 summarizes these findings.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Persona prompts produce C-like code more consistently than simple prompts.",
    315       "evidence": "Figure 6a shows persona prompts yield higher C/C++ detector scores than simple prompts for the contextual strategy.",
    316       "supported": "weak"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The paper expands NeuroCodeBench from 505 to 81,129 neural network C programs via mutation and uses ESBMC to identify memory safety vulnerabilities. Using GPT-3.5-Turbo with 144 prompt engineering configurations, the best single-iteration repair rate is approximately 18% (one-line source code with persona prompt), while iterative repair with forward history reaches approximately 25%. The study reveals that providing verifier feedback paradoxically worsens repair performance, and that the specific persona role assigned to the LLM has minimal effect on outcomes.",
    323   "red_flags": [
    324     {
    325       "flag": "Authors evaluate their own tool",
    326       "detail": "The authors are developers of ESBMC and ESBMC-AI, the core tools being evaluated. This conflict of interest is not acknowledged anywhere in the paper."
    327     },
    328     {
    329       "flag": "Single LLM tested with broad claims",
    330       "detail": "Only GPT-3.5-Turbo is tested, but the title and text frequently reference 'Large Language Models' in the plural, suggesting broader generalizability than the evidence supports."
    331     },
    332     {
    333       "flag": "No statistical significance testing",
    334       "detail": "Comparisons between prompt configurations are made by comparing raw percentages from 100 samples without any significance tests. With small sample sizes and low repair rates (at most ~18% in the single-iteration experiments), many observed differences could be due to random variation."
    335     },
    336     {
    337       "flag": "No repeated experimental runs",
    338       "detail": "Experiments appear to be single runs per configuration, and the Section 4 experiments use the default temperature of 1.0 (stochastic sampling), meaning results could vary significantly across repetitions. The paper acknowledges that 'temporary changes in the ChatGPT backend' could affect results but does not address this with repeated runs."
    339     },
    340     {
    341       "flag": "No cost reporting despite large API usage",
    342       "detail": "14,400+ API calls to GPT-3.5-Turbo in Section 4 alone, plus extensive iterative experiments in Section 5, with no cost or token consumption reported."
    343     },
    344     {
    345       "flag": "Missing limitations section",
    346       "detail": "The paper has no dedicated limitations or threats-to-validity section, which is a significant omission for empirical work."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "A new era in software security: Towards self-healing software via large language models and formal verification",
    352       "authors": ["Y. Charalambous", "N. Tihanyi", "R. Jain", "Y. Sun", "M. A. Ferrag", "L. C. Cordeiro"],
    353       "year": 2023,
    354       "arxiv_id": "2305.14752",
    355       "relevance": "Precursor work on using LLMs with formal verification for automated code repair, providing the baseline prompt template evaluated in this paper."
    356     },
    357     {
    358       "title": "Automated repair of programs from large language models",
    359       "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. H. Tan"],
    360       "year": 2023,
    361       "relevance": "Contemporary work on LLM-based automated program repair, directly relevant to the survey scope of AI-assisted code generation and repair."
    362     },
    363     {
    364       "title": "A prompt pattern catalog to enhance prompt engineering with ChatGPT",
    365       "authors": ["J. White", "Q. Fu", "S. Hays", "M. Sandborn", "C. Olea", "H. Gilbert", "A. Elnashar", "J. Spencer-Smith", "D. C. Schmidt"],
    366       "year": 2023,
    367       "arxiv_id": "2302.11382",
    368       "relevance": "Foundational work on prompt engineering patterns used as theoretical basis for the prompting strategies compared in this paper."
    369     },
    370     {
    371       "title": "Language models are few-shot learners",
    372       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    373       "year": 2020,
    374       "relevance": "GPT-3 paper providing the foundation for the GPT-3.5-Turbo model used in the experiments."
    375     },
    376     {
    377       "title": "How is ChatGPT's behavior changing over time?",
    378       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    379       "year": 2023,
    380       "arxiv_id": "2307.09009",
    381       "relevance": "Addresses temporal instability in LLM behavior, cited to explain anomalous experimental results — relevant to evaluation methodology for LLM studies."
    382     },
    383     {
    384       "title": "Pitfalls in machine learning research: Reexamining the development cycle",
    385       "authors": ["S. Biderman", "W. J. Scheirer"],
    386       "year": 2020,
    387       "relevance": "Discusses methodological pitfalls in ML research, relevant to the survey's focus on research quality assessment."
    388     },
    389     {
    390       "title": "NeuroCodeBench: a Plain C Neural Network Benchmark for Software Verification",
    391       "authors": ["E. Manino", "R. S. Menezes", "F. Shmarov", "L. C. Cordeiro"],
    392       "year": 2023,
    393       "relevance": "The foundational benchmark expanded in this paper, central to the evaluation of AI code verification and repair."
    394     },
    395     {
    396       "title": "Taxonomy of real faults in deep learning systems",
    397       "authors": ["N. Humbatova", "G. Jahangirova", "G. Bavota", "V. Riccio", "A. Stocco", "P. Tonella"],
    398       "year": 2020,
    399       "relevance": "Provides taxonomy of bugs in deep learning systems, motivating the focus on memory safety vulnerabilities in neural network code."
    400     },
    401     {
    402       "title": "Language models are unsupervised multitask learners",
    403       "authors": ["A. Radford", "J. Wu", "R. Child", "D. Luan", "D. Amodei", "I. Sutskever"],
    404       "year": 2019,
    405       "relevance": "GPT-2 paper referenced for the GPT family architecture underlying the LLM used in experiments."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs