ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25712B)


      1 {
      2   "paper": {
      3     "title": "CloudFix: Automated Policy Repair for Cloud Access Control Policies Using Large Language Models",
      4     "authors": [
      5       "Bethel Hall",
      6       "Owen Ungaro",
      7       "William Eiers"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv preprint",
     11     "arxiv_id": "2512.09957"
     12   },
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Section IX states 'The policy dataset and accompanying source code used in this study are publicly available at https://github.com/bethelhall/fixmypolicy.' A GitHub URL is provided."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Section IX states the policy dataset is publicly available at the same GitHub URL. Section IV also states 'we curate real-world AWS IAM policies from the AWS re:Post forum and make them available to the research community.'"
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Section V-A describes hardware (Rocky Linux 9.3, NVIDIA L40S GPU, etc.) and states models were loaded via HuggingFace transformers. However, there are no requirements.txt, Dockerfile, or detailed library version listings sufficient to recreate the environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the source code and dataset are released, the paper does not describe step-by-step reproduction instructions, a README with commands to run, or specific scripts to replicate the main experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables II-V report point estimates of accuracy percentages and counts without confidence intervals or error bars. No uncertainty quantification is provided for any of the main results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Table II reports a two-tailed t-test comparing mean repair accuracies between baseline and Fault Localization, with p-values for each request size (p<0.001 for sizes 10, 20, 30; p=0.056 for size 50)."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table II reports the difference in accuracy percentage points between baseline and Fault Localization (e.g., +17.55pp for size 10, +7.29pp for size 20), alongside the baseline values, providing enough context to judge magnitude."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The dataset of 282 policies is described but not justified via power analysis or any explanation of why this sample size is sufficient. The RQ5 subset of only 10 policies is also not justified."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. Tables show only means (e.g., 'Avg. Total Time') and counts without any indication of variability across policies."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares its Fault Localization guided approach against a baseline approach (simple prompt without fault localization). Section V-C describes both approaches and explicitly notes the baseline 'serves as an ablation to isolate the impact of Fault Localization.'"
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is the authors' own simplified prompt (no Fault Localization). No comparison is made against prior symbolic policy repair tools (e.g., Eiers et al. 2023, D'Antoni et al. 2024) or other LLM-based repair approaches. The related work section discusses these but they are not experimentally compared."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baseline vs. Fault Localization comparison functions as an ablation study, as stated in Section V-C: 'The baseline serves as an ablation to isolate the impact of Fault Localization.' This is a controlled single-variable manipulation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section V-D defines three evaluation metrics: repair accuracy (percentage of correctly classified requests), iteration count, and total repair time. Tables III and IV report all three."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "There is no human evaluation of repair quality. Evaluation is entirely automated via SMT solver verification. Given that the repairs are access control policies, a human evaluation of repair quality or security implications would add value."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "RQ4 explicitly tests generalization to held-out requests: 'we repaired each policy using both approaches, then tested generalizability by evaluating the repaired policies on 15 held-out variant requests not provided during the repair process.'"
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table III provides per-request-size breakdowns. Table V provides per-model breakdowns. Results are reported across four request sizes (10, 20, 30, 50) and four models, with three accuracy categories (100%, 80-99%, <80%)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section VI discusses failure modes in detail: 'condition omission (e.g., dropping IpAddress constraints), entity hallucination (substituting arn:aws:iam::123456789012:root for the actual principal), and syntactic degeneration where 46% of policies showed no improvement due to repeated invalid JSON generation until token exhaustion.'"
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that at request size 50, the Fault Localization improvement is not statistically significant (p=0.056). It also reports poor performance of DeepSeek-Coder and Granite (75%+ failed repairs) and notes 'accuracy degradation at larger request sizes' where Fault Localization only generated 7 complete repairs."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 'CloudFix improves repair accuracy over a baseline implementation across varying request sizes.' This is supported by Table II/III showing statistically significant improvements for request sizes 10, 20, and 30, though the abstract does not mention the non-significant result at size 50."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims about Fault Localization improving repair performance. This is supported by a controlled ablation design where the only variable changed between baseline and FL-guided approaches is the inclusion of the fault localization report, with other factors held constant (same prompts, models, configurations)."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'Cloud Access Control Policies' but experiments are only on AWS IAM policies. The abstract claims 'enabling efficient and automated repair of cloud access control policies' without bounding to AWS. Section II-A acknowledges 'the techniques we present in this work are not specific to the AWS IAM policy language and can be applied to any policy language' but provides no evidence for other policy languages."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "Section VI discusses some limitations but does not discuss alternative explanations for the observed improvements. For example, the improvement could be due to the additional context length rather than the fault localization information specifically, or CodeLlama's dominance could reflect training data contamination with AWS policy formats. These alternatives are not considered."
    130       }
    131     },
    132     "setup_transparency": {
    133       "model_versions_specified": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper lists model names such as 'CodeLLaMA-7B-Instruct', 'Granite-3.3-8B-Instruct', 'DeepSeek-Coder-7B-Instruct', and 'Llama3.2-3B-Instruct'. For RQ5, 'GPT-5, accessed via its web interface' is mentioned without any version or snapshot date. The open-source models have size identifiers but no specific checkpoint versions or dates."
    137       },
    138       "prompts_provided": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Figure 3 shows a detailed example of the Fault Localization guided prompt. Section III-C states 'The complete prompts used in our experiments are provided in the accompanying artifact repository (see Section IX).' The prompt structure is also fully described."
    142       },
    143       "hyperparameters_reported": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper does not report temperature, top-p, max tokens, or other generation hyperparameters. Section V-B states 'We maintain similar prompts and model hyperparameters for all models used' but does not list the actual values."
    147       },
    148       "scaffolding_described": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section III provides a detailed description of the agentic scaffolding: the iterative repair loop (Alg. 1), Goal Validator (Alg. 2), Fault Localizer (Alg. 3), Prompt Generator, and Repair Synthesizer. The maximum iteration threshold, feedback mechanism, and workflow are all documented."
    152       },
    153       "data_preprocessing_documented": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IV-A describes the data pipeline: scraping 9,968 IAM-related posts, scanning for JSON structures with IAM policy keys, validating with Quacky, applying syntactic repair to enforce IAM conventions, resulting in 282 faulty policies and 45 community-accepted repairs. Section IV-B documents the request generation process (Alg. 4)."
    157       }
    158     },
    159     "limitations_and_scope": {
    160       "limitations_section_present": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section VI is titled 'Discussion and Threat to Validity' and contains substantive discussion of limitations including dataset representativeness, performance degradation at scale, and hardware constraints."
    164       },
    165       "threats_to_validity_specific": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI discusses specific threats: 'Our dataset relies on policies from AWS re:Post with algorithmically generated requests, which may not fully capture real-world complexity.' It also notes 'accuracy degraded as request size grew to 50' and 'We restricted evaluation to smaller open-source models due to hardware constraints.'"
    169       },
    170       "scope_boundaries_stated": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "While the paper mentions some limitations in Section VI, it does not explicitly state scope boundaries about what the results do NOT show. It does not clearly say the results are limited to AWS IAM policies, small open-source models, or synthetically generated requests. The claims remain broad relative to the tested setting."
    174       }
    175     },
    176     "data_integrity": {
    177       "raw_data_available": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section IX states the dataset is publicly available at the GitHub repository. The 282 policies and generated request sets can be independently verified."
    181       },
    182       "data_collection_described": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section IV-A describes data collection: scraping 9,968 IAM-related posts from AWS re:Post, extracting JSON structures, validating with Quacky, and applying syntactic repair to obtain 282 policies. Table I provides summary statistics."
    186       },
    187       "recruitment_methods_described": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No human participants were involved. The dataset consists of access control policies scraped from public forum posts. This is not a human subjects study."
    191       },
    192       "data_pipeline_documented": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section IV documents the pipeline: scraping posts (9,968) → detecting JSON with IAM keys → validating with Quacky → syntactic repair → 282 faulty policies and 45 ground truth repairs. Request generation is also documented (Alg. 4) with parameters for allow/deny ratios and misclassification rates."
    196       }
    197     },
    198     "conflicts_of_interest": {
    199       "funding_disclosed": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants or sponsors."
    203       },
    204       "affiliations_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "All three authors are listed with their affiliation at Stevens Institute of Technology, Department of Computer Science. One author (William Eiers) has prior work with the Quacky tool used in the framework, which is disclosed through co-authorship on the cited references."
    208       },
    209       "funder_independent_of_outcome": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding information is disclosed, so independence cannot be assessed. The absence of a funding disclosure does not mean the work is unfunded."
    213       },
    214       "financial_interests_declared": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    218       }
    219     },
    220     "contamination": {
    221       "training_cutoff_stated": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper evaluates multiple LLMs on AWS IAM policies but does not state the training data cutoff for any of the models used (CodeLlama, Llama 3.2, DeepSeek-Coder, Granite, or GPT-5). AWS IAM policy examples are widely available online and could be in training data."
    225       },
    226       "train_test_overlap_discussed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The policies are scraped from public AWS re:Post forums, which are publicly accessible on the internet. The LLMs used could have been trained on these same posts. No discussion of potential train/test overlap."
    230       },
    231       "benchmark_contamination_addressed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The AWS re:Post forum posts containing the policies are publicly available online and may predate the training cutoffs of the models used, which the paper does not state. The paper does not address whether the models may have seen these policies during training."
    235       }
    236     },
    237     "human_studies": {
    238       "pre_registered": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants were involved in this study. It is a benchmark evaluation of an automated tool."
    242       },
    243       "irb_or_ethics_approval": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants. The study uses publicly available forum posts and automated evaluation."
    247       },
    248       "demographics_reported": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "inclusion_exclusion_criteria": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "randomization_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "blinding_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "attrition_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       }
    273     },
    274     "cost_and_practicality": {
    275       "inference_cost_reported": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Table IV reports average total repair time in seconds (e.g., 136.5s for FL at request size 10, up to 1306.9s at size 50), broken down into SMT time and LLM time, along with average iteration counts. This provides wall-clock time cost per policy repair."
    279       },
    280       "compute_budget_stated": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section V-A states: 'Each job was allocated a single NVIDIA L40S GPU with 46 GB of memory, two Intel(R) Xeon(R) Platinum 8562Y+ CPU cores, and 256 GB of system RAM.' Table IV reports timing across all experiments. However, total GPU hours for the full experimental campaign are not stated."
    284       }
    285     }
    286   },
    287   "claims": [
    288     {
    289       "claim": "CloudFix with Fault Localization achieves an 84.0% complete repair rate on 10-request policies, compared to 48.2% for the baseline.",
    290       "evidence": "Table III shows 237/282 (84.0%) complete repairs for FL vs 136/282 (48.2%) for baseline at request size 10. Table II shows p<0.001 for this comparison.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "Fault Localization maintains a 54.3% complete repair rate on 30-request policies, while the baseline drops to 22.3%.",
    295       "evidence": "Table III shows 153/282 (54.3%) for FL vs 63/282 (22.3%) for baseline at request size 30. Table II shows p<0.001.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "At request size 50, Fault Localization improvement over baseline is not statistically significant.",
    300       "evidence": "Table II reports p=0.056 for request size 50 with only a 3.85pp improvement (66.28% vs 62.44%). Only 7 complete repairs achieved by FL vs 0 by baseline.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "CodeLlama-7B-Instruct achieves the highest overall repair accuracy (92.3%) among four open-source LLMs.",
    305       "evidence": "Figure 5 and Table V show CodeLlama at 92.3% overall accuracy, with 54.3% complete repair rate at request size 30, compared to Llama (63.7%), DeepSeek-Coder (54.0%), and Granite (53.7%).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "CloudFix generates repairs that generalize to semantically similar unseen requests.",
    310       "evidence": "RQ4 results (Figure 6) show FL achieves 84.9% generalization accuracy on 15 held-out variant requests, compared to 82.8% repair accuracy on provided requests.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "CloudFix is the first automated policy repair framework for cloud access control that combines formal methods with LLMs.",
    315       "evidence": "Section VII reviews related work and identifies no prior work combining LLMs with formal verification for access control policy repair. However, this is a novelty claim that depends on complete literature coverage.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "CloudFix can repair policies using LLM-synthesized requests from natural language descriptions.",
    320       "evidence": "RQ5 (Figure 7) shows 86.4% FL accuracy and 85.2% baseline accuracy on 10 policies with GPT-5-generated requests. Very small sample (N=10) limits generalizability.",
    321       "supported": "weak"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "CloudFix combines SMT-based fault localization with LLM-driven repair to fix cloud access control policies, achieving an 84% complete repair rate at 10-request complexity versus 48.2% for an unguided baseline. Performance degrades sharply at larger request sizes (50 requests), with only 7 complete repairs out of 282 policies. CodeLlama-7B-Instruct substantially outperforms three other open-source models of comparable size, and the approach shows an ability to generalize to held-out, semantically similar requests.",
    328   "red_flags": [
    329     {
    330       "flag": "No external baselines",
    331       "detail": "The only baseline is the authors' own simplified prompt approach. Prior symbolic repair tools (Eiers et al. 2023, D'Antoni et al. 2024, Xu and Peng 2016) are discussed in related work but not experimentally compared, making it impossible to assess whether CloudFix outperforms existing non-LLM approaches."
    332     },
    333     {
    334       "flag": "Contamination risk with public forum data",
    335       "detail": "The 282 policies were scraped from publicly available AWS re:Post forums. All evaluated LLMs could have been trained on this exact data. The policies and their correct versions may be in the models' training sets, which would inflate repair accuracy."
    336     },
    337     {
    338       "flag": "Synthetic request sets may not reflect real-world complexity",
    339       "detail": "Request sets are algorithmically generated (Alg. 4) rather than derived from real access logs. The paper acknowledges this in the discussion but the entire evaluation depends on these synthetic requests."
    340     },
    341     {
    342       "flag": "Very small sample for RQ5",
    343       "detail": "RQ5 uses only 10 policies (from the 45 with ground truth), a very small sample for drawing conclusions about natural language intent-driven repair. The 86.4% vs 85.2% difference between FL and baseline is likely not meaningful at this sample size."
    344     },
    345     {
    346       "flag": "No hyperparameters reported",
    347       "detail": "Temperature, top-p, max tokens, and other generation parameters are not reported despite being critical for LLM output quality. The paper states models used 'similar hyperparameters' without specifying values."
    348     },
    349     {
    350       "flag": "No variance across runs",
    351       "detail": "LLM outputs are stochastic but no repeated runs or variance measures are reported. All results appear to be single-run, making it impossible to assess result stability."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Large language models for software engineering: A systematic literature review",
    357       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    358       "year": 2024,
    359       "relevance": "Comprehensive survey of LLMs for software engineering, directly relevant to assessing methodological quality of LLM-for-SE research."
    360     },
    361     {
    362       "title": "A systematic literature review on large language models for automated program repair",
    363       "authors": ["Q. Zhang", "C. Fang", "Y. Xie"],
    364       "year": 2024,
    365       "arxiv_id": "2405.01466",
    366       "relevance": "Survey of LLM-based automated program repair, the broader category that CloudFix's approach falls into."
    367     },
    368     {
    369       "title": "Hybrid automated program repair by combining large language models and program analysis",
    370       "authors": ["F. Li", "J. Jiang", "J. Sun", "H. Zhang"],
    371       "year": 2025,
    372       "relevance": "Closely related hybrid LLM+formal methods approach to automated program repair."
    373     },
    374     {
    375       "title": "A deep dive into large language models for automated bug localization and repair",
    376       "authors": ["S. B. Hossain", "N. Jiang", "Q. Zhou"],
    377       "year": 2024,
    378       "relevance": "Evaluates LLMs for bug localization and repair, related methodology for assessing LLM repair capabilities."
    379     },
    380     {
    381       "title": "Automated program repair in the era of large pre-trained language models",
    382       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    383       "year": 2023,
    384       "relevance": "Early work on LLMs for automated program repair, benchmark evaluation methodology relevant to survey."
    385     },
    386     {
    387       "title": "Counterexample guided program repair using zero-shot learning and maxsat-based fault localization",
    388       "authors": ["P. Orvalho", "M. Janota", "V. M. Manquinho"],
    389       "year": 2025,
    390       "relevance": "Most closely related work: combines formal counterexample guidance with LLMs for program repair."
    391     },
    392     {
    393       "title": "Synthesizing access control policies using large language models",
    394       "authors": ["A. Vatsa", "P. Patel", "W. Eiers"],
    395       "year": 2025,
    396       "arxiv_id": "2503.11573",
    397       "relevance": "LLM-based access control policy synthesis from the same research group, directly related to policy generation evaluation."
    398     },
    399     {
    400       "title": "Empirical evaluation of generalizable automated program repair with large language models",
    401       "authors": ["V. Campos", "R. Shariffdeen", "A. Ulges", "Y. Noller"],
    402       "year": 2025,
    403       "arxiv_id": "2506.03283",
    404       "relevance": "Empirical evaluation of LLM-based program repair generalizability, related methodological assessment."
    405     },
    406     {
    407       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    408       "authors": ["L. Huang", "W. Yu", "W. Ma"],
    409       "year": 2025,
    410       "relevance": "LLM hallucination survey, relevant to understanding failure modes when LLMs generate incorrect policy repairs."
    411     }
    412   ]
    413 }

Impressum · Datenschutz