scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28098B)
      1 {
      2   "paper": {
      3     "title": "LLMSecConfig: An LLM-Based Approach for Fixing Software Container Misconfigurations",
      4     "authors": ["Ziyang Ye", "Triet Huynh Minh Le", "M. Ali Babar"],
      5     "year": 2025,
      6     "venue": "IEEE Working Conference on Mining Software Repositories",
      7     "arxiv_id": "2502.02009",
      8     "doi": "10.1109/MSR66628.2025.00099"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "LLMSecConfig combines static analysis tools (Checkov) with LLMs and RAG to automatically repair Kubernetes security misconfigurations. Mistral Large 2 achieved a 94.3% repair pass rate on 1,000 real-world configurations, significantly outperforming GPT-4o-mini (40.2%). An ablation study found that source code context (90.3% PR) was more useful than documentation (65.2% PR), while combining all context types yielded the best results (94.3%).",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Section IX and contribution 3 state: 'we provide our complete implementation, including the collected dataset of Kubernetes misconfiguration, source code of the framework, and evaluation scripts at https://figshare.com/s/2a9be8ccfbec9d8ba199.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The same figshare link includes the collected dataset of Kubernetes misconfigurations, as stated in Section IX and contribution 3."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specification is mentioned in the paper. The paper mentions Python (Checkov is pure Python) but provides no dependency versions or environment setup instructions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While source code and evaluation scripts are provided via figshare, the paper contains no step-by-step reproduction instructions. No README with commands or 'Reproducing Results' section is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables I and II report only point estimates (e.g., 94.3% PR, 40.2% PR) with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims Mistral Large 2 'significantly outperforms' GPT-4o-mini and reports different performance across context types (Table II) without any statistical significance tests (no p-values, t-tests, or other tests)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results in Tables I and II provide baseline context for all comparisons (e.g., Mistral 94.3% vs GPT-4o-mini 40.2% PR; ablation from 88.0% base to 94.3% full context), allowing readers to assess the magnitude of differences."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The dataset comprises 1,000 configurations from the top 1,000 ArtifactHub projects, but no justification is given for why 1,000 is sufficient, and no power analysis is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or repeat-run statistics are reported anywhere in Tables I or II."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "GPT-4o-mini serves as a baseline in RQ1 (Table I), and the ablation study in RQ3 (Table II) compares four context configurations as baselines against each other."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Both Mistral Large 2 (2407) and GPT-4o-mini are 2024 models, making them contemporary at the time of writing. However, only two models are tested and both are on the cost-efficient end of the model spectrum."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ3 (Table II) is a systematic ablation study varying context types: Checkov output only, + source code, + Prisma documentation, and full context. This demonstrates which components contribute to performance."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Seven metrics are used: Pass Rate, Parse Success Rate, Average Pass Steps, AUCPRS, AUCAPSS, Security Improvement, and Average Introduced Errors (Section IV-D, Tables I-II)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Evaluation is entirely automated via Checkov (SAT) scans. No human evaluation of repair quality, operational correctness, or semantic preservation is performed."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper set temperature=0.5 based on prior research and chose max_retry=5 'through experimentation' (Section V-A), but does not mention a separate validation set. The same 1,000 configurations appear to be used for both tuning and final evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "RQ2 (Fig. 8) provides per-security-policy-type breakdown of pass rates for both models across different misconfiguration categories."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "RQ2 identifies and discusses categories of misconfigurations that are hard to correct, including complex privilege-related configurations for GPT-4o-mini (<50% PR) and advanced network policies for Mistral Large 2."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "GPT-4o-mini's poor performance (40.2% PR) is reported as a negative result. The ablation study shows that adding only Prisma documentation to the Checkov output decreased performance (65.2% vs 88.0% for the Checkov-output-only base), which is a negative finding."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims '94% success rate' (Table I: 94.3%) and 'low rate of introducing new misconfigurations' (Table I: 0.024) are supported by the results. The abstract's claim about '1,000 real-world Kubernetes configurations' matches the experimental setup."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (RQ3) makes causal claims about which context types contribute to performance through controlled single-variable manipulation (adding/removing context sources while holding other variables constant). This is adequate for the ablation claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper title says 'Software Container Misconfigurations' and abstract discusses 'Container Orchestrators (COs)' generally, but all experiments are on Kubernetes only. While Section IV-A notes Kubernetes as a 'case study,' the title and abstract do not adequately bound the scope to Kubernetes."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section VI-B discusses threats to validity but focuses on parameter settings and dataset representativeness. No alternative explanations are offered for the large performance gap between models, or why Prisma documentation hurts performance (described as 'noise' without deeper analysis)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures Checkov pass rate and frames this as 'security configuration repair.' Passing Checkov checks is a proxy for actual security — a configuration could pass all Checkov checks and still be insecure via vectors Checkov doesn't cover. This gap is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "'mistral-large-2407' includes a version date, but 'GPT-4o-mini' is stated without a snapshot date or API version. Per schema criteria, marketing names without version identifiers do not count as specified."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section III-C states 'More details about the prompt templates can be found in the Appendix at https://figshare.com/s/2a9be8ccfbec9d8ba199.' A repository link containing prompts meets the criterion."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section V-A reports temperature=0.5, maximum parser retry time=10, and maximum retry value=5. Default values were used for other hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The three-phase pipeline is described in detail in Section III: SAT integration, context retrieval (RAG with three context sources), repair generation with inner/outer retry loops, and validation. Fig. 2 provides an architecture diagram."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-B documents the full data pipeline: ArtifactHub API → top 1,000 projects → Helm chart conversion → YAML parsing into individual sub-configurations → SAT scanning → retention of files with detected issues → 1,000 files."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VI-B 'Threats to Validity' provides a dedicated subsection discussing internal and external validity threats."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Internal validity discusses 'potential sub-optimal configuration of our framework and baseline models' and their mitigation via sensitivity analysis. External validity notes results 'may not generalise to all CO platforms and security scenarios' with specific mention of Kubernetes focus."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While threats to validity mention generalizability concerns, the paper does not explicitly state what the results do NOT show. No clear enumeration of untested scenarios, excluded configuration types, or claims the authors are not making."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section IX provides a figshare link with the complete dataset of Kubernetes misconfigurations, enabling independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section IV-B describes the collection: ArtifactHub API was used to identify top 1,000 most popular projects based on internal ranking, followed by Helm chart conversion and SAT scanning."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section IV-B describes the sample selection: top 1,000 most popular projects from ArtifactHub's internal ranking system, ensuring 'configurations widely used and representative of common industry practices.' However, this popularity-based selection could introduce bias toward well-maintained configurations."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section IV-B documents each step: API access → top 1,000 projects → Helm chart to raw YAML conversion → individual sub-configuration isolation → SAT analysis → retention of misconfigured files → 1,000 files. Fig. 6 shows the distribution of misconfiguration types."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from CREST (Centre for Research on Engineering Software Technologies) and the University of Adelaide. They do not evaluate their own product, so there is no product affiliation conflict."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure is a transparency gap."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement, patent disclosure, or financial interest declaration is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for either Mistral Large 2 or GPT-4o-mini. The models are evaluated on publicly available Kubernetes configurations that could have been in their training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The 1,000 configurations are from ArtifactHub, a public repository. Both models could have seen these configurations during training. This overlap is not discussed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "ArtifactHub configurations are publicly available and likely predate both models' training cutoffs. No discussion of whether models have memorized common Kubernetes security patterns from training data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The evaluation is entirely automated using SATs on Kubernetes configurations."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates LLMs on configuration files."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Despite proposing a framework for production use and using API-based models with retry mechanisms (up to 5 retries × 10 parser retries), no inference costs, tokens consumed, API costs, or wall-clock times are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget, GPU hours, API spend, or hardware specifications are mentioned despite running 1,000 configurations through multiple retry iterations."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or seed sensitivity analysis. Temperature=0.5 introduces non-determinism but results appear to be from single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. Results appear to be single-run, with the retry mechanism being part of the method rather than repeated independent evaluations."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Temperature was selected 'based on findings from prior research' (citation [44], which is about graph tasks) and max_retry was determined 'through experimentation,' but no search budget is reported — how many configurations were tried is unstated."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The selection of temperature=0.5 is justified by citing prior work on graph tasks [44], not configuration repair. Max_retry=5 was chosen 'through experimentation' without describing the selection process or showing results for other values."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors designed the LLMSecConfig framework and evaluate it without acknowledging potential self-evaluation bias. No independent evaluation or discussion of this bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Mistral Large 2 and GPT-4o-mini likely have different computational costs, but no comparison at matched compute budgets is provided. The paper does not discuss whether the performance gap is due to model capability or compute differences."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses Checkov pass rate as the primary measure of security but does not discuss whether passing Checkov checks actually means configurations are secure. Checkov's coverage of all security issues is not questioned."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Both Mistral Large 2 and GPT-4o-mini are evaluated within the same LLMSecConfig framework with identical prompts, context retrieval, and retry mechanisms, controlling for the scaffold confound across model comparisons."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "ArtifactHub configurations are publicly available and could have been in both models' training data. No discussion of temporal relationship between dataset creation and model training cutoffs."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The framework provides Checkov error messages, policy source code, and Prisma documentation as context. Whether this level of context would be available in real deployment (particularly the source code of checks) is not discussed as potential leakage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether configurations from the same ArtifactHub projects share structural similarities that could inflate results, or whether the top 1,000 projects represent independent samples."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used despite the dataset being derived from publicly available sources."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Mistral Large 2 achieves a 94.3% repair pass rate on 1,000 real-world Kubernetes configurations with 100% parse success rate.",
    365       "evidence": "Table I in Section V-A: Mistral Large 2 achieves 94.3% PR, 100% PSR, 3.06 APS, 0.986 security improvement, and 0.024 average introduced errors.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "GPT-4o-mini achieves only a 40.2% pass rate, significantly underperforming Mistral Large 2.",
    370       "evidence": "Table I in Section V-A: GPT-4o-mini achieves 40.2% PR, 99.8% PSR, 4.38 APS. No statistical significance test accompanies the 'significantly outperforms' language.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Source code context provides the most effective guidance for repairs (90.3% PR), while Prisma documentation alone decreases performance (65.2% PR).",
    375       "evidence": "Table II in Section V-C: ckv_out+code achieves 90.3% PR vs base ckv_out at 88.0% and ckv_out+prisma at 65.2%. Full context (94.3%) is best overall.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Both models maintain low error introduction rates (<0.03), demonstrating viability for production use.",
    380       "evidence": "Table I: GPT-4o-mini 0.029, Mistral Large 2 0.024 average introduced errors. However, production viability claim is unsupported — no human evaluation, cost analysis, or real-world deployment testing.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Complex security contexts, especially privilege-related configurations, are consistently challenging for GPT-4o-mini with <50% pass rates.",
    385       "evidence": "Section V-B and Fig. 8a show per-category breakdown with GPT-4o-mini struggling on privilege-related policies. Mistral Large 2 shows better but still imperfect performance on advanced network policies (Fig. 8b).",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Weak baseline selection",
    392       "detail": "Only two models are tested: Mistral Large 2 and GPT-4o-mini (OpenAI's cheapest/smallest model). No comparison with GPT-4o, GPT-4, Claude, or other competitive models. The baseline appears chosen to make the primary model look good."
    393     },
    394     {
    395       "flag": "No variance or repeat runs",
    396       "detail": "With temperature=0.5 introducing non-determinism and a retry mechanism, results could vary substantially across runs. All results appear to be from single runs with no error bars, standard deviations, or confidence intervals."
    397     },
    398     {
    399       "flag": "Contamination risk unaddressed",
    400       "detail": "Both models could have been trained on ArtifactHub configurations and Kubernetes security documentation. The models may have memorized common Kubernetes security patterns. This risk is entirely unacknowledged."
    401     },
    402     {
    403       "flag": "Hyperparameter justification from unrelated domain",
    404       "detail": "Temperature=0.5 is justified by citing [44] (Wang et al.), a paper about 'Reasoning with Large Language Models on Graph Tasks.' Configuration repair is a different task domain; the optimal temperature may differ."
    405     },
    406     {
    407       "flag": "No cost analysis for proposed production tool",
    408       "detail": "The paper advocates for production deployment (Section VI-A) but reports no API costs, latency, or compute requirements despite a retry mechanism that could multiply costs by up to 50x (5 retries × 10 parser retries per configuration)."
    409     },
    410     {
    411       "flag": "Proxy metric without validity discussion",
    412       "detail": "Checkov pass rate is used as the sole measure of 'security,' but Checkov cannot detect all security issues. A configuration passing Checkov checks may still be insecure, and this limitation is not discussed."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    418       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    419       "year": 2024,
    420       "relevance": "Comprehensive survey of LLMs in software engineering, directly relevant to the survey scope on LLM capabilities in SE tasks."
    421     },
    422     {
    423       "title": "Purple LLaMA CyberSecEval: A Secure Coding Benchmark for Language Models",
    424       "authors": ["M. Bhatt", "S. Chennabasappa"],
    425       "year": 2023,
    426       "arxiv_id": "2312.04724",
    427       "relevance": "Benchmark for evaluating LLM security capabilities in code generation, relevant to LLM safety and code quality evaluation."
    428     },
    429     {
    430       "title": "CyberSecEval 2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models",
    431       "authors": ["M. Bhatt", "S. Chennabasappa"],
    432       "year": 2024,
    433       "arxiv_id": "2404.13161",
    434       "relevance": "Extended security evaluation benchmark for LLMs, relevant to assessing AI safety in code-related tasks."
    435     },
    436     {
    437       "title": "CyberSecEval 3: Advancing the Evaluation of Cybersecurity Risks and Capabilities in Large Language Models",
    438       "authors": ["S. Wan", "C. Nikolaidis", "D. Song"],
    439       "year": 2024,
    440       "arxiv_id": "2408.01605",
    441       "relevance": "Third iteration of cybersecurity evaluation for LLMs, relevant to evolving AI safety benchmarks."
    442     },
    443     {
    444       "title": "Deep Learning for Source Code Modeling and Generation: Models, Applications, and Challenges",
    445       "authors": ["T. H. M. Le", "H. Chen", "M. A. Babar"],
    446       "year": 2020,
    447       "relevance": "Survey on deep learning for code generation, a predecessor to the LLM-based code generation work in the survey scope."
    448     },
    449     {
    450       "title": "Retrieval-Augmented Generation for Large Language Models: A Survey",
    451       "authors": ["Y. Gao", "Y. Xiong", "X. Gao"],
    452       "year": 2023,
    453       "arxiv_id": "2312.10997",
    454       "relevance": "Survey of RAG techniques used in LLM systems, relevant to understanding augmentation methods for LLM-based tools."
    455     },
    456     {
    457       "title": "VulRepair: A T5-Based Automated Software Vulnerability Repair",
    458       "authors": ["M. Fu", "C. Tantithamthavorn", "T. Le", "V. Nguyen", "D. Phung"],
    459       "year": 2022,
    460       "relevance": "LLM-based approach for automated vulnerability repair in source code, closely related to LLM code repair capabilities."
    461     },
    462     {
    463       "title": "LineVul: A Transformer-Based Line-Level Vulnerability Prediction",
    464       "authors": ["M. Fu", "C. Tantithamthavorn"],
    465       "year": 2022,
    466       "relevance": "Transformer-based vulnerability detection at line level, relevant to AI-assisted code security analysis."
    467     },
    468     {
    469       "title": "KGSecConfig: A Knowledge Graph Based Approach for Secured Container Orchestrator Configuration",
    470       "authors": ["M. U. Haque", "M. M. Kholoosi", "M. A. Babar"],
    471       "year": 2022,
    472       "relevance": "Prior approach to container security configuration using knowledge graphs, predecessor to LLMSecConfig and relevant baseline for AI-assisted security."
    473     },
    474     {
    475       "title": "Data Quality for Software Vulnerability Datasets",
    476       "authors": ["R. Croft", "M. A. Babar", "M. M. Kholoosi"],
    477       "year": 2023,
    478       "relevance": "Study on data quality in vulnerability datasets, relevant to understanding data quality issues in AI security evaluations."
    479     }
    480   ]
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs