scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33419B)
      1 {
      2   "paper": {
      3     "title": "IRIS: LLM-Assisted Static Analysis for Detecting Security Vulnerabilities",
      4     "authors": [
      5       "Ziyang Li",
      6       "Saikat Dutta",
      7       "Mayur Naik"
      8     ],
      9     "year": 2025,
     10     "venue": "International Conference on Learning Representations (ICLR 2025)",
     11     "arxiv_id": "2405.17238",
     12     "doi": "10.48550/arXiv.2405.17238"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "IRIS, a neuro-symbolic system combining LLMs with CodeQL static analysis, detects 55/120 vulnerabilities in CWE-Bench-Java (vs. 27 for CodeQL alone) while reducing the average false discovery rate by 5.21 percentage points. The approach uses LLMs to infer taint specifications (sources/sinks) and filter false positives via contextual analysis. Even small open-source models like DeepSeekCoder 7B detect 52 vulnerabilities, though contextual analysis only helps models with strong reasoning (GPT-4, Llama-3 70B). IRIS also discovered 4 previously unknown vulnerabilities in latest versions of Java projects.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Two public GitHub repositories are provided: https://github.com/iris-sast/iris for the IRIS tool and https://github.com/iris-sast/cwe-bench-java for the benchmark dataset. Both are referenced in the abstract and Section 4."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "CWE-Bench-Java is publicly available at https://github.com/iris-sast/cwe-bench-java, containing 120 vulnerabilities across 4 CWE classes with scripts to fetch, build, and analyze projects (Section 4)."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper specifies CodeQL 2.15.3, LLM model IDs (Table 6), hardware specs (Appendix C.1), and inference parameters (temperature=0, top_p=1, max_tokens=2048). However, no requirements.txt, Dockerfile, or comprehensive environment specification is provided. Java/Maven versions per project are handled by a semi-automated script but not listed."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The public repositories include scripts to fetch, build, and analyze Java projects (Section 4: 'The dataset and the corresponding scripts to fetch, build, and analyze the Java projects are available publicly'). CodeQL queries are provided in the appendix (Listings 3-5)."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 1 and 2 are point estimates (e.g., '55 detected', '84.82% FDR', '0.177 F1') with no confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims IRIS outperforms CodeQL and other baselines based solely on comparing raw numbers (e.g., 55 vs. 27 detected) without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Improvements are reported with baseline context throughout: '55 (+28)', '84.82% (↓5.21)', '0.177 (↑0.101)' in Table 1, and per-CWE deltas in Table 2. Relative improvement is also stated: '103.7% more than CodeQL'."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The dataset contains 120 vulnerabilities across 4 CWE classes (55/13/31/21 split). No power analysis or justification is given for why 120 is sufficient, and CWE-78 has only 13 instances, which is very small."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Results appear to be from single runs with temperature=0 and fixed seeds for GPT models. No standard deviation, variance, or spread measures are reported across multiple runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Four baselines are compared: CodeQL with built-in security queries, Facebook Infer, SpotBugs, and Snyk (Table 2, Section 5.1)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "CodeQL 2.15.3 (2024), Snyk, and Facebook Infer are contemporary and widely used static analysis tools. These represent the state of the art in static vulnerability detection."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 3 ablates source/sink specifications (LLM-inferred vs. CodeQL), showing both are necessary. Figure 9 ablates contextual analysis, showing its impact varies by LLM reasoning capability."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three metrics are used: #Detected (recall), Average FDR (precision proxy), and Average F1 Score (Section 3.6, Table 1)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Manual analysis of 50 random alarms found 27/50 exhibit potential attack surfaces (Section 5.2). Additionally, 960 randomly selected LLM-inferred labels were manually evaluated for precision (Section 5.4, Fig. 7)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No explicit train/dev/test split is described. The motivating example (cron-utils, Section 2) is from CWE-Bench-Java itself. Hyperparameters like context window size (±5 lines) and intermediate steps (S=10) were tuned with observations like 'we observed that setting S to 10 provides a good balance' (Appendix A.3), without specifying whether this tuning used benchmark data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 provides per-CWE breakdown of detected vulnerabilities for all baselines and IRIS variants (CWE-22, CWE-78, CWE-79, CWE-94)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.2 discusses CWE-78 being 'particularly challenging' due to 'gadget-chains or external side effects.' Appendix C.7 provides detailed error analysis with three main causes of undetected vulnerabilities and two LLM-induced failure modes."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Llama-3 8B underperforms CodeQL on CWE-22 (19 vs. 22 detected, Table 2). Contextual analysis hurts smaller models' F1 scores (Fig. 9). CWE-78 remains challenging for all LLMs (max 3/13 detected)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported: 'detects 55 (+28)' matches Table 1, 'improves upon CodeQL's average false discovery rate by 5% points' matches Table 1 (90.03% → 84.82%), '4 previously unknown vulnerabilities' is supported in Section 5.3."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims ('IRIS improves detection') are supported by controlled ablation studies: Table 3 shows removing LLM-inferred sources or sinks drastically reduces detection, and Fig. 9 isolates the effect of contextual analysis. Single-variable manipulation is adequate for these claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'LLM-Assisted Static Analysis for Detecting Security Vulnerabilities' is broader than what was tested: only Java projects, only 4 CWE classes, only CodeQL as the static analysis backbone. While limitations mention 'it is unknown if IRIS will perform well on other languages' (Section 7), the title and abstract frame the contribution more broadly."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not consider alternative explanations for IRIS's improvement: whether CodeQL's baseline configuration was suboptimal, whether CWE-Bench-Java selection favors specification-gap approaches, or whether improvements stem from expanded search space rather than LLM intelligence. Section 5.4 discusses over-approximation benefiting IRIS but this explains mechanism, not alternatives."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures vulnerability detection (known CVEs found) and FDR directly. These metrics closely match the claimed capability. The paper also acknowledges the FDR metric's limitation: 'the reported FDR is an upper bound' since true unknown vulnerabilities found by IRIS are counted as false positives (Section 5.2)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Exact model IDs are specified in Table 6: gpt-4-0125-preview, gpt-3.5-turbo-0125, meta-llama/Meta-Llama-3-8B-Instruct, meta-llama/Meta-Llama-3-70B-Instruct, deepseek-ai/deepseek-coder-7b-instruct, Qwen/Qwen2.5-Coder-32B-Instruct, google/gemma-2-27b-it."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt templates are provided in Listings 1 and 2 (Appendix A.2) for specification inference, and Fig. 4 shows the contextual analysis prompt structure. The IRIS source code repository (https://github.com/iris-sast/iris) is linked for complete prompt access. However, the specific few-shot examples and contextual analysis system prompt are not shown in the paper text."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix C.3 reports: temperature=0, max_tokens=2048, top_p=1 for all LLMs, batch sizes of 20 (internal) and 30 (external), few-shot counts per CWE (3-4 examples), and fixed seeds for GPT models."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The four-stage pipeline is thoroughly described: candidate extraction (Section 3.2), LLM specification inference (Section 3.3), CodeQL taint analysis (Section 3.4), and contextual analysis filtering (Section 3.5). Fig. 3 provides a visual overview. CodeQL queries are shown in Appendix A.4."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4 and Fig. 5 document the full dataset curation pipeline with counts at each stage: 1065 initial CVEs → 430 with GitHub URL → 265 with fix commits → 149 compilable → 120 after manual validation. Filtering criteria are stated at each step."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 'Conclusion and Limitations' contains a dedicated limitations paragraph discussing undetected vulnerabilities, LLM call costs, language generalizability, and the gap between generated and developer-desired reports."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats discussed: 'many vulnerabilities that IRIS cannot detect' with error analysis in Appendix C.7, 'IRIS makes numerous calls to LLMs... increasing the potential cost of analysis', 'it is unknown if IRIS will perform well on other languages', CWE-78 being inherently difficult for static analysis (Section 5.2)."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 7 states: 'it is unknown if IRIS will perform well on other languages,' 'there is still a gap between the IRIS generated report and the report that the developers would like to see.' Section 5.2 notes that CWE-78 'highlights the inherent limitations of static analysis, as opposed to dynamic approaches—an area that we leave for future work.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "CWE-Bench-Java is publicly available with all project data, CVE metadata, fix commits, and build scripts at https://github.com/iris-sast/cwe-bench-java. The IRIS tool is also public at https://github.com/iris-sast/iris."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4 and Fig. 5 describe the complete data collection procedure: starting from GitHub Advisory Database, cross-validating with libraries.io and Snyk, obtaining git information, building with Maven, and manual validation. Table 4 shows detailed statistics at each step."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data selection process is thoroughly described: CVEs from GitHub Advisory Database filtered by 4 CWE classes and Java Maven packages, cross-validated with libraries.io and Snyk for fix information, compiled with semi-automated Java/Maven version selection, and manually validated by two co-authors with cross-checking (Section 4, Appendix B.1)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Fig. 5 and Table 4 document the full pipeline from 1065 initial CVEs through 6 filtering stages to 120 final vulnerabilities, with counts and criteria at each step. Appendix B.1 provides additional details on exclusion criteria."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgments section states: 'This research was supported by NSF award CCF 2313010.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: University of Pennsylvania (Ziyang Li, Mayur Naik) and Cornell University (Saikat Dutta). No evaluated product affiliations."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "NSF is a government funding agency with no financial stake in whether IRIS outperforms CodeQL or any other tool."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper does not state training data cutoff dates for any of the 7 LLMs used. GPT-4-0125-preview, Llama-3, DeepSeekCoder, etc. are used without stating when their training data ends. Since CVE information is public, the models may have seen vulnerability details during training."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "CWE-Bench-Java contains known CVEs with public descriptions, advisories, and fix commits. GPT-4 may have seen detailed information about these specific vulnerabilities during training (e.g., CVE descriptions, fix patches). This potential contamination is not discussed."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The CVEs in CWE-Bench-Java are public and pre-date GPT-4's training cutoff. The LLMs may have seen vulnerability descriptions, source/sink patterns, or even fix commits for these exact CVEs. No contamination analysis or mitigation is provided."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The evaluation is automated, apart from manual validation of the benchmark and outputs."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study analyzes open-source software projects."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 10 (Appendix D) provides wall-clock runtime per project for all 120 projects using Llama 3 8B, ranging from 43 seconds to 4 hours. However, API costs for closed-source models (GPT-4, GPT-3.5) are not reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is described (Intel Xeon machines with RTX 2080 Ti and A100 GPUs, Appendix C.1) and per-project runtimes are given (Table 10), but total computational budget (total GPU hours, total API spend) is not stated. The limitations section acknowledges 'IRIS makes numerous calls to LLMs... increasing the potential cost' without quantifying."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Temperature is set to 0 and seeds are fixed for GPT models (Appendix C.3), but no sensitivity analysis across different seeds or configurations is reported. Only single-configuration results are presented."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper does not explicitly state how many runs produced the results. The use of temperature=0 and fixed seeds implies single runs, but this is not stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters include batch sizes (20/30), context window (±5 lines), intermediate steps (S=10), and few-shot counts (3-4). Appendix A.3 states 'we observed that setting S to 10 provides a good balance' but no systematic search budget is reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "While all 7 LLM results are reported (not just the best), the selection of hyperparameters (batch size, context window, few-shot examples) is not justified through validation-set evaluation. It is unclear whether these were tuned using CWE-Bench-Java projects."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper makes numerous comparative claims across 7 LLMs, 4 CWE classes, and multiple ablation configurations without any statistical tests or multiple comparison corrections."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors built both IRIS and CWE-Bench-Java, and wrote custom CodeQL queries for the evaluation. The potential for self-comparison bias (e.g., benchmark design favoring their approach, or baselines not being optimally configured) is not acknowledged."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Models ranging from 7B to GPT-4 are compared without discussing compute differences. Table 10 shows runtimes only for Llama 3 8B. The significant cost difference between running a 7B model locally and calling GPT-4 API is not analyzed."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether detecting known CVEs in prepared benchmark projects generalizes to detecting unknown vulnerabilities in the wild. The 4 new vulnerabilities (Section 5.3) partially validate practical utility but no systematic construct validity analysis is provided."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The IRIS pipeline (candidate extraction → LLM labeling → CodeQL analysis → contextual filtering) is held constant across all 7 LLM comparisons in Table 1. Only the LLM component varies, making model comparisons fair with respect to scaffolding."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "CWE-Bench-Java contains public CVEs from before GPT-4's training cutoff. The LLMs may have seen vulnerability descriptions, fix patches, or even CodeQL discussions about these specific CVEs. This temporal leakage is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The LLMs are given CWE descriptions and API signatures as input features. If the models have memorized associations between specific APIs and vulnerability types from training data (e.g., from CVE databases), this constitutes feature leakage. Not discussed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Several projects appear multiple times in CWE-Bench-Java with different CVEs (e.g., multiple xstream, keycloak, DSpace entries in Table 10). Non-independence between these related evaluations is not addressed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, temporal splits, or decontamination pipelines are applied."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "IRIS with GPT-4 detects 55 vulnerabilities (103.7% more than CodeQL's 27) on CWE-Bench-Java",
    369       "evidence": "Table 1 shows 55 detected for IRIS+GPT-4 vs. 27 for CodeQL across 120 vulnerabilities. Table 2 provides per-CWE breakdown confirming the aggregate numbers.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "IRIS with GPT-4 improves average false discovery rate by 5.21 percentage points over CodeQL",
    374       "evidence": "Table 1 reports 84.82% AvgFDR for IRIS+GPT-4 vs. 90.03% for CodeQL. Manual analysis of 50 random alarms yields an estimated refined FDR of 46% (Section 5.2).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "IRIS identifies 4 previously unknown vulnerabilities in latest versions of Java projects",
    379       "evidence": "Section 5.3 describes application to latest versions of 30 Java projects, finding 3 CWE-22 and 1 CWE-94 vulnerabilities. One example (alluxio Zip-Slip) is shown in Fig. 8. CodeQL alone could not detect them.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "LLM-inferred sink specifications can replace CodeQL sinks with high recall (87.11% for GPT-4)",
    384       "evidence": "Fig. 6 shows recall of LLM-inferred specifications against CodeQL's known specifications. GPT-4 achieves 87.11% sink recall. Fig. 7 shows manual precision evaluation of 960 samples with GPT-4 achieving >70% precision.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Both LLM-inferred sources and sinks are necessary for IRIS's performance",
    389       "evidence": "Table 3 ablation: replacing either LLM sources or sinks with CodeQL specs reduces detection from 55 to 36 or 24 respectively. The ablation is clean single-variable manipulation.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Contextual analysis effectiveness depends on the LLM's reasoning capability",
    394       "evidence": "Fig. 9 shows contextual analysis improves precision/F1 for GPT-4, GPT-3.5, and Llama-3 70B but hurts smaller models (Llama-3 8B, DeepSeekCoder 7B). 'Smaller models are more likely to respond with vulnerable than larger models' (Section 5.5).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Smaller specialized LLMs like DeepSeekCoder 7B can still detect 52 vulnerabilities, close to GPT-4's 55",
    399       "evidence": "Table 1 shows DSC 7B detecting 52 vs. GPT-4's 55, but with much higher FDR (95.40% vs. 84.82%) and lower F1 (0.062 vs. 0.177).",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "Benchmark contamination risk unaddressed",
    406       "detail": "CWE-Bench-Java contains known CVEs with public descriptions, fix commits, and advisory database entries. GPT-4 and other LLMs may have seen detailed information about these specific vulnerabilities during training (e.g., which APIs are sources/sinks for these CVEs). This could inflate IRIS's performance. The paper does not discuss or test for this contamination."
    407     },
    408     {
    409       "flag": "No statistical testing on any comparison",
    410       "detail": "All claims of superiority (IRIS vs. CodeQL, GPT-4 vs. other LLMs) are based on comparing raw numbers without significance tests, confidence intervals, or variance estimates. With only 120 vulnerabilities and small per-CWE counts (CWE-78: 13), differences may not be statistically significant."
    411     },
    412     {
    413       "flag": "Authors built both the system and the benchmark",
    414       "detail": "The same team created IRIS, CWE-Bench-Java, and the custom CodeQL queries used for evaluation. The benchmark curation process (which CVEs to include, how to label vulnerable locations) could unconsciously favor the approach. The paper provides no independent evaluation and does not acknowledge this self-comparison bias."
    415     },
    416     {
    417       "flag": "Very high FDR even for best configuration",
    418       "detail": "IRIS with GPT-4 has an 84.82% average false discovery rate—meaning roughly 85% of alarms are false positives. While this is roughly 5 percentage points better than CodeQL, it still represents a large developer burden. The manually estimated 46% FDR (from 50 samples) is substantially different from the automated metric, raising questions about metric validity."
    419     },
    420     {
    421       "flag": "Non-independent benchmark entries",
    422       "detail": "Multiple entries in CWE-Bench-Java come from the same project (e.g., keycloak, DSpace, xwiki-commons appear multiple times in Table 10). Performance on these correlated entries is treated as independent, which could bias aggregate results."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    428       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    429       "year": 2023,
    430       "arxiv_id": "2310.06770",
    431       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks; cited as evidence that LLMs struggle with project-level reasoning."
    432     },
    433     {
    434       "title": "Understanding the Effectiveness of Large Language Models in Detecting Security Vulnerabilities",
    435       "authors": ["Avishree Khare", "Saikat Dutta", "Ziyang Li", "Alaia Solko-Breslin", "Rajeev Alur", "Mayur Naik"],
    436       "year": 2023,
    437       "arxiv_id": "2311.16169",
    438       "relevance": "Directly evaluates LLMs on method-level vulnerability detection, showing LLMs fail at complex code reasoning without context—motivating IRIS."
    439     },
    440     {
    441       "title": "A Comprehensive Study of the Capabilities of Large Language Models for Vulnerability Detection",
    442       "authors": ["Benjamin Steenhoek", "Md Mahbubur Rahman", "Monoshi Kumar Roy", "Mirza Sanjida Alam", "Earl T. Barr", "Wei Le"],
    443       "year": 2024,
    444       "arxiv_id": "2403.17218",
    445       "relevance": "Studies LLMs' effectiveness at detecting vulnerabilities, demonstrating limitations that motivate combining LLMs with static analysis."
    446     },
    447     {
    448       "title": "Enhancing Static Analysis for Practical Bug Detection: An LLM-Integrated Approach",
    449       "authors": ["Haonan Li", "Yu Hao", "Yizhuo Zhai", "Zhiyun Qian"],
    450       "year": 2024,
    451       "relevance": "Concurrent work combining LLMs with static analysis for bug detection, directly related to the IRIS approach."
    452     },
    453     {
    454       "title": "LLMDFA: Analyzing Dataflow in Code with Large Language Models",
    455       "authors": ["Chengpeng Wang", "Wuqi Zhang", "Zian Su", "Xiangzhe Xu", "Xiaoheng Xie", "Xiangyu Zhang"],
    456       "year": 2024,
    457       "relevance": "Uses LLMs for dataflow analysis in code, a complementary approach to IRIS's neuro-symbolic pipeline."
    458     },
    459     {
    460       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    461       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    462       "year": 2023,
    463       "relevance": "Evaluates LLMs for automated program repair, demonstrating LLM capabilities in code understanding and modification."
    464     },
    465     {
    466       "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
    467       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    468       "year": 2023,
    469       "relevance": "Combines LLMs with traditional test generation tools, exemplifying the neuro-symbolic approach to software engineering."
    470     },
    471     {
    472       "title": "Fuzz4All: Universal Fuzzing with Large Language Models",
    473       "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi", "Jia Le Tian", "Michael Pradel", "Lingming Zhang"],
    474       "year": 2024,
    475       "relevance": "Uses LLMs for universal fuzzing, demonstrating LLMs' utility in security testing—related to IRIS's vulnerability detection goals."
    476     },
    477     {
    478       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    479       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim", "Chawin Sitawarin", "Xinyun Chen", "Basel Alomair", "David Wagner", "Baishakhi Ray", "Yizheng Chen"],
    480       "year": 2024,
    481       "arxiv_id": "2403.18624",
    482       "relevance": "Evaluates code language models for vulnerability detection, finding limited effectiveness—supporting IRIS's motivation for neuro-symbolic approaches."
    483     },
    484     {
    485       "title": "Large Language Models for Code: Security Hardening and Adversarial Testing",
    486       "authors": ["Jingxuan He", "Martin Vechev"],
    487       "year": 2023,
    488       "relevance": "Studies LLMs for code security, including hardening and adversarial testing, directly relevant to AI-assisted security analysis."
    489     },
    490     {
    491       "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models While Translating Code",
    492       "authors": ["Rangeet Pan", "Ali Reza Ibrahimzada", "Rahul Krishna"],
    493       "year": 2024,
    494       "relevance": "Studies bugs introduced by LLMs during code translation, relevant to understanding LLM reliability in code tasks."
    495     },
    496     {
    497       "title": "Large Language Models for Test-Free Fault Localization",
    498       "authors": ["Aidan ZH Yang", "Ruben Martins", "Claire Le Goues", "Vincent J. Hellendoorn"],
    499       "year": 2023,
    500       "arxiv_id": "2310.01726",
    501       "relevance": "Uses LLMs for fault localization without test cases, combining LLMs with program analysis for debugging."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "IRIS is an open-source tool that can be applied to real Java projects and has found 4 previously unknown vulnerabilities, but it requires CodeQL setup and LLM API access."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "Combining LLMs with static analysis is a natural idea; the specific 2x improvement over CodeQL is impressive but not contrarian."
    512     },
    513     "fear_safety": {
    514       "score": 2,
    515       "justification": "Directly addresses security vulnerability detection and found previously unknown vulnerabilities in production software."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or dramatic claims; straightforward system paper showing improvements over baselines."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Open-source code and benchmark on GitHub, but requires CodeQL installation, Java project compilation, and LLM API access to try."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "From UPenn and Cornell (respected but not headline labs); uses GPT-4 (well-known) but the tool itself is new."
    528     }
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs