scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21835B)
      1 {
      2   "paper": {
      3     "title": "Exploring Automatic Cryptographic API Misuse Detection in the Era of LLMs",
      4     "authors": ["Yifan Xia", "Zichen Xie", "Peiyu Liu", "Kangjie Lu", "Yan Liu", "Wenhai Wang", "Shouling Ji"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2407.16576"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper states 'We will open-source the refined benchmarks with detailed analysis for future research' — a promise of future release, not an actual release. No repository URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The benchmarks used (CryptoAPI-Bench, MASC, ApacheCryptoAPI-Bench) are publicly available, but the paper's refined versions and the 11,940 LLM-generated reports are not released. They state they 'will release the misuse reports and analysis' — future tense."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The appendix mentions specific API model versions (gpt-3.5-turbo-1106, gpt-4-turbo-1106, Gemini-pro-1.0) and that open-source models used Hugging Face, with 'default settings for model hyper-parameters like temperature.' No requirements.txt, Dockerfile, or detailed environment setup is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a high level but lacks the specificity needed for exact reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (precision, recall, accuracy) with no confidence intervals or error bars despite the stochastic nature of LLMs."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., GPT-4 outperforms SATs by 20.8%) but no statistical significance tests are applied to any comparison."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '33.1% increase in FP alerts', '20.1% average accuracy increase', '20.8% increase in performance over the leading SATs', with full precision/recall/accuracy tables providing absolute values."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the choice of 5 queries per test case in the multi-query approach, nor for the selection of the top 200 Java repositories in the usability study."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Despite running 5 queries per test case (acknowledging LLM stochasticity), no variance or standard deviation across these runs is reported. Only aggregated results are shown."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three SOTA static analysis tools (CryptoGuard, CogniCryptSAST, SpotBugs) are compared against the LLMs in RQ2, with detailed accuracy, precision, and recall comparisons (Figure 4, Figure 7)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "CryptoGuard (2019) and CogniCryptSAST (2017) are the leading academic tools in this specific domain, and SpotBugs is widely used in industry. These are appropriate baselines given the niche area."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically evaluates components: unconstrained vs. task-aware settings, with and without the code & analysis validation mechanism, showing each component's contribution (Table 2, Table 3)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Precision, recall, and accuracy are all reported, along with raw TP/FP/TN/FN counts in Tables 2 and 3."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two authors with 5+ years of experience independently reviewed LLM outputs to classify TP/FP/FN, with cross-checking to minimize bias (Section 3.4). The usability study also involved developer feedback on reported misuses."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The usability study (RQ3) uses real-world GitHub repositories not part of the benchmarks, serving as an independent validation of the approach."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by complexity level (basic/advanced/mutation in Figure 3), by misuse category (Table 4), by file size impact, and by failure pattern categories with percentages."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Three failure patterns are extensively analyzed: Erroneous Cryptographic Knowledge (41.3% of FPs), Code Semantics Misunderstanding (55.2% of FPs), and Hallucination/Denial-of-Service (3.4%). Sub-categories of code misunderstanding are further broken down."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: GPT-4 performs worse than GPT-3.5 without validation (Finding 3 in RQ1), FP rates exceed 50% in unconstrained settings, performance degrades significantly with larger programs, and the validation mechanism sometimes removes true positives."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims ~90% detection rate (GPT-4 achieves 0.90 recall in Table 2), surpassing traditional methods (shown in Figure 4), and 63 real-world misuses with 46 acknowledged (Table 4 shows 46 accepted). All are supported by the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about the validation mechanism improving performance are supported by controlled ablation (with/without validation). Claims about task-aware settings improving results are similarly supported by systematic comparison. The ablation design is adequate single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'Exploring Automatic Cryptographic API Misuse Detection in the Era of LLMs' is appropriately scoped, but the paper's generalization discussion (Section 5.2.3) is brief and generic: 'our findings may still not generalize to other datasets or languages.' The abstract claims 'surpassing traditional methods' without bounding to the specific benchmarks tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5.2.1 discusses data leakage as an alternative explanation for LLM performance, providing statistical analysis (0.3% overlap) to argue against it. The paper also discusses how misleading benchmark metadata could inflate results, leading them to rename test cases."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The appendix specifies exact versions: gpt-3.5-turbo-1106, gpt-4-turbo-1106, Gemini-pro-1.0, CodeLlama-34B-Instruct, DeepSeek-Coder-33b-Instruct (Section 7.1.2)."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided in Figures 5 and 6 of the appendix, showing both the detection and validation prompts with system roles, basic prompts, setting-specific components, and formatting instructions."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper states 'All experiments maintain the default settings for model hyper-parameters like temperature' but does not specify what those defaults are. Default temperature varies by API version and is not always documented."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-query approach (5 queries per test case) and code & analysis validation mechanism are described in detail in Section 3.3.2, with the workflow illustrated in Figure 1."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2.2 documents benchmark refining steps: removing inapplicable cases (context-insensitive, obsolete, redundant), addressing ground truth leakage by standardizing naming, with specific counts (154 GTMs in manually-crafted, 53 GTMs in real-world after filtering)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.1.2 discusses limitations and Section 5.2 is dedicated to 'Threat of validity' covering data leakage, manual analysis reliability, and generalization concerns."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The threats are specific to this study: data leakage from cryptographic benchmarks into LLM training data (with statistical analysis), bias in manual TP/FP classification, and limitations of Java/Python-only evaluation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "Section 5.2.3 only states generically 'our findings may still not generalize to other datasets or languages.' No specific boundaries are drawn about what the results do NOT show (e.g., does not show LLMs can replace SATs in production, does not demonstrate reliability for safety-critical deployment)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The 11,940 LLM-generated reports are not released. The paper promises future release but no data is currently available for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection is well described: benchmark selection and refining (Section 3.2), LLM querying procedure with 5 queries per test case (Section 3.3.2), and repository selection for usability study (Section 4.4.1, with criteria in Appendix Section 5)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. The developer feedback in the usability study was from maintainers of existing open-source projects contacted through GitHub advisories/issues, not recruited participants."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: benchmark selection → refining (with counts: 154 GTMs manually-crafted, 53 real-world) → LLM querying (5x per case) → validation mechanism → manual analysis by two reviewers with cross-checking."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Zhejiang University, University of Minnesota Twin Cities, and Ant Group. Ant Group is a technology company but is not a producer of the evaluated LLMs."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself a concern."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Table 1 lists training data cutoff dates for all models: GPT-3.5-turbo (Sep 2021), GPT-4-turbo (Apr 2023), Gemini-1.0-pro (July 2023), CodeLlama (July 2023), DeepSeek-Coder (May 2023)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 5.2.1 extensively discusses data leakage concerns, analyzing whether benchmark ground truths could appear in training data. They find only 0.3% overlap for CryptoAPI-Bench and zero for ApacheCryptoAPI-Bench. They also address ground truth leakage from test case naming (Section 3.2.2)."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The paper actively addresses contamination by renaming test cases to prevent metadata leakage (Section 3.2.2) and argues that the presence of numerous failure cases among LLMs makes memorization-based success unlikely (Section 5.2.1)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants study. The developer feedback is from open-source maintainers responding to bug reports, not a structured human subjects study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human subjects study. The paper mentions ethical considerations in anonymizing non-public reports (Section 4.4) but this does not constitute a human subjects study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, API spend, or per-query cost is reported despite evaluating 11,940 LLM-generated reports across 5 models with multiple queries per test case."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours for open-source models, or API spend is reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "LLMs' false positive rates exceed 50% even for GPT-4 in unconstrained settings on manually-crafted benchmarks.",
    286       "evidence": "Table 2 shows GPT-4 unconstrained without validation: precision 0.48, with 89 FP out of 172 total alerts.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The optimized approach (task-aware + validation) achieves nearly 90% detection rate, surpassing traditional methods.",
    291       "evidence": "Table 2 shows GPT-4 TA w/V: recall 0.90, precision 0.87, accuracy 0.84. Figure 4 shows this exceeds all SATs.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "63 real-world cryptographic misuses were discovered, with 46 acknowledged by developers.",
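    296       "evidence": "Table 4 shows 37 Java + 29 Python = 66 reported (after excluding 10 manual FPs), with 26+20=46 accepted. Note: the 66 total does not match the 63 stated in the claim; re-check these counts against Table 4.",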
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The code & analysis validation mechanism provides an average 20.1% accuracy increase.",
    301       "evidence": "Comparison of w/V vs w/o V rows in Tables 2 and 3 across all models supports this claim.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Task-aware setting generates 18.9% more TP alerts on average compared to unconstrained setting.",
    306       "evidence": "Finding 2 in Section 4.2.1 states this with range from 10.3% (GPT-3.5) to 32.5% (GPT-4), verifiable from Table 2.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "This paper systematically evaluates 5 LLMs for cryptographic API misuse detection using refined benchmarks. LLMs exhibit high false positive rates (>50%) in unconstrained settings, but task-aware scoping combined with a self-validation mechanism raises GPT-4's accuracy to 84-88%, surpassing SOTA static analysis tools. Three failure patterns are identified: erroneous cryptographic knowledge (41.3% of FPs), code semantics misunderstanding (55.2%), and hallucination (3.4%). A usability study on real GitHub repositories discovered 63 cryptographic misuses, with 46 acknowledged by developers and a 100% acceptance rate for Java findings.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance reported despite stochastic methods",
    315       "detail": "The paper acknowledges LLM stochasticity and runs 5 queries per test case, yet reports only aggregated point estimates without any variance, standard deviation, or confidence intervals across runs."
    316     },
    317     {
    318       "flag": "No statistical significance tests for comparisons",
    319       "detail": "All comparative claims (LLMs vs SATs, with/without validation, task-aware vs unconstrained) are based on raw metric differences without any statistical tests."
    320     },
    321     {
    322       "flag": "No cost reporting",
    323       "detail": "11,940 LLM reports were generated across 5 models including GPT-4, yet no API costs, GPU hours, or computational budget is mentioned, making practical replicability unclear."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Large Language Models for Code Analysis: Do LLMs Really Do Their Job?",
    329       "authors": ["C. Fang", "N. Miao", "S. Srivastav"],
    330       "year": 2023,
    331       "arxiv_id": "2310.12357",
    332       "relevance": "Directly evaluates LLM capabilities for code analysis tasks, relevant to understanding LLM reliability."
    333     },
    334     {
    335       "title": "How ChatGPT is Solving Vulnerability Management Problem",
    336       "authors": ["P. Liu", "J. Liu", "L. Fu"],
    337       "year": 2023,
    338       "relevance": "Evaluates ChatGPT for security vulnerability management, relevant to LLM-based security analysis."
    339     },
    340     {
    341       "title": "Keep the Conversation Going: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    342       "authors": ["C. S. Xia", "L. Zhang"],
    343       "year": 2023,
    344       "arxiv_id": "2304.00385",
    345       "relevance": "Uses LLMs for automated program repair with cost analysis, relevant to LLM-based code quality tools."
    346     },
    347     {
    348       "title": "Enhancing Static Analysis for Practical Bug Detection: An LLM-Integrated Approach",
    349       "authors": ["H. Li", "Y. Hao", "Y. Zhai", "Z. Qian"],
    350       "year": 2024,
    351       "relevance": "Integrates LLMs with static analysis for bug detection, directly relevant to hybrid LLM+traditional tool approaches."
    352     },
    353     {
    354       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    355       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    356       "year": 2023,
    357       "relevance": "EvalPlus benchmark for evaluating LLM code generation correctness, relevant to LLM evaluation methodology."
    358     },
    359     {
    360       "title": "A Survey of Large Language Models",
    361       "authors": ["W. X. Zhao", "K. Zhou", "J. Li"],
    362       "year": 2023,
    363       "arxiv_id": "2303.18223",
    364       "relevance": "Comprehensive survey of LLMs covering hallucination and reliability issues central to this paper's concerns."
    365     },
    366     {
    367       "title": "A Systematic Evaluation of Large Language Models of Code",
    368       "authors": ["F. F. Xu", "U. Alon", "G. Neubig", "V. J. Hellendoorn"],
    369       "year": 2022,
    370       "relevance": "Systematic evaluation of code LLMs including temperature effects, relevant to LLM evaluation methodology."
    371     },
    372     {
    373       "title": "Why Crypto-Detectors Fail: A Systematic Evaluation of Cryptographic Misuse Detection Techniques",
    374       "authors": ["A. S. Ami", "N. Cooper", "K. Kafle"],
    375       "year": 2022,
    376       "relevance": "Systematic evaluation of cryptographic misuse detection tools and their failure modes, directly relevant baseline work."
    377     },
    378     {
    379       "title": "Can Large Language Models Reason About Program Invariants?",
    380       "authors": ["K. Pei", "D. Bieber", "K. Shi", "C. Sutton", "P. Yin"],
    381       "year": 2023,
    382       "relevance": "Evaluates LLM reasoning about program properties, relevant to understanding LLM code comprehension capabilities."
    383     }
    384   ]
    385 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs