scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25185B)
      1 {
      2   "paper": {
      3     "title": "BASICS: Binary Analysis and Stack Integrity Checker System for Buffer Overflow Mitigation",
      4     "authors": ["Luís Ferreirinha", "Ibéria Medeiros"],
      5     "year": 2025,
      6     "venue": "IEEE Transactions on Reliability (submitted)",
      7     "arxiv_id": "2511.19670"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The BASICS tool is available as open source at a GitHub repository (reference [21]: https://github.com/Singularitty/BASICS). The paper states 'the BASICS tool (available at [21])' in the contributions list (Section 1) and 'BASICS is open source [21]' in Section 9."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The evaluation uses publicly available datasets: the Juliet C/C++ test suite (NIST reference [43]) and the NIST SARD dataset (reference [44]). The real-world applications are sourced from public GitHub, GitLab, and SourceForge repositories. These are all publicly available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 7 specifies Python v3.10.12, Angr v9.2.102, LTL2BA v1.3, and E9Patch v1.0.0-rc9. Section 8.1 specifies Ubuntu 24.04.1 LTS, 2 CPU cores, 24 GB RAM, AMD EPYC 7643 processor, and the GCC compiler. This provides sufficient detail to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the code is released, the paper does not provide step-by-step reproduction instructions, a README with commands, or scripts to replicate the main experiments. The datasets are referenced but no specific commands or configuration files are described for reproducing the reported results."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper reports only point estimates (Juliet, SARD) for accuracy (55%, 87%), precision (87%, 92%), recall (23%, 68%), and F1-Score (37%, 78%). No confidence intervals or error bars are provided for any metrics."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims BASICS outperforms CWE Checker (e.g., 'a slight advantage in precision, with a 4% increase') but provides no statistical significance tests to support these comparative claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports performance differences in context: e.g., BASICS precision of 87% vs CWE Checker 83% on Juliet, and BASICS accuracy 87% vs CWE Checker 52% on SARD. Tables 4 and 5 provide confusion matrices and full metric breakdowns with baseline context, allowing readers to assess the magnitude of differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for sample sizes. The Juliet subset of 3,010 cases (after excluding 532 timeouts) and the SARD dataset of 135 programs are used without any power analysis or discussion of whether these sample sizes are adequate for the claims made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper reports single-run results. No variance, standard deviation, or spread measures are reported across any experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "BASICS is compared against CWE Checker, described as 'an open-source tool for detecting CWE vulnerability classes in binary programs' (Section 8.1). Full confusion matrices and performance metrics are reported for both tools."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Only a single baseline (CWE Checker) is used. There is no discussion of whether CWE Checker represents the state of the art for binary BO detection. Other related works mentioned in Section 10 (Arbiter, Vyper, IntScope) are not compared against experimentally."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper does not ablate the individual components of BASICS (e.g., model checking alone vs. with concolic execution, contribution of individual security properties). The system has multiple components (model checker, concolic execution, patcher) but no ablation study shows which matter most."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are reported: accuracy, precision, recall, and F1-Score (Table 5). Confusion matrices are also provided (Table 4)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "For the SARD dataset, 'Each program was manually classified as either containing a BO vulnerability or not, establishing a ground truth dataset' (Section 8.2). For real-world applications, 'A manual review of their source code confirmed the BOs' (Section 8.4). These constitute expert manual review of system outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning system. There is no training/validation/test split. BASICS uses formal verification (model checking) rather than learned models, so held-out test sets are not applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 6 provides a breakdown by function type (gets, scanf, strcpy, sprintf) for SARD. Table 7 breaks down patching results per function. Results are also reported separately for each dataset (Juliet vs SARD vs real-world applications)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 8.2 discusses false positives (60 instances due to overestimation of stack writes in concolic execution) and false negatives (1,282 instances due to inaccuracies in C library function emulation and missed atomic transitions). Section 8.4 discusses the Contacts Management case where patching was not supported for scanf."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports multiple negative results: 532 timeout instances in Juliet due to state explosion, low recall of 23% on Juliet, the inability to patch scanf calls, sprintf patches that 'exhibited slightly altered behaviour,' and scalability issues with larger applications (Hash-Map taking 759.73 seconds)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'accuracy and precision above 87%, both in detection and correction' and 'outperforming [CWE Checker].' Tables 4-5 show precision of 87% on Juliet and 92% on SARD for detection. SARD accuracy is 87%. Patching shows 100% success rate. The comparison with CWE Checker is supported by Tables 4-5. The abstract claims are supported by the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper primarily makes capability claims ('BASICS detects/patches vulnerabilities') rather than strong causal claims. Where causal language is used (e.g., 'the emulation of certain functions... led to an overestimation of stack writes'), it is backed by analysis of specific failure cases. The experimental design of comparing tool output against ground truth is adequate for the capability claims made."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds its claims to binary C programs and stack buffer overflows specifically. Section 11 (Conclusion) states 'BASICS performs well on smaller codebases' and acknowledges 'limitations with scalability when applied to larger applications.' Section 8.4 explicitly discusses when BASICS does and does not scale. The title and framing are appropriately specific to buffer overflow mitigation."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations for results: the high false negative rate is attributed to two specific factors (concolic execution inaccuracies and missing atomic transitions). The timeout issues are attributed to state explosion in complex programs with loops. The paper also notes that SARD's better results are 'mainly due to SARD's programs being simpler, smaller, and presenting less complex state spaces.'"
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This paper does not use LLMs or pre-trained models. BASICS is a formal verification tool based on model checking and concolic execution. No AI/ML model versions are relevant."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting or LLMs. BASICS uses formal verification techniques (model checking, LTL formulas, concolic execution)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper does not report key hyperparameters such as the timeout threshold for concolic execution, the maximum number of loop iterations (described as 'previously defined by the user' in Section 4.2 but never specified for the experiments), or SMT solver configuration. These settings affect results but are not documented."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This paper does not use agentic scaffolding or LLM-based systems. BASICS is a formal verification tool."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 8.2 documents preprocessing: from 64,099 Juliet instances, 1,762 CWE-121 cases were identified, yielding 3,542 cases (positive + negative). After excluding 532 timeouts, the final dataset was 3,010 cases. For SARD, 135 programs were manually classified. The filtering criteria and counts at each stage are stated."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "While there is no dedicated 'Limitations' section, limitations are discussed substantively throughout: Section 8.2 analyzes false positives and false negatives in detail, Section 8.4 discusses scalability issues, and Section 11 (Conclusion) summarizes key limitations ('limitations with scalability when applied to larger applications due to state explosion issues'). This constitutes substantive discussion spread across multiple sections."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The paper discusses specific threats: state explosion causing 532 timeouts (15% of Juliet dataset), concolic execution inaccuracies in emulating certain C library functions, missing atomic transitions for certain overflow types, and the limitation that only 5 C library functions have patch templates. These are specific to this study, not generic disclaimers."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries: it focuses on 'stack buffer overflow vulnerabilities' in 'compiled C binaries' using 'x86-64 Assembly.' Section 8.3 notes scanf patching is unsupported. Section 8.4 acknowledges scalability depends on code complexity. Section 9 discusses extensibility to other vulnerability types, implicitly acknowledging what is NOT currently covered."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The evaluation uses publicly available datasets (Juliet C/C++ [43] and NIST SARD [44]) that anyone can download and verify. The real-world applications are from public repositories. The BASICS tool itself is open source [21], allowing independent replication."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 8.1-8.2 describes how datasets were obtained: Juliet C/C++ test suite with 64,099 instances filtered to 1,762 CWE-121 cases, SARD with 135 manually classified C programs. Section 8.4 describes real-world application selection criteria: 'ability to compile into a single binary file on Linux' and 'varying codebase sizes.'"
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data sources are standard benchmarks (Juliet, SARD) and publicly available open-source applications."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: Juliet starts with 64,099 instances → filter to 1,762 CWE-121 → expand to 3,542 (positive + negative) → remove 532 timeouts → 3,010 final cases. SARD: 135 programs, each manually classified. The transformation steps and counts are provided."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgments section (following Section 11) discloses funding: 'This work was partially supported by P2030 through project I2DT, ref. COMPETE2030-FEDER-00389100, an ITEA4 European project (ref. 22025), and by FCT through the LASIGE Research Unit, ref. UIDB/00408/2025-LASIGE.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: L. Ferreirinha at VUSec, Vrije Universiteit Amsterdam; I. Medeiros at LASIGE, DI, Faculdade de Ciências, Universidade de Lisboa. These are academic institutions with no apparent conflict with the tool being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders are European public research programs (P2030/COMPETE2030, ITEA4, FCT) and an academic research unit (LASIGE). These are government/academic funding bodies with no financial interest in the specific outcomes of the BASICS tool evaluation."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. While no obvious conflicts exist for academic researchers, the absence of an explicit declaration is noted."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "BASICS is not a pre-trained model. It is a formal verification tool based on model checking and concolic execution. There is no training data or training cutoff date relevant to its operation."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable. BASICS does not use machine learning and has no training data that could overlap with test data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable. BASICS is a formal verification tool that does not learn from data. There is no contamination risk from pre-training."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study. The evaluation is entirely based on automated tool execution on benchmark datasets and open-source applications."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 8 reports verification time for each real-world application (ranging from 3.46 to 759.73 seconds). Section 8.2 mentions 532 timeouts in the Juliet dataset. These provide practical cost/latency information."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While individual verification times are reported for the 6 real-world applications, the total computational budget for all experiments is not stated. No total runtime for the 3,010 Juliet or 135 SARD experiments is given. The timeout threshold used is not specified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "BASICS achieves 87% precision in buffer overflow detection on the Juliet C/C++ dataset, outperforming CWE Checker's 83% precision.",
    286       "evidence": "Table 5 shows BASICS precision of 0.87 vs CWE Checker precision of 0.83 on the Juliet dataset. Confusion matrices in Table 4 provide raw counts.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "On the SARD dataset, BASICS greatly outperforms CWE Checker across all metrics, achieving 87% accuracy, 92% precision, 68% recall, and 78% F1-Score.",
    291       "evidence": "Table 5 shows BASICS metrics vs CWE Checker (52% accuracy, 41% precision, 87% recall, 56% F1-Score) on SARD. Confusion matrices in Table 4.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "All 39 patches performed by BASICS were successful in removing buffer overflow vulnerabilities.",
    296       "evidence": "Table 7 shows 39 patches performed and 39 successful, covering strcpy (28), sprintf (10), and gets (1) functions. Section 8.3 describes the validation methodology.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "BASICS detected and successfully mitigated 3 buffer overflows in real open-source applications.",
    301       "evidence": "Table 8 and Section 8.4 describe detection of potential BOs in HTML Parser, IPV6 Validator, and Contacts Management. Manual source code review confirmed them. Two were patched; one (scanf-based) could not be patched due to missing template.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The approach scales depending on code complexity, with verification time not proportional to lines of code.",
    306       "evidence": "Table 8 shows Thread-Fifo (261 LoC) took 4.79 seconds while Macgen (15 LoC) took 10.10 seconds, due to loop constructs. Hash-Map with nested loops took 759.73 seconds. 532 Juliet timeouts attributed to state explosion.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "BASICS combines model checking with concolic execution to detect and patch stack buffer overflows in x86-64 binaries. On the Juliet C/C++ dataset (3,010 cases), BASICS achieved 87% precision but only 23% recall, performing comparably to CWE Checker overall. On the simpler SARD dataset (135 programs), BASICS outperformed CWE Checker on accuracy, precision, and F1-Score (87%, 92%, and 78% vs. 52%, 41%, and 56%), although CWE Checker achieved higher recall (87% vs. 68%). All 39 patches applied were successful. Scalability remains a key limitation due to state explosion in concolic execution with complex programs containing loops and nested function calls.",
    312   "red_flags": [
    313     {
    314       "flag": "Low recall on primary benchmark",
    315       "detail": "BASICS achieved only 23% recall on the Juliet C/C++ dataset, meaning it missed 77% of known vulnerabilities. The 532 excluded timeout cases (15% of dataset) further reduce effective coverage. The abstract emphasizes 'accuracy and precision above 87%' without prominently noting the low recall."
    316     },
    317     {
    318       "flag": "Single weak baseline",
    319       "detail": "Only CWE Checker is used as a baseline. Both tools achieve identical 55% accuracy on Juliet, suggesting neither is particularly strong. No comparison against more sophisticated binary analysis tools mentioned in related work (Arbiter, Vyper) is provided."
    320     },
    321     {
    322       "flag": "Tiny real-world evaluation",
    323       "detail": "Only 6 real-world applications were tested, all very small (15-261 LoC). Three of six had no vulnerabilities. This is insufficient to support claims about practical applicability to real CPS systems mentioned in the motivation."
    324     },
    325     {
    326       "flag": "No statistical tests for comparisons",
    327       "detail": "Claims that BASICS outperforms CWE Checker are based on comparing point estimates without any statistical significance testing."
    328     },
    329     {
    330       "flag": "Selective metric emphasis",
    331       "detail": "The abstract highlights 'accuracy and precision above 87%' which references the SARD results (87% accuracy) and Juliet precision (87%), but does not mention the 55% accuracy on Juliet or the 23% recall. This framing is somewhat misleading."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "CorCA: An Automatic Program Repair Tool for Checking and Removing Effectively C Flaws",
    337       "authors": ["J. Inácio", "I. Medeiros"],
    338       "year": 2023,
    339       "relevance": "Automated program repair tool combining static and dynamic analysis for C vulnerability detection — relevant to AI-augmented code repair evaluation."
    340     },
    341     {
    342       "title": "SoK: (State of) The Art of War: Offensive Techniques in Binary Analysis",
    343       "authors": ["Y. Shoshitaishvili", "R. Wang", "C. Salls"],
    344       "year": 2016,
    345       "relevance": "Foundational survey on binary analysis techniques including the Angr platform used in BASICS — relevant to understanding binary analysis tool ecosystems."
    346     },
    347     {
    348       "title": "Automated vulnerability detection in source code using deep representation learning",
    349       "authors": ["R. L. Russell", "L. Kim", "L. H. Hamilton"],
    350       "year": 2018,
    351       "relevance": "ML-based vulnerability detection in source code — relevant to comparing formal verification vs AI/ML approaches for vulnerability detection."
    352     },
    353     {
    354       "title": "VulHawk: Cross-architecture vulnerability detection with entropy-based binary code search",
    355       "authors": ["Z. Luo", "P. Wang", "B. Wang"],
    356       "year": 2023,
    357       "relevance": "ML-based binary vulnerability detection using language processing models — relevant to comparing formal methods vs deep learning for binary analysis."
    358     },
    359     {
    360       "title": "HeapHopper: Bringing bounded model checking to heap implementation security",
    361       "authors": ["M. Eckert", "A. Bianchi", "R. Wang"],
    362       "year": 2018,
    363       "relevance": "Bounded model checking combined with symbolic execution for security analysis — closely related formal verification approach for binary security."
    364     },
    365     {
    366       "title": "Automatic software repair: A bibliography",
    367       "authors": ["M. Monperrus"],
    368       "year": 2018,
    369       "relevance": "Comprehensive survey of automated code repair techniques — provides taxonomy relevant to understanding where AI-based repair fits in the landscape."
    370     },
    371     {
    372       "title": "Binary rewriting without control flow recovery",
    373       "authors": ["G. J. Duck", "X. Gao", "A. Roychoudhury"],
    374       "year": 2020,
    375       "relevance": "E9Patch binary rewriting tool used by BASICS — relevant to understanding practical binary patching infrastructure."
    376     },
    377     {
    378       "title": "Arbiter: Bridging the Static and Dynamic Divide in Vulnerability Discovery on Binary Programs",
    379       "authors": ["J. Vadayath", "M. Eckert", "K. Zeng"],
    380       "relevance": "Hybrid static-dynamic binary vulnerability discovery tool — relevant baseline for binary analysis evaluation methodology."
    381     }
    382   ]
    383 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs