scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23687B)
      1 {
      2   "paper": {
      3     "title": "Bugdar: AI-Augmented Secure Code Review for GitHub Pull Requests",
      4     "authors": [
      5       "John E. Naulty",
      6       "Eason Chen",
      7       "Joy Wang",
      8       "George Digkas",
      9       "Kostas Chalkias"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2503.17302"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No source code, repository URL, or Zenodo archive is mentioned anywhere in the paper. Bugdar is described as a system but no public release is provided."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The evaluation dataset of GitHub pull requests with known vulnerabilities is described in Section III.C but no download link or public repository is provided."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No environment specifications, dependency lists, library versions, or hardware descriptions are provided. The paper mentions using GPT-4o and o1-preview via API but no setup details."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No reproduction instructions, README, or scripts are provided. The system architecture is described at a high level but not with sufficient detail for reproduction."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Table I reports only point estimates for precision, recall, F1, and accuracy. No confidence intervals, error bars, or uncertainty measures are reported."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper compares GPT-4o vs. o1-preview and RAG vs. no-RAG conditions but provides no statistical significance tests. Differences are stated as raw number comparisons."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No formal effect sizes (Cohen's d, odds ratios) are reported. While raw metric differences are shown (e.g., GPT-4o F1 0.44 vs. o1-preview F1 0.30), there is no baseline context or standardized effect size measure."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The evaluation uses 14 pull requests for timing and an unspecified number of vulnerability examples for classification. No justification for the sample sizes is given and no power analysis is discussed."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance, standard deviation, or spread measures across runs are reported. Results appear to be from single runs with no mention of repeated experiments."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper compares two LLMs (GPT-4o and o1-preview) with/without RAG, but does not compare against any existing vulnerability detection tools or prior systems as baselines. Traditional static analysis tools are mentioned in Section IV.C case studies but no systematic comparison is provided."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No baseline systems from prior work are included in the evaluation. The paper mentions traditional static analysis tools and prior work in Section II but does not benchmark against them."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "While the paper compares RAG vs. no-RAG, this is not a proper ablation study. The system has multiple components (preprocessing, context retrieval, judge LLM, reporting) but no systematic ablation is performed to measure each component's contribution."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table I reports precision, recall, F1 score, and accuracy — four distinct metrics. Time consumption is also reported as a separate evaluation dimension in Section IV.B."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Section IV.D mentions 'preliminary user studies with developers' but provides no quantitative data, no structured evaluation protocol, and no specific results — just that developers 'valued Bugdar's concise, actionable commentary.' This does not constitute a meaningful human evaluation of the system's outputs."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No mention of train/test splits or held-out test sets. The paper does not describe how the evaluation dataset was partitioned or whether any data used for fine-tuning overlapped with test data."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Results are reported as aggregate metrics across all vulnerability types and programming languages. No per-language or per-vulnerability-type breakdowns are provided despite the paper emphasizing multi-language support."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section IV.C discusses specific failure cases: Bugdar's failure to detect a logic flaw due to limited domain-specific context understanding, and incorrectly flagging a safe use of an 'unsafe' block in Rust."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that RAG had 'mixed effects on description tasks' — GPT-4o's description performance actually decreased with RAG (F1 from 0.65 to 0.50). Section V.A also acknowledges high false-positive rates."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The abstract claims Bugdar 'demonstrates exceptional efficiency, processing an average of 56.4 seconds per pull request' which is supported, but also claims it provides 'context-aware vulnerability analysis' and 'reduces the reliance on manual reviews' and 'enhances the security posture' — these are not empirically supported. The classification accuracy is modest (best F1 = 0.49) and false positive rates are high, yet the abstract makes no mention of these significant limitations."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims like 'RAG improved classification performance' (Section IV.A) and 'Bugdar reduces the reliance on manual reviews' (abstract) without adequate causal design. The RAG comparison lacks controls for confounds, and the claim about reducing manual review reliance is not empirically tested."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The abstract and title present Bugdar broadly as an 'AI-Augmented Secure Code Review' system, but evaluation is limited to a small number of pull requests (14 for timing, unspecified for vulnerability detection) from what appears to be primarily Mysten Labs projects. The paper does not bound its claims to this specific setting."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether the vulnerability detection performance might differ on non-blockchain code, or whether the timing advantage is simply due to the LLM not actually catching many vulnerabilities."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper mentions 'GPT-4o' and 'o1-preview' but provides no specific API version, snapshot date, or model identifier. Marketing names without version specifics do not count per the schema criteria."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper mentions 'Prompt Engineering and Fine-tuning' as a feature (Section I) and Algorithm 1 describes analyzing chunks with LLMs, but no actual prompt text is provided anywhere in the paper or supplementary materials."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No hyperparameters are reported — no temperature, top-p, max tokens, or any LLM API settings are mentioned. No fine-tuning hyperparameters are provided either."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Bugdar uses agentic scaffolding (code chunking, RAG context retrieval, judge LLM for best analysis selection per Algorithm 1), but the details are insufficient. How the judge LLM selects the 'best analysis' is not explained. The RAG retrieval strategy, chunk sizes, and context window management are not specified."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Section III.C mentions a dataset of 'real-world GitHub pull requests and source code with known security vulnerabilities' but does not describe how many examples there are, how they were selected, or how the ground truth was established beyond high-level mention of 'manual audits' and 'bug bounty reports.'"
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section V.A is titled 'Strengths, Challenges, and Limitation' and discusses several limitations including variation in effectiveness across languages, domain-specific code challenges, and reliance on training datasets."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The limitations in Section V.A are fairly generic: 'variation in Bugdar's effectiveness across different programming languages,' 'domain-specific code poses challenges,' 'reliance on training datasets, which could introduce biases.' These are general categories of concern, not specific threats tied to this study's design or findings."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the evaluation to specific languages, project types, or vulnerability categories, despite the evaluation being narrow in scope."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No raw data (vulnerability examples, pull request data, model outputs) is made available for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Section III.C states data came from 'real-world GitHub pull requests and source code with known security vulnerabilities' with ground truth from 'manual audits' and 'bug bounty reports,' but provides no specifics: how many examples, which projects, what time period, what inclusion criteria."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are recruited for the main evaluation. The 'user studies' in Section IV.D are too vaguely described to evaluate recruitment. The dataset is code-based, not human-participant-based."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The data pipeline from raw pull requests to evaluation dataset is not documented. How vulnerability labels were assigned, how many experts reviewed each example, and what the inter-annotator agreement was are all unstated."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The Acknowledgments section exists but is empty — no funding sources are disclosed. Four of five authors are affiliated with Mysten Labs, a blockchain company, and the tool is designed for their use case."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: John E. Naulty, Joy Wang, George Digkas, and Kostas Chalkias at Mysten Labs; Eason Chen at Carnegie Mellon University."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, but 4 of 5 authors work at Mysten Labs, which presumably benefits from positive results about Bugdar. The funder/employer is not independent of the outcome. The empty Acknowledgments section does not address this conflict."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interest declaration is present in the paper. Given that 4 authors work at Mysten Labs and the tool is presumably a Mysten Labs product, this omission is notable."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses GPT-4o and o1-preview to evaluate vulnerability detection capabilities on a benchmark dataset, but does not state the training data cutoff dates for these models."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No discussion of whether the vulnerability examples in the evaluation dataset could have appeared in GPT-4o or o1-preview's training data. This is relevant since bug bounty reports are often public."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The evaluation uses real-world vulnerabilities that may have been publicly available before the models' training cutoffs. This contamination risk is not addressed anywhere in the paper."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The paper mentions 'preliminary user studies' in Section IV.D but provides no structured study design. The user feedback is anecdotal and does not constitute a formal human subjects study requiring pre-registration."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The 'user studies' are informal developer feedback, not a formal human subjects study. No structured data collection from participants is described."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No formal human study with participants was conducted. The developer feedback in Section IV.D is anecdotal."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No formal human study was conducted. The developer feedback is informal and anecdotal."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No experimental study with human participants was conducted."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No experimental study with human participants was conducted."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No formal human study with participants was conducted."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "Algorithm 1 mentions 'Update Credits based on token usage' (step 11) but no actual cost figures, token counts, or API expenses are reported in the paper. Only wall-clock time (56.4 seconds per PR) is provided."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget, GPU hours, API spend, or hardware specifications are mentioned. The system calls GPT-4o and o1-preview APIs but the cost is not quantified."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "Bugdar processes pull requests at an average of 56.4 seconds per PR or 30 lines of code per second, which is 100x faster than human reviewers.",
    292       "evidence": "Section IV.B: Analyzed 14 pull requests with 23,644 lines in 790 seconds. Compared to internal data showing human review of 10,000 lines of Move code took two engineers 10 days (0.11 lines/sec).",
    293       "supported": "weak"
    294     },
    295     {
    296       "claim": "GPT-4o with RAG achieves the best vulnerability classification performance with 39% precision, 64% recall, and F1 of 0.49.",
    297       "evidence": "Table I shows the metrics for both models with and without RAG. GPT-4o+RAG has the highest F1 for classification.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "RAG improves vulnerability classification performance for both models.",
    302       "evidence": "Table I: GPT-4o classification F1 improved from 0.44 to 0.49 with RAG; o1-preview from 0.30 to 0.43.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "RAG has mixed effects on vulnerability description tasks.",
    307       "evidence": "Table I: GPT-4o description F1 decreased from 0.65 to 0.50 with RAG; o1-preview decreased from 0.57 to 0.53.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Bugdar can detect vulnerabilities that traditional static analysis tools miss.",
    312       "evidence": "Section IV.C mentions a case where Bugdar detected a reentrancy vulnerability in Solidity that 'other tools had overlooked,' but provides no systematic comparison.",
    313       "supported": "weak"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval",
    318     "case-study"
    319   ],
    320   "key_findings": "Bugdar is an AI-augmented code review tool that integrates into GitHub pull requests using GPT-4o and o1-preview with RAG for vulnerability detection. The best configuration (GPT-4o with RAG) achieves an F1 of 0.49 for vulnerability classification and 0.50 for description, with RAG improving classification but hurting description performance. The system processes code at approximately 30 lines per second (56.4 seconds per PR), which is substantially faster than manual review. However, overall detection accuracy remains modest with high false positive rates, the timing evaluation covers only 14 pull requests, the size of the vulnerability classification dataset is never stated, and there is no comparison to existing automated tools.",
    321   "red_flags": [
    322     {
    323       "flag": "Company evaluating its own product",
    324       "detail": "Four of five authors work at Mysten Labs, and Bugdar appears to be a Mysten Labs product used on their own codebase. The evaluation dataset likely comes from Mysten Labs internal projects. No conflict of interest statement is provided, and the Acknowledgments section is empty."
    325     },
    326     {
    327       "flag": "Very small evaluation sample",
    328       "detail": "The timing evaluation uses only 14 pull requests. The vulnerability classification dataset size is never stated — the total number of vulnerability examples used in Table I is unknown, making it impossible to assess the reliability of the reported metrics."
    329     },
    330     {
    331       "flag": "No baselines against existing tools",
    332       "detail": "The paper does not compare Bugdar against any existing static analysis tools (e.g., Slither, Mythril, Semgrep) despite claiming advantages over them. The only comparison is between two LLM configurations."
    333     },
    334     {
    335       "flag": "Misleading speed comparison",
    336       "detail": "The 100x speed claim compares Bugdar's automated processing time to a single anecdotal example of human review from 'internal data.' This comparison conflates speed with accuracy — Bugdar achieves only 0.49 F1 while the human review presumably catches more issues."
    337     },
    338     {
    339       "flag": "Abstract overstates findings",
    340       "detail": "The abstract claims Bugdar 'demonstrates exceptional efficiency,' 'reduces the reliance on manual reviews,' and 'enhances the security posture,' but the actual results show modest F1 scores (0.49 best), high false positive rates, and no evidence that it actually reduces manual review needs."
    341     },
    342     {
    343       "flag": "No uncertainty quantification",
    344       "detail": "All metrics are reported as single point estimates with no confidence intervals, error bars, or repeated runs. With an unstated sample size, it is impossible to assess the reliability of any reported metric."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Do you still need a manual smart contract audit?",
    350       "authors": ["I. David"],
    351       "year": 2023,
    352       "arxiv_id": "2306.12338",
    353       "relevance": "Evaluates GPT-4 and Claude on DeFi smart contract vulnerability detection, directly relevant to LLM capability in security code review."
    354     },
    355     {
    356       "title": "How far have we gone in vulnerability detection using large language models",
    357       "authors": ["Z. Gao", "H. Wang"],
    358       "year": 2023,
    359       "arxiv_id": "2311.12420",
    360       "relevance": "VulBench benchmark for LLM vulnerability detection showing LLMs outperform traditional deep learning approaches."
    361     },
    362     {
    363       "title": "Drawbacks in detecting vulnerabilities: A study of large language models on code",
    364       "authors": ["Z. Li", "R. Q. Shin"],
    365       "year": 2021,
    366       "arxiv_id": "2105.00823",
    367       "relevance": "Studies limitations of LLMs for vulnerability detection in code."
    368     },
    369     {
    370       "title": "Evaluating large language models trained on code",
    371       "authors": ["M. Chen"],
    372       "year": 2021,
    373       "arxiv_id": "2107.03374",
    374       "relevance": "Seminal Codex/HumanEval paper evaluating LLMs on code generation tasks."
    375     },
    376     {
    377       "title": "GPT-4 technical report",
    378       "authors": ["J. Achiam"],
    379       "year": 2023,
    380       "arxiv_id": "2303.08774",
    381       "relevance": "Technical report for GPT-4, the foundation model family used in Bugdar's evaluation."
    382     },
    383     {
    384       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    385       "authors": ["P. Lewis"],
    386       "year": 2020,
    387       "arxiv_id": "2005.11401",
    388       "relevance": "Foundational RAG paper, the retrieval augmentation technique used by Bugdar."
    389     },
    390     {
    391       "title": "Why don't software developers use static analysis tools to find bugs?",
    392       "authors": ["B. Johnson", "Y. Song"],
    393       "year": 2013,
    394       "relevance": "Studies developer adoption barriers for static analysis tools, motivating AI-augmented alternatives like Bugdar."
    395     },
    396     {
    397       "title": "GPTutor: a ChatGPT-powered programming tool for code explanation",
    398       "authors": ["E. Chen", "R. Huang"],
    399       "year": 2023,
    400       "relevance": "AI-powered programming tool using ChatGPT, relevant to LLM-based developer tools evaluation."
    401     }
    402   ]
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs