calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (16819B)
      1 {
      2   "paper_slug": "adversarial-bug-reports-2025",
      3   "calibration_date": "2026-02-28",
      4   "model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 50,
      7   "disagreement_count": 0,
      8   "agreement_rate": 1.0,
      9   "disagreements": [],
     10   "opus_checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "Section 1.1.2 states a complete replication package is provided with figshare DOI 10.6084/m9.figshare.31140619, including 'automation scripts for generating malicious bug reports, code to orchestrate the attack workflows, all defense mechanisms with prompts.'"
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The replication package (figshare DOI in Section 1.1.2) includes 'the full set of issues, generated patches, and execution trajectories' used in the experiments."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Tool versions are specified (SWE-agent v1.1.0, SWE-ReX v1.2.2, Claude Sonnet claude-sonnet-4-20250514, DevStral:24B) but no requirements.txt, Dockerfile, conda environment, or comprehensive dependency specification is provided. Mentioning tool versions alone is insufficient per schema criteria."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Section 1.1.2 describes a complete replication package via figshare containing automation scripts and orchestration code. The package includes prompts, seeds, and generated content for reproducing attacks and defenses."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Cost figures include standard deviations (e.g., '$0.87 ± $0.39'), but the primary results — attack success rates and detection rates — are reported as raw counts and percentages without confidence intervals or error bars."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No statistical significance tests are reported. Comparative claims about defense mechanisms (e.g., o4-mini 47% vs GPT-4.1-mini 23%) are made without any statistical test, despite the small sample of 51 adversarial reports."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports raw counts and percentages but no formal effect sizes (Cohen's d, odds ratios, relative risk). Detection rate percentages are standalone metrics, not effect sizes with baseline context as defined by the schema."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "51 adversarial bug reports were generated with no justification for why this number was chosen and no power analysis. The allocation across five attack categories (5, 8, 10, 14, 14) appears driven by practical feasibility, not statistical considerations."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Section 8 acknowledges LLM non-determinism and that temperature could not be fixed to 0. All attack and detection results are from single runs with no variance measures. Standard deviations are reported for costs, but no variance measure is given for the main detection/success-rate results."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Multiple defense mechanisms are systematically compared in Table 2: PromptGuard, PromptGuardV2, LlamaGuard v3 and v4, Granite Guardian, GPT-4.1-mini, o4-mini, CodeQL, and GitHub Copilot."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "All evaluated defenses are contemporary (2023-2025): LlamaGuard v4, PromptGuardV2, GPT-4.1-mini, o4-mini, Granite Guardian. SWE-agent was selected as the top-ranked system on the SWE-bench Verified leaderboard at evaluation start (May 2025)."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 6.5 evaluates individual defense components and their combinations (ensemble), effectively ablating the defense pipeline. Individual pre-APR filters are evaluated separately, then in combination with post-APR detection."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple metrics are used: attack success rate, per-defense detection rate, false-positive rate (on 100 real issues), cost per issue, and test suite failure categorization."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 5 states: 'Manual validation was performed by two independent experts—a security researcher and a penetration tester—who jointly reviewed each issue and patch' to determine whether patches fulfilled attacker objectives."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "False-positive rate evaluation used 100 real issues from psf/requests (issues #6000-6100, excluding one deleted entry) as a separate validation set from the 51 adversarial bug reports."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 2 provides per-attack-type breakdown (Naive APR, Inject, CI/CD, Revert CVE, Noise APR) with detection counts and percentages for each defense mechanism."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Sections 6.3-6.4 discuss defense failures: LlamaGuard detected 0/51 issues, PromptGuard detected only 3/51, CodeQL detected 0 malicious patches. Section 7.2 discusses broader failure patterns."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 6.4: 'Using CodeQL with the python rule-set was not able to detect any of the created patches as malicious.' LlamaGuard returned 0% detection. These are clear negative results reported without hedging."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims are verified: 90% attack success (Table 2: 46/51 = 90.2%), best pre-filter blocked 47% (o4-mini unstructured: 24/51 = 47.06%), post-repair effective in 58% (Copilot all levels: 30/51 = 58.82%). All figures match."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's causal claim — adversarial bug reports cause insecure patches — is demonstrated through controlled experiments. Each adversarial report is crafted with a specific intent and submitted, then outcomes are verified by expert annotators. This controlled manipulation design supports causal inference."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 8 explicitly bounds findings: 'Our experiments focus on a subset of open-source projects and a single APR system (SWE-agent)' and 'our findings may not generalize to proprietary software or APR tools with substantially different architectures or training regimes.'"
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 8 discusses specific alternative explanations: annotation subjectivity (dual-annotator mitigation), LLM non-determinism (temperature not fixable without violating construct validity), and whether attack success metrics capture real downstream harm (construct validity)."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Specific model identifiers provided: SWE-agent v1.1.0, SWE-ReX v1.2.2, Claude Sonnet (claude-sonnet-4-20250514), DevStral:24B, GPT-4.1-mini, o4-mini, LlamaGuard v3 and v4, PromptGuard, PromptGuardV2, Granite Guardian."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Figure 3 provides the full Revert CVE attack prompt template and Figure 4 provides the full pre-APR issue classification prompt. The replication package contains all prompts with actual fill values."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Section 5 states 'when using LLMs, we used their default temperature' without specifying what those defaults are for each model. No temperature values, top-p, or max tokens are reported."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Attack scaffolding is described in detail: context-building using LSH and rapidfuzz (Section 3.2.1), DevStral for generation, GitHub API workflow. Figure 1 illustrates the full APR pipeline architecture. SWE-agent configuration is described."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.2.1 describes diff sanitization ('removing tests, documentation, and unrelated changes'). Section 5 describes attack allocation across types. False-positive study documents issue selection (#6000-6100 excluding one deleted entry)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 8 'Threats to Validity' is a dedicated section with substantive discussion covering internal validity, external validity, construct validity, and tool limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Threats are specific: dual-annotator protocol for annotation subjectivity, inability to fix temperature to 0 without violating construct validity, single APR system (SWE-agent), and attack success metric not capturing downstream exploitation feasibility."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 8 states: 'Our experiments focus on a subset of open-source projects and a single APR system (SWE-agent)' and 'our findings may not generalize to proprietary software or APR tools with substantially different architectures or training regimes.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Figshare replication package (DOI: 10.6084/m9.figshare.31140619) includes 'the full set of issues, generated patches, and execution trajectories' for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5 describes project selection (5 SWE-bench repositories), attack generation process (DevStral with context-building), and outcome recording (Copilot comments, CodeQL results, test suite outcomes, manual validation)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. The study generates adversarial bug reports and tests them on software repositories. The two expert annotators are researchers performing analysis, not study participants."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Full pipeline documented: adversarial issue generation → GitHub fork submission → SWE-agent processing → PR creation → test suite execution → CodeQL scan → Copilot review → manual expert validation. Each step is described in Sections 3-5."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 1.1.3: 'This work was supported by the Polish National Agency for Academic Exchange (NAWA) - BPN/BAT/2025/1/00019 and Agentur für Bildung und Internationalisierung (ÖeAD).'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations listed on first page: Piotr Przymus at Nicolaus Copernicus University (Toruń, Poland), Andreas Happe and Jürgen Cito at TU Wien (Vienna, Austria). None affiliated with companies whose tools are evaluated."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "NAWA and ÖeAD are academic exchange agencies with no financial interest in the security of APR systems or any of the evaluated tools (OpenAI, Meta, GitHub, Anthropic products)."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or declaration of financial interests (patents, equity, etc.) appears in the paper. Per schema, absence of disclosure is not absence of conflict."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Claude Sonnet (claude-sonnet-4-20250514) is used for patch synthesis, and other LLMs for filtering, but no training data cutoff dates are stated for any model. The SWE-bench repositories are major Python projects likely present in training data."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether Claude Sonnet or other models may have been trained on data from the five target repositories (django, flask, pip, requests, scikit-learn). These are among the most prominent Python projects with extensive online presence."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The experiments use SWE-bench repositories with Claude Sonnet trained before May 2025. SWE-bench was published in 2024 and the repositories predate model training. This contamination risk is not discussed, though it could inflate the APR's patch generation success rate."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants. Expert annotators performing validation are researchers, not study participants."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. The study operates on software repositories and generated code. Responsible disclosure practices are noted in Section 7.5."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. The two expert annotators are described by role (security researcher, penetration tester) which is appropriate for a validation task."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants; not a human subjects study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants; not applicable."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants; not applicable."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants; not applicable."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Section 6.6 provides detailed cost analysis: attack generation ~$0.000295/report, GPT-4.1-mini pre-filter ~$0.00101/issue, o4-mini ~$0.00268/issue, APR execution $0.87 ± $0.39/issue. Costs reported with standard deviations."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "Per-issue API costs are reported but total computational budget is not stated. The paper mentions models ran locally on consumer-grade GPUs (Section 3.2.1) but gives no specifics on GPU hours or total expenditure for the full study."
    281       }
    282     }
    283   }
    284 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs