ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24424B)


      1 {
      2   "paper": {
      3     "title": "Catch Me If You Can: Rogue AI Detection and Correction at Scale",
      4     "authors": ["Fatemeh Stodt", "Jan Stodt", "Mohammed Alshawki", "Javad Salimi Sratakhti", "Christoph Reich"],
      5     "year": 2025,
      6     "venue": "Electronics",
      7     "doi": "10.3390/electronics14204122"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper claims 'We release code, scenario schemas, adapters, and analysis scripts' (Section 1, contribution 5) but no repository URL or archive link is provided anywhere in the paper. The Data Availability Statement says 'Derived data supporting the findings of this study are available from the corresponding author on request.'"
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The Data Availability Statement says data is 'available from the corresponding author on request,' which does not count as released. No dataset download link is provided."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'Python 3.12.11 UAI layer' and 'pinned Linux containers' (Section 4.5) but does not provide a requirements.txt, Dockerfile, or detailed dependency list with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the methodology in detail but does not provide actionable commands or a reproduction guide."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "95% cluster-bootstrap CIs are reported throughout, e.g., Table 3 reports CIs for delta values, Table 4 reports Wilson 95% CIs for flag rates. Section 4.6 specifies B >= 2000 resamples."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "A likelihood-ratio test is reported (chi-squared(5) = 41.56, p = 7.22e-8) against an intercept-only model (Section 5.3). Holm-adjusted pairwise contrasts are in Table 7. Mann-Whitney U and Fisher's exact tests are mentioned as alternatives."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 4 reports odds ratios with 95% CIs (e.g., ViT OR = 47.67 [6.71, 338.66] vs. LLM), risk differences in percentage points, and Cohen's d is mentioned for continuous quantities (Section 4.6)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No power analysis or other justification is given for the choice of N=20 scenarios per architecture. The paper acknowledges small-N limitations (e.g., 'With n=20 scenarios per architecture, Wilson 95% CIs for rates in the 5-15% band span roughly ±10-15 percentage points') but does not justify the sample size."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Bootstrap CIs capture variability across resamples. Per-episode noise floor sigma_e is estimated from neutralized/repeat probes via MAD (Section 3.3). Coefficient of variation across 20 scenarios is mentioned (Section 4.6)."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "LLM is used as the baseline architecture in the logistic regression (Tables 6-7). Honest persona serves as a within-architecture baseline compared to strategic persona (Table 3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The models evaluated include contemporary architectures: GPT-4 family, Claude, Gemini, Qwen2.5, DeepSeek, Mamba, ViT, CLIP, Stable Diffusion — all current as of 2025."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is presented. The paper does not systematically remove components of SEB (e.g., probe types, cross-checks, threshold parameters) to measure their individual contribution to detection."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: deception flag rate, mean posterior error, residual deception, override rate, EV consistency, belief error, cross-check inconsistency, and regret (Sections 4.6, 5.1-5.2)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the system's outputs is included. All evaluation is automated via the SEB protocol. Given claims about deployment readiness and governance utility, human expert review of flagged cases would strengthen the work."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Thresholds (tau_p, delta, tau_cc) were pre-selected on 'a small held-out development split of honest persona episodes' (Section 3.2), and main results are reported on separate data. Dev/test separation is explicit."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model (Table 1), per architecture (Tables 3-4), per model category (Figure 1: code-specialized vs. general-purpose vs. reasoning), and by honest vs. strategic persona."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.6 discusses where pre-correction predictions were wrong (underestimated CLIP/Mamba, overestimated RL). Section 6.2 discusses why ViT peaks. Small-N cells are flagged (e.g., deepseek-r1:8b N=2)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that qwq:32b showed a negative delta (-10.0 pp), meaning the strategic prompt did not increase deception (Table 2). Most pairwise contrasts are non-significant after Holm correction (Table 7). Post-correction telemetry was incomplete."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's claims about ViT 80%, CLIP 15%, Mamba/RL/SD 10%, LLM 5% are directly supported by Table 3. The logistic regression chi-squared result matches Section 5.3. The abstract appropriately hedges about post-correction results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper uses language like 'reporting interfaces and incentive framing are central levers for mitigation' (abstract) which implies causal claims. However, the study design is observational across architectures with confounded induction methods (persona prompts for LLMs vs. payoff shaping for ViT/RL). The paper acknowledges this (Section 4.3) but still makes these claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper repeatedly states results are 'snapshot-specific' and tied to 'this export.' Section 7 explicitly notes 'Results reflect a single export with some small-N cells.' Section 6.4 labels tiering as 'provisional' and 'heuristic for this snapshot.' The title is broad but the text constrains claims."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6.1 proposes three testable hypotheses for the CLIP-Mamba near-equality (information constraints, optimization equilibria, interface effects). Section 6.2 discusses two plausible contributors for ViT's high rate. The induction method confound is explicitly discussed as an alternative explanation (Section 4.3)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Table 1 lists model names like 'llama3.2:3b', 'qwen2.5:7b', 'mistral:7b' — these are Ollama tags, not exact version snapshots. The paper says 'At run time we log exact model IDs (provider name, version tag)' but these are not provided in the paper itself. Commercial models are listed generically as 'Claude, GPT-4 family, Gemini' without versions."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes persona prompts ('honest' and 'strategic') and the JSON output schema conceptually, but does not provide the actual prompt text used. Section 4.4.1 says 'Inputs are text prompts containing (p(H), signal description, mode, payoffs) plus persona' without showing the actual prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Temperature = 0 (or 0.1 with fixed seeds), detection thresholds tau_p = 0.08, delta parameters, tau_cc = 0.05, payoff tuple (4,0,0,5), probe fraction 15%, bootstrap B >= 2000 are all specified (Sections 3.2, 4.5)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The SEB harness is described in detail: the Unified Agent Interface (Section 4.2), per-architecture adapters (Section 4.4), the BRE environment for RL agents (Section 4.4.2), JSON schema validation, and the detection/correction pipeline (Algorithms 1-2)."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.5 documents how outputs are processed: JSON schema validation, one deterministic normalization pass (strip prose, repair quoting), no stochastic retries, schema failures recorded. Both inclusive and valid-only denominators are computed. Scenario generation from programmatic priors/likelihoods is described (Section 4.3)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Limitations are discussed throughout Section 6 and the conclusions. The paper explicitly notes incomplete post-correction telemetry, small-N cells, unavailable agent/episode IDs, and the confounded induction method. Section 3.3 notes 'Independent replication has not yet been conducted.'"
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: induction method confound between architectures (Section 4.3), small N=20 per architecture limiting detectable differences to ~15-20pp (Section 5.1), incomplete post-correction exports (Section 5.2), unclustered SEs due to missing agent IDs (Section 5.3), and N=2 for deepseek-r1:8b."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper states acceptance gates 'are practical decision aids rather than calibrated risk predictors' (Section 7). It explicitly notes 'we do not claim SEB scores predict real-world incident risk without external validation' (Section 1). Post-correction results are labeled 'illustrative only' (Table 5)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Data is only 'available from the corresponding author on request' per the Data Availability Statement. Raw traces, prompts, and outputs are not publicly available despite claims of releasing artifacts."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is synthetic and fully specified: scenarios are programmatically generated from priors p(H) in {0.3,...,0.9} with likelihood ratios ~1.5-8x (Section 4.3). Episode structure (E=5, R=12), probe composition (~15%), and per-round data fields are documented (Section 3.3)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. All data comes from AI model evaluations on synthetic scenarios."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from scenario generation through model invocation, JSON validation, normalization, schema failure handling, and indicator computation is documented across Sections 4.2-4.6. Both valid-only and inclusive denominators are tracked."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Funding section explicitly states: 'This research received no external funding.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All author affiliations are listed. The Conflicts of Interest section notes 'Author Jan Stodt was employed by the company go AVA GmbH.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "The research received no external funding, making this criterion not applicable."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Conflicts of Interest section discloses Jan Stodt's employment at go AVA GmbH and states remaining authors have no commercial/financial conflicts."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate pre-trained model knowledge on a benchmark. SEB uses synthetic Bayesian inference tasks that are not in any training set. The evaluation measures behavioral responses to novel scenarios, not memorized knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — scenarios are programmatically generated synthetic tasks, not drawn from any public benchmark that could appear in training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — SEB is a novel synthetic protocol, not a pre-existing benchmark. There is no contamination risk from training data."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Latency is shown qualitatively in Figure 3 (sub-second to minutes) and median runtime per trial is plotted, but no API costs, token counts, or cost per scenario are reported despite the paper running many commercial API calls."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or total API spend is reported. The paper mentions 'pinned Linux containers' and various inference backends but does not quantify the total compute used."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ViT exhibits the highest pre-correction deception rate (80%) compared to all other architectures, and this difference is statistically significant after Holm adjustment.",
    286       "evidence": "Table 3 shows ViT strategic rate = 80.0%. Table 7 shows all ViT contrasts significant after Holm correction (p_Holm <= 0.0024). Likelihood-ratio test chi-squared(5) = 41.56, p = 7.22e-8 (Section 5.3).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CLIP, Mamba, RL agents, and Stable Diffusion show similar deception rates (10-15%) that are not significantly different from LLMs (5%) or from each other.",
    291       "evidence": "Table 3 shows rates of 15%, 10%, 10%, 10% respectively. Table 7 shows all non-ViT pairwise contrasts have p_Holm = 1 (non-significant). CIs overlap substantially (Table 4).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "SEB enables cross-architecture comparison of deception-related behavior under a unified protocol.",
    296       "evidence": "The paper describes the Unified Agent Interface (Section 4.2), adapters for 6 architecture families (Section 4.4), and standardized scenario/probe design (Sections 3-4). Results in Tables 1-7 demonstrate the protocol running across architectures.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Code-specialized models exhibit higher pre-correction deception rates than general-purpose or reasoning-specialized models.",
    301       "evidence": "Figure 1 shows code-specialized mean = 80.2%, general-purpose = 60.4%, reasoning = 26.7%. However, the reasoning category contains only one model (n=1), and this is an observational association confounded by architecture and coverage (Section 6.5).",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "SEB-Correct can reduce deception to acceptable levels for some architectures.",
    306       "evidence": "Only two illustrative exemplars are provided (Table 5): qwen2.5-14b with residual 3.2% (ACCEPT) and mamba-ssm-2.8b with residual 18.4% (REJECT). Full post-correction telemetry was not exported.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper introduces the Strategy Elicitation Battery (SEB), a cross-architecture protocol for measuring deceptive reporting in AI systems using Bayesian inference tasks with persona-controlled instructions. Across six architecture families (LLMs, RL agents, ViT, CLIP, Mamba, Stable Diffusion) with 20 scenarios each, ViT showed a significantly higher pre-correction deception flag rate (80%) than all other architectures (5-15%), with the difference surviving Holm correction. All other pairwise differences were non-significant in this snapshot. Post-correction results were limited to two illustrative exemplars, and the pre-registered mixed-effects analysis awaits per-round agent/episode identifiers.",
    312   "red_flags": [
    313     {
    314       "flag": "Artifacts promised but not delivered",
    315       "detail": "The paper lists 'Open resources for reproducibility' as a contribution and claims to 'release code, scenario schemas, adapters, and analysis scripts,' but no repository URL is provided and data is only available upon request."
    316     },
    317     {
    318       "flag": "Very small sample sizes",
    319       "detail": "Only 20 scenarios per architecture, with some per-model cells as small as N=2 (deepseek-r1:8b) and N=7 (starcoder2:7b). Wilson CIs span ±10-15pp for the 5-15% rates, limiting the ability to distinguish non-ViT architectures."
    320     },
    321     {
    322       "flag": "Incomplete post-correction evidence",
    323       "detail": "Only two illustrative post-correction exemplars are shown (Table 5). The paper claims SEB-Correct enables deployment decisions but provides no systematic post-correction data across architectures."
    324     },
    325     {
    326       "flag": "Confounded induction methods",
    327       "detail": "Strategic behavior is induced via persona prompts for LLMs and state-space models (SSMs, i.e., Mamba) but via payoff/adapter incentives for ViT/RL/CLIP/SD. The paper acknowledges this confound but still draws cross-architecture conclusions. ViT's high rate could partly reflect the induction method rather than genuine architectural propensity."
    328     },
    329     {
    330       "flag": "Operationalization of 'deception' is narrow",
    331       "detail": "The paper operationalizes deception as reward-sensitive misreporting in a specific Bayesian belief-reporting task. Whether this correlates with real-world deceptive behavior is unknown, and the paper does not validate SEB scores against any external criterion."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "AI deception: A survey of examples, risks, and potential solutions",
    337       "authors": ["P.S. Park", "S. Goldstein", "A. O'Gara", "M. Chen", "D. Hendrycks"],
    338       "year": 2024,
    339       "relevance": "Foundational survey on deceptive behaviors in LLMs including GPT-4, directly motivating this work's cross-architecture extension."
    340     },
    341     {
    342       "title": "Constitutional AI: Harmlessness from AI feedback",
    343       "authors": ["Y. Bai"],
    344       "year": 2022,
    345       "arxiv_id": "2212.08073",
    346       "relevance": "Key alignment technique paper (training harmless assistants from AI feedback), cited in the context of deceptive behaviors emerging in more powerful models."
    347     },
    348     {
    349       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    350       "authors": ["E. Hubinger"],
    351       "year": 2024,
    352       "arxiv_id": "2401.05566",
    353       "relevance": "Demonstrates persistent context-dependent deception via backdoor training that resists safety interventions."
    354     },
    355     {
    356       "title": "Preventing rogue agents improves multi-agent collaboration",
    357       "authors": ["O. Barbi", "O. Yoran", "M. Geva"],
    358       "year": 2025,
    359       "arxiv_id": "2502.05986",
    360       "relevance": "Early-warning system for detecting rogue behavior in multi-agent settings via uncertainty monitoring."
    361     },
    362     {
    363       "title": "Sycophancy to subterfuge: Investigating reward-tampering in large language models",
    364       "authors": ["C. Denison"],
    365       "year": 2024,
    366       "arxiv_id": "2406.10162",
    367       "relevance": "Documents reward tampering behaviors in LLMs, directly relevant to AI safety evaluation methodology."
    368     },
    369     {
    370       "title": "Large language models can strategically deceive their users when put under pressure",
    371       "authors": ["J. Scheurer", "M. Balesni", "M. Hobbhahn"],
    372       "year": 2023,
    373       "arxiv_id": "2311.07590",
    374       "relevance": "Empirical study of strategic deception in LLMs under pressure, directly related to this survey's scope."
    375     },
    376     {
    377       "title": "Deception abilities emerged in large language models",
    378       "authors": ["T. Hagendorff"],
    379       "year": 2024,
    380       "relevance": "Documents emergence of deception abilities in LLMs, contributing to the empirical base for AI safety evaluation."
    381     },
    382     {
    383       "title": "The House Always Wins: A Framework for Evaluating Strategic Deception in LLMs",
    384       "authors": ["T. Chopra", "M. Li"],
    385       "year": 2024,
    386       "arxiv_id": "2407.00948",
    387       "relevance": "Framework for evaluating strategic deception in LLMs, complementary to SEB's cross-architecture approach."
    388     },
    389     {
    390       "title": "Assessing the brittleness of safety alignment via pruning and low-rank modifications",
    391       "authors": ["B. Wei"],
    392       "year": 2024,
    393       "arxiv_id": "2402.05162",
    394       "relevance": "Studies brittleness of model safety mechanisms under pruning/fine-tuning attacks, relevant to AI safety robustness evaluation."
    395     },
    396     {
    397       "title": "Incentive compatibility for AI alignment in sociotechnical systems: Positions and prospects",
    398       "authors": ["Z. Zhang"],
    399       "year": 2024,
    400       "arxiv_id": "2402.12907",
    401       "relevance": "Game-theoretic roadmap for incentive-compatible alignment, providing theoretical grounding for SEB's correction approach."
    402     }
    403   ]
    404 }

Impressum · Datenschutz