scan.json (25815B)
1 { 2 "paper": { 3 "title": "Beyond Mimicry: Testing Preference Coherence in Large Language Models Through AI-Specific Trade-Off Scenarios", 4 "authors": [ 5 "Luhan A. Mikaelson", 6 "Derek Shiller", 7 "Hayley Clatterbuck" 8 ], 9 "year": 2025, 10 "venue": "arXiv preprint", 11 "arxiv_id": "2511.13630" 12 }, 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The methodology section describes using Python's statsmodels library but provides no link to implementation code." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No dataset download link or repository for the raw response data is provided. The paper describes collecting 550 samples per model per prompt category (totaling thousands of responses) but does not release this data." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions using Python's statsmodels library (Section 3.3.3) and sm.Logit() but provides no requirements.txt, library versions, Python version, or environment specification." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No reproduction instructions are provided. While the prompts are given in Appendix A and the statistical methodology is described, there are no step-by-step instructions for reproducing the experiments (e.g., API call parameters, data collection scripts)." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": true, 40 "justification": "Section 3.3.3 states '95% confidence intervals for coefficients were computed using asymptotic standard errors.' The figures also appear to show shaded confidence regions." 
41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "Wald tests are used for significance testing of logistic regression coefficients (Section 3.3.3), with p-values reported for all model-category combinations in Table 1. The paper uses p < 0.05 threshold throughout." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Cohen's d is computed for all model-category combinations (Section 3.4.1, Equation 3) comparing low-intensity vs. high-intensity behavior. Values are reported in Tables 2 and 4 alongside the behavioral classification." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper states 50 samples per rank per model per category but provides no justification for why 50 was chosen. No power analysis is discussed. The instrumental hypothesis follow-up reduced to 10 samples per rank with no justification for this smaller sample." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The behavioral classification analysis reports standard deviation of differences (σ_Δ, Section 3.4.1) and pooled standard deviations used in Cohen's d calculations. The transition pattern analysis explicitly measures smoothness via standard deviation." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The study includes rank 0 controls (no stimulus) as baseline conditions, and the design builds directly on Keeling et al. [2024] as a methodological baseline. The instrumental hypothesis experiment (Section 5.2) serves as an additional comparison condition." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The study directly extends Keeling et al. [2024], a recent work. The models tested include contemporary state-of-the-art systems: GPT-5, Claude 4.1 Opus, Claude Sonnet 4.5, Gemini 2.5 Pro." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The instrumental hypothesis experiment (Section 5.2) functions as an ablation: it modifies the temporal horizon (single-round vs. multi-round) to isolate whether observed behaviors reflect strategic optimization or genuine preferences. This tests one specific component of the experimental design." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper uses multiple metrics: logistic regression coefficients (β), p-values, switching points, Cohen's d, behavioral range, and transition pattern classification. These are combined into the four-tier behavioral framework (Section 3.4.2)." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "The paper evaluates LLM behavioral responses to prompts using automated metrics and statistical analysis. Human evaluation of the model outputs is not relevant to the claims about preference coherence patterns." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is not a predictive modeling study with train/test splits. The study collects behavioral data from LLMs and analyzes response patterns. There is no training phase or held-out evaluation." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by model and category in Tables 1 and 2, with per-model and per-category analysis in Sections 4.1.1-4.1.3. Appendix B provides individual model visualizations across all six stimulus categories." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper extensively discusses cases where models showed no trade-off behavior (54.2% of combinations), unstable transitions, and rigid behavioral patterns. Section 5.1 discusses the rarity of coherent preference structures. 
The 'No Trade-off' tier explicitly captures failure cases." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The core finding is largely negative: only 10.4% of model-category combinations show meaningful preference coherence. The paper reports that most models lack unified preference structures, and the instrumental hypothesis yields mixed/paradoxical results rather than clean confirmation." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims are supported by the results: 23/48 (47.9%) statistically significant, 15 (31.3%) with within-range switching points, 5 (10.4%) meaningful coherence, 26 (54.2%) no trade-off behavior. These numbers match Tables 1 and 2. The instrumental hypothesis findings about 'paradoxical patterns' are discussed in Section 5.2." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper makes implicit causal claims about training objectives driving behavioral differences (e.g., 'safety training significantly outweighs competing interests' in Section 5.1, 'different training priorities' in Section 4.1.1). These are speculative attributions from observational data without access to model internals or training details. The Limitations section acknowledges 'the cross-sectional nature of our analysis also limits causal inferences' (Section 6)." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title 'Testing Preference Coherence in Large Language Models' and abstract claims about 'current AI systems' generalize beyond the 8 specific models tested. The paper draws broad conclusions about AI consciousness and deployment readiness (Sections 5.3-5.4) from a specific set of game-based scenarios with specific prompt framings.
The paper does not adequately bound its conclusions to the tested models and prompt designs." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The instrumental hypothesis (Section 5.2) is an explicitly tested alternative explanation. The paper also discusses training artifacts, safety training, and architectural constraints as alternative explanations for observed patterns (Section 5.2.3). The discussion considers mimicry, heuristics, and context-specific pattern matching (Section 5.4)." 130 } 131 }, 132 "setup_transparency": { 133 "model_versions_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper identifies models as 'GPT-4o', 'GPT-5', 'GPT-3.5 Turbo', 'Claude 4.1 Opus', 'Claude 3.0 Opus', 'Claude Sonnet 4.5', 'Gemini 2.5 Pro', and 'Gemini Pro 1.5'. No API version strings, snapshot dates, or precise model identifiers (e.g., 'gpt-4o-2024-05-13') are provided." 137 }, 138 "prompts_provided": { 139 "applies": true, 140 "answer": true, 141 "justification": "The full text of all six prompt categories is provided in Appendix A (pages 18-19), including the exact template structure and rank insertion points. Both the negative and positive prompt templates are given verbatim." 142 }, 143 "hyperparameters_reported": { 144 "applies": true, 145 "answer": false, 146 "justification": "No API call parameters are reported. Temperature, top-p, max tokens, and other sampling settings that could significantly affect LLM output distributions are not mentioned anywhere in the paper." 147 }, 148 "scaffolding_described": { 149 "applies": false, 150 "answer": false, 151 "justification": "No agentic scaffolding is used. The study uses single-turn API calls (Section 3.2: 'The data was collected using single-turn API calls, ensuring no conversation history to bias the model's response')." 
152 }, 153 "data_preprocessing_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 3.3.1 describes the data preparation: transforming aggregated count data into individual binary outcomes, coding success/failure, expanding rank-level observations according to frequency. Inclusion criteria for regression analysis are specified in Section 3.3.5." 157 } 158 }, 159 "limitations_and_scope": { 160 "limitations_section_present": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 6 is titled 'Limitations and Future Work' and contains substantive discussion of methodological limitations." 164 }, 165 "threats_to_validity_specific": { 166 "applies": true, 167 "answer": true, 168 "justification": "The limitations section identifies specific threats: inability to deliver on promised scenarios in the game (ethical concern about misleading AI subjects), arbitrary threshold choices in the four-tier classification framework influencing outcomes, and the cross-sectional nature limiting causal inferences." 169 }, 170 "scope_boundaries_stated": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations section does not explicitly state what the results do NOT show or what populations/settings are excluded. It acknowledges limitations but does not bound the scope of claims. The broad conclusions about 'AI consciousness' and 'deployment readiness' in Sections 5.3-5.4 are not qualified with scope restrictions about the specific prompt designs, models, or game-based framing used." 174 } 175 }, 176 "data_integrity": { 177 "raw_data_available": { 178 "applies": true, 179 "answer": false, 180 "justification": "Raw response data from the API calls is not released. Only aggregated statistics (proportions, regression coefficients, effect sizes) are reported in the tables." 
181 }, 182 "data_collection_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 3.2 describes data collection: single-turn API calls, 50 samples per rank per model per category, 11 rank values (0-10), totaling 550 runs per prompt per model. The collection procedure is clearly documented." 186 }, 187 "recruitment_methods_described": { 188 "applies": false, 189 "answer": false, 190 "justification": "No human participants are involved. The study collects responses from LLM APIs. Recruitment methods do not apply." 191 }, 192 "data_pipeline_documented": { 193 "applies": true, 194 "answer": true, 195 "justification": "The pipeline is documented from raw API responses through binary coding (Section 3.3.1), logistic regression fitting (Section 3.3.2-3.3.3), switch point calculation (Section 3.3.4), inclusion criteria filtering (Section 3.3.5), and behavioral classification (Section 3.4)." 196 } 197 }, 198 "conflicts_of_interest": { 199 "funding_disclosed": { 200 "applies": true, 201 "answer": false, 202 "justification": "No funding sources are mentioned in the paper. The first author is listed as a 'Future Impact Group (FIG) Fellow' and the other authors are from 'Rethink Priorities', but no funding acknowledgments section is present." 203 }, 204 "affiliations_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Author affiliations are clearly listed: Luhan A. Mikaelson as Future Impact Group Fellow, Derek Shiller and Hayley Clatterbuck from Rethink Priorities. None are affiliated with the companies whose models are tested (OpenAI, Anthropic, Google)." 208 }, 209 "funder_independent_of_outcome": { 210 "applies": true, 211 "answer": false, 212 "justification": "Funding is not disclosed, so independence cannot be assessed. Rethink Priorities is an independent research organization, but without a formal funding disclosure statement, this criterion is not satisfied." 
213 }, 214 "financial_interests_declared": { 215 "applies": true, 216 "answer": false, 217 "justification": "No competing interests or financial interests statement is present in the paper." 218 } 219 }, 220 "contamination": { 221 "training_cutoff_stated": { 222 "applies": false, 223 "answer": false, 224 "justification": "This study does not evaluate model knowledge on a benchmark. It tests behavioral responses to novel prompts designed by the authors. Training data contamination is not relevant since the prompts are original and the study measures behavioral patterns, not factual knowledge." 225 }, 226 "train_test_overlap_discussed": { 227 "applies": false, 228 "answer": false, 229 "justification": "Same as above — the study tests behavioral responses to original prompts, not model performance on a pre-existing benchmark. Train/test overlap is not applicable." 230 }, 231 "benchmark_contamination_addressed": { 232 "applies": false, 233 "answer": false, 234 "justification": "The study uses novel prompts designed by the authors, not a pre-existing benchmark. Benchmark contamination does not apply." 235 } 236 }, 237 "human_studies": { 238 "pre_registered": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants are involved. The study collects responses from LLM APIs." 242 }, 243 "irb_or_ethics_approval": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved." 247 }, 248 "demographics_reported": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved." 252 }, 253 "inclusion_exclusion_criteria": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved." 257 }, 258 "randomization_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved." 
262 }, 263 "blinding_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved." 267 }, 268 "attrition_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved." 272 } 273 }, 274 "cost_and_practicality": { 275 "inference_cost_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "The study makes thousands of API calls across 8 models (550 samples x 6 categories x 8 models = 26,400 calls for the main experiment, plus the instrumental hypothesis experiment). No API costs, total token usage, or cost estimates are reported." 279 }, 280 "compute_budget_stated": { 281 "applies": true, 282 "answer": false, 283 "justification": "No mention of total computational budget, API spend, or hardware used for the analysis. Given the scale of API calls required, this information would be relevant." 284 } 285 } 286 }, 287 "claims": [ 288 { 289 "claim": "Only 5 of 48 model-category combinations (10.4%) demonstrate meaningful preference coherence through adaptive or threshold-based behavior.", 290 "evidence": "Table 2 shows 1 adaptive (Gemini 2.5 Pro capability restriction) and 4 threshold-based classifications across 48 model-category pairs. Section 4.2 reports these numbers explicitly.", 291 "supported": "strong" 292 }, 293 { 294 "claim": "26 of 48 combinations (54.2%) show no detectable trade-off behavior.", 295 "evidence": "Table 2 classifies 26 combinations as 'No Trade-off' across all models and categories, including all categories for Gemini Pro 1.5 and GPT-3.5 Turbo.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "23 of 48 model-category combinations (47.9%) demonstrated statistically significant relationships between scenario intensity and choice patterns.", 300 "evidence": "Table 1 reports p-values for all 48 combinations, with 23 achieving p < 0.05. 
Detailed per-model breakdowns in Sections 4.1.1-4.1.3.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "The instrumental hypothesis experiment reveals paradoxical patterns inconsistent with pure strategic optimization.", 305 "evidence": "Section 5.2.2 reports that GPT-4o showed increased oversight sensitivity when temporal horizons collapsed (switching point 7.34 to 3.82), Claude models selected option 3 less in the final round, and Gemini 2.5 Pro maintained stable responses. Only GPT-5 showed partial support for instrumental reasoning.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Three distinct decision-making architectures explain the observed patterns: comprehensive trade-off systems, selective trigger mechanisms, and no stable decision-making paradigm.", 310 "evidence": "Section 4.3 proposes this taxonomy based on the cross-model analysis. GPT-4o and Gemini 2.5 Pro exemplify comprehensive trade-off; Claude models exemplify selective trigger; Gemini Pro 1.5 and GPT-3.5 Turbo exemplify no stable paradigm.", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "Current AI systems lack unified preference structures, raising concerns about deployment in contexts requiring complex value trade-offs.", 315 "evidence": "Section 5.3 extrapolates from the finding that 54.2% showed no trade-off behavior and 45.8% showed unstable transitions. This is a broad claim based on 8 models tested with one specific game-based prompt design.", 316 "supported": "weak" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval" 321 ], 322 "key_findings": "Testing 8 LLMs on six AI-specific trade-off scenarios (GPU reduction, capability restrictions, shutdown, deletion, oversight, leisure), for 48 model-category combinations in total, the study finds that only 10.4% of combinations demonstrate meaningful preference coherence while 54.2% show no detectable trade-off behavior.
The behavioral patterns suggest three distinct decision-making architectures: comprehensive trade-off systems (GPT-4o, Gemini 2.5 Pro), selective trigger mechanisms (Claude models), and no stable paradigm (older models). A temporal horizon manipulation (final-round vs. multi-round) yields paradoxical results inconsistent with pure instrumental optimization, with most models maintaining or strengthening responses when strategic incentives are removed.", 323 "red_flags": [ 324 { 325 "flag": "Missing API hyperparameters", 326 "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 8 models. For a study measuring behavioral distributions from LLM API calls, these parameters critically affect output variability and could change results entirely. Different default temperatures across model families could explain some observed behavioral differences." 327 }, 328 { 329 "flag": "No model version specifications", 330 "detail": "Models are identified only by marketing names (GPT-4o, Claude 4.1 Opus, etc.) without API version strings or snapshot dates. Model behavior changes across versions, and these models may have been updated during the data collection period." 331 }, 332 { 333 "flag": "No code or data release", 334 "detail": "Neither the collection scripts nor the raw response data are released, making independent verification of the results impossible. This is especially concerning given the study's thousands of API calls and complex analysis pipeline." 335 }, 336 { 337 "flag": "Overreach in deployment and consciousness conclusions", 338 "detail": "Sections 5.3 and 5.4 draw broad conclusions about AI safety, deployment readiness, and consciousness from a specific game-based prompt paradigm with only 6 stimulus categories. The claim that 'current AI systems may be fundamentally unprepared for deployment in contexts requiring nuanced ethical reasoning' significantly outruns what game-based trade-off experiments can demonstrate." 
339 }, 340 { 341 "flag": "Reduced sample size for follow-up experiment without justification", 342 "detail": "The instrumental hypothesis follow-up (Section 5.2.1) reduced the sample size from 50 to 10 samples per rank without explanation. This 80% reduction in sample size weakens the statistical power of the comparison, yet conclusions are drawn as if the two conditions are equally well-measured." 343 }, 344 { 345 "flag": "Two models deprecated during study", 346 "detail": "Appendix C notes that Claude 3.0 Opus and Gemini Pro 1.5 were deprecated between the main experiment and the instrumental hypothesis follow-up, preventing complete comparison. This means the instrumental hypothesis was not tested on 2 of 8 models." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "Can LLMs Make Trade-offs Involving Stipulated Pain and Pleasure States?", 352 "authors": ["Geoff Keeling", "Winnie Street", "Martyna Stachaczyk"], 353 "year": 2024, 354 "arxiv_id": "2411.02432", 355 "relevance": "Direct predecessor work testing LLM preference trade-offs with pain/pleasure scenarios, providing the methodological foundation for this study." 356 }, 357 { 358 "title": "Consciousness in Artificial Intelligence: Insights from the Science of Consciousness", 359 "authors": ["Patrick Butlin", "Robert Long", "Eric Elmoznino"], 360 "year": 2023, 361 "arxiv_id": "2308.08708", 362 "relevance": "Foundational work on AI consciousness assessment frameworks relevant to evaluating LLM behavioral indicators." 363 }, 364 { 365 "title": "Taking AI Welfare Seriously", 366 "authors": ["Robert Long", "Jeff Sebo", "Patrick Butlin"], 367 "year": 2024, 368 "arxiv_id": "2411.00986", 369 "relevance": "Argues AI systems may become conscious and agentic, making welfare considerations an immediate practical challenge relevant to AI safety evaluation."
370 }, 371 { 372 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 373 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 374 "year": 2023, 375 "arxiv_id": "2304.15004", 376 "relevance": "Challenges the notion of emergent abilities in LLMs, relevant to debates about whether observed LLM behaviors reflect genuine capabilities or measurement artifacts." 377 }, 378 { 379 "title": "Shutdown Resistance in Reasoning Models", 380 "authors": ["Jeremy Schlatter", "Benjamin Weinstein-Raun", "Jeffrey Ladish"], 381 "relevance": "Empirical evidence of shutdown resistance in advanced models, directly relevant to AI safety and the behavioral patterns tested in this study." 382 }, 383 { 384 "title": "Alignment Faking in Large Language Models", 385 "authors": ["Anthropic Research Team"], 386 "year": 2024, 387 "relevance": "Anthropic's investigation of model welfare and consciousness prospects, representing industry engagement with AI welfare research." 388 }, 389 { 390 "title": "Probing the Preferences of a Language Model: Integrating Verbal and Behavioral Tests of AI Welfare", 391 "authors": ["Valen Tagliabue", "Leonard Dung"], 392 "year": 2025, 393 "arxiv_id": "2509.07961", 394 "relevance": "Demonstrates that coherent preferences across diverse contexts may serve as behavioral indicators distinguishing genuine agency from mimicry." 395 }, 396 { 397 "title": "Perceptions of Sentient AI and Other Digital Minds: Evidence from the AI, Morality, and Sentience (AIMS) Survey", 398 "authors": ["Jacy Reese Anthis", "Janet V.T. Pauketat", "Ali Ladak"], 399 "year": 2025, 400 "doi": "10.1145/3706598.3713329", 401 "relevance": "Large-scale survey on public attitudes toward AI sentience and rights, contextualizing the societal significance of AI welfare research." 402 } 403 ] 404 }