scan.json (18271B)
1 { 2 "paper": { 3 "title": "Towards Interactive Evaluations for Interaction Harms in Human-AI Systems", 4 "authors": [ 5 "Lujain Ibrahim", 6 "Saffron Huang", 7 "Umang Bhatt", 8 "Lama Ahmad", 9 "Markus Anderljung" 10 ], 11 "year": 2025, 12 "venue": "AAAI", 13 "arxiv_id": "2405.10632" 14 }, 15 "scan_version": 2, 16 "active_modules": [], 17 "methodology_tags": ["theoretical", "qualitative"], 18 "key_findings": "The paper argues that current AI safety evaluations are static, assume universal users, and lack construct validity for interaction harms that emerge through sustained human-AI engagement. It proposes three organizing principles for interactive evaluations: ecologically valid scenario design based on user objectives and interaction modes, causal tracing from model behavior to human impact with appropriate metrics, and structured human participation strategies balancing validity and practicality. The paper identifies open challenges including ethical human participation, researcher data access, evaluation infrastructure, and producing actionable findings.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No code repository or analysis scripts are mentioned or released. The paper is a conceptual framework paper but could have released supplementary materials." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No dataset or data artifacts are released. The paper could have released its literature corpus or taxonomy data." 30 }, 31 "environment_specified": { 32 "applies": false, 33 "answer": false, 34 "justification": "No computational experiments are conducted; this is a theoretical/position paper proposing a framework." 35 }, 36 "reproduction_instructions": { 37 "applies": false, 38 "answer": false, 39 "justification": "No experiments to reproduce; this is a conceptual framework paper." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": false, 45 "answer": false, 46 "justification": "No quantitative experiments are conducted. This is a theoretical paper proposing evaluation principles." 47 }, 48 "significance_tests": { 49 "applies": false, 50 "answer": false, 51 "justification": "No statistical comparisons are made; the paper is a conceptual framework." 52 }, 53 "effect_sizes_reported": { 54 "applies": false, 55 "answer": false, 56 "justification": "No quantitative results are reported." 57 }, 58 "sample_size_justified": { 59 "applies": false, 60 "answer": false, 61 "justification": "No empirical study is conducted." 62 }, 63 "variance_reported": { 64 "applies": false, 65 "answer": false, 66 "justification": "No experiments with multiple runs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": false, 72 "answer": false, 73 "justification": "No empirical evaluation is conducted; the paper proposes a conceptual framework rather than testing a system." 74 }, 75 "baselines_contemporary": { 76 "applies": false, 77 "answer": false, 78 "justification": "No empirical evaluation with baselines." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "No system with components to ablate." 84 }, 85 "multiple_metrics": { 86 "applies": false, 87 "answer": false, 88 "justification": "No empirical evaluation conducted." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "No system outputs to evaluate; this is a position paper." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "No datasets or evaluation splits used." 99 }, 100 "per_category_breakdown": { 101 "applies": false, 102 "answer": false, 103 "justification": "No quantitative results to break down." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 3 discusses specific failure cases of current evaluation approaches: static evaluations missing cumulative bias effects in hiring, empathetic responses leading to emotional dependency, and anthropomorphic behaviors emerging only after multiple turns." 109 }, 110 "negative_results_reported": { 111 "applies": false, 112 "answer": false, 113 "justification": "No experiments were conducted that could yield negative results." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims are appropriately hedged as proposals and arguments rather than empirical findings. The paper delivers on its promise to discuss limitations of current evaluations, present principles for interactive evaluations, and explore implementation challenges." 121 }, 122 "causal_claims_justified": { 123 "applies": false, 124 "answer": false, 125 "justification": "The paper makes no causal claims from its own data. It discusses causal mechanisms hypothetically (e.g., cumulative bias effects) and cites external studies for support." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper explicitly bounds scope: 'Our examples focus on LLMs and text interactions because they provide the richest available data for studying sustained human-AI engagement patterns. However, the principles discussed could be applied to multi-modal systems as they mature' (footnote 1)." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": false, 134 "answer": false, 135 "justification": "No empirical results are presented; this is a conceptual framework paper, so there are no results requiring alternative explanations." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": false, 139 "answer": false, 140 "justification": "No measurements are taken; this is a theoretical paper proposing evaluation principles." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": false, 146 "answer": false, 147 "justification": "No models are used in experiments." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "No prompting is used." 153 }, 154 "hyperparameters_reported": { 155 "applies": false, 156 "answer": false, 157 "justification": "No experiments are conducted." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used." 163 }, 164 "data_preprocessing_documented": { 165 "applies": false, 166 "answer": false, 167 "justification": "No data collection or preprocessing is performed; this is a conceptual paper drawing on existing literature." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5 ('Open challenges and ways forward') serves as a substantive limitations section, discussing ethical challenges, data access limitations, infrastructure gaps, and the limitations of controlled studies in capturing systemic impacts." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.4 specifically discusses limitations of the proposed approach: 'controlled evaluations of human-AI interaction may effectively measure individual-level effects (like individual manipulation or overreliance) while still missing broader systemic patterns.' Also discusses validity concerns with user simulation approaches (Section 5.1)." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper explicitly bounds scope to LLMs and text interactions (footnote 1) and acknowledges that interactive evaluations address individual-level effects but not systemic ones (Section 5.4)." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": false, 190 "answer": false, 191 "justification": "No data collected; this is a theoretical framework paper." 192 }, 193 "data_collection_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No data collection performed." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No participants recruited." 202 }, 203 "data_pipeline_documented": { 204 "applies": false, 205 "answer": false, 206 "justification": "No data pipeline exists." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding information is provided. The acknowledgments thank individuals for comments but do not mention funding sources." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: University of Oxford, Collective Intelligence Project, NYU Center for Data Science, OpenAI, and Centre for the Governance of AI." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. One author (Lama Ahmad) is affiliated with OpenAI, whose products are discussed in the paper. This potential conflict is not acknowledged." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement is present. An author from OpenAI co-authors a paper discussing OpenAI's Preparedness Framework without a conflict-of-interest disclosure." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "No pre-trained model is evaluated on any benchmark." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No benchmark evaluation is conducted." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "No benchmark evaluation is conducted." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "Theoretical/position paper; no method with inference costs." 290 }, 291 "compute_budget_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "Theoretical/position paper; no computation performed." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "Current AI evaluations are static, assume universal users, and lack construct validity for interaction harms.", 301 "evidence": "Section 3 provides detailed argumentation with references to prior work (Weidinger et al. 2023, Raji et al. 2021, Blodgett et al. 2021, Chang et al. 2024) and examples of cumulative bias, emotional dependency, and emergent anthropomorphic behaviors.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Interaction harms are compositional across conversation turns and cannot be detected through single-turn evaluations.", 306 "evidence": "Section 3 cites Ibrahim et al. (2025) showing anthropomorphic desires only appearing after multiple turns, and Phang et al. (2025) on emotional well-being impacts from sustained ChatGPT use.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Models exhibit differential behavior based on perceived user identity, including increased toxicity toward African American English.", 311 "evidence": "Cites Hofmann et al. (2024) in Nature and Ghandeharioun et al. (2025) showing models varying refusals based on perceived user identity.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Three organizing principles (scenario design, impact measurement, participation strategy) can address the limitations of current evaluations.", 316 "evidence": "Section 4 presents the framework drawing on HCI research traditions, but this is a proposal without empirical validation of the framework itself.", 317 "supported": "weak" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "Undisclosed conflict of interest", 323 "detail": "An author from OpenAI co-writes a paper that cites OpenAI's Preparedness Framework as an example of responsible evaluation practice, without any conflict-of-interest disclosure. The paper also discusses the need for AI developers to share data, which directly implicates OpenAI." 324 }, 325 { 326 "flag": "Framework proposed without empirical validation", 327 "detail": "The three organizing principles are proposed as a conceptual framework but are not tested or validated. The paper acknowledges this is groundwork, but the claims about the framework's utility are unsupported." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Sociotechnical safety evaluation of generative ai systems", 333 "authors": ["Laura Weidinger", "Maribeth Rauh", "Nahema Marchal"], 334 "year": 2023, 335 "arxiv_id": "2310.11986", 336 "relevance": "Comprehensive framework for evaluating safety of generative AI, identifies gaps in current evaluation approaches." 337 }, 338 { 339 "title": "SafetyPrompts: a Systematic Review of Open Datasets for Evaluating and Improving Large Language Model Safety", 340 "authors": ["Paul Röttger", "Federica Pernisi", "Bertie Vidgen", "Dirk Hovy"], 341 "year": 2024, 342 "arxiv_id": "2404.05399", 343 "relevance": "Systematic review of safety evaluation datasets revealing methodological gaps in LLM safety evaluation." 344 }, 345 { 346 "title": "Position: Evaluating Generative AI Systems is a Social Science Measurement Challenge", 347 "authors": ["Hanna Wallach", "Meera Desai", "A. Feder Cooper"], 348 "year": 2025, 349 "arxiv_id": "2502.00561", 350 "relevance": "Argues that AI evaluation requires social science measurement validity frameworks." 351 }, 352 { 353 "title": "Multi-turn Evaluation of Anthropomorphic Behaviours in Large Language Models", 354 "authors": ["Lujain Ibrahim"], 355 "year": 2025, 356 "arxiv_id": "2502.07077", 357 "relevance": "Demonstrates that safety-critical anthropomorphic behaviors emerge only after multiple conversation turns, validating the need for multi-turn evaluation." 358 }, 359 { 360 "title": "Investigating Affective Use and Emotional Well-being on ChatGPT", 361 "authors": ["Jason Phang"], 362 "year": 2025, 363 "arxiv_id": "2504.03888", 364 "relevance": "Large-scale study of chat logs paired with longitudinal surveys examining emotional well-being of ChatGPT users." 365 }, 366 { 367 "title": "Clio: Privacy-Preserving Insights into Real-World AI Use", 368 "authors": ["Alex Tamkin"], 369 "year": 2024, 370 "arxiv_id": "2412.13678", 371 "relevance": "Privacy-preserving analysis of real-world AI usage patterns, relevant to understanding actual human-AI interaction." 372 }, 373 { 374 "title": "Haicosystem: An ecosystem for sandboxing safety risks in human-ai interactions", 375 "authors": ["Xuhui Zhou"], 376 "year": 2024, 377 "arxiv_id": "2409.16427", 378 "relevance": "Framework for sandboxing and evaluating safety risks in human-AI interaction using simulations." 379 }, 380 { 381 "title": "Red teaming language models with language models", 382 "authors": ["Ethan Perez", "Saffron Huang"], 383 "year": 2022, 384 "arxiv_id": "2202.03286", 385 "relevance": "Foundational work on automated adversarial testing of language models." 386 }, 387 { 388 "title": "AI generates covertly racist decisions about people based on their dialect", 389 "authors": ["Valentin Hofmann", "Pratyusha Rani Kalluri", "Dan Jurafsky", "Sharese King"], 390 "year": 2024, 391 "relevance": "Nature paper showing LLMs exhibit biased responses toward African American English, demonstrating differential model behavior based on user characteristics." 392 }, 393 { 394 "title": "Who's asking? User personas and the mechanics of latent misalignment", 395 "authors": ["Asma Ghandeharioun"], 396 "year": 2025, 397 "relevance": "Shows models vary refusals of dangerous queries based on perceived user identity, demonstrating interaction-dependent safety failures." 398 }, 399 { 400 "title": "Evaluating Frontier Models for Dangerous Capabilities", 401 "authors": ["Mary Phuong"], 402 "year": 2024, 403 "arxiv_id": "2403.13793", 404 "relevance": "DeepMind's approach to evaluating dangerous capabilities in frontier models." 405 } 406 ] 407 }