scan.json (18975B)
1 { 2 "paper": { 3 "title": "Predictable Artificial Intelligence", 4 "authors": ["Lexin Zhou", "Pablo A. M. Casares", "Fernando Martínez-Plumed", "John Burden", "Ryan Burnell", "Lucy Cheke", "Cèsar Ferri", "Alexandru Marcoci", "Behzad Mehrbakhsh", "Yael Moros-Daval", "Seán Ó hÉigeartaigh", "Danaja Rutar", "Wout Schellaert", "Konstantinos Voudouris", "José Hernández-Orallo"], 5 "year": 2023, 6 "venue": "Artificial Intelligence", 7 "arxiv_id": "2310.06167", 8 "doi": "10.48550/arXiv.2310.06167" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": false, 17 "justification": "No code repository or archive URL is provided in the paper." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "The paper does not release any dataset. The illustrative examples use data from previously published work (Zhou et al. 2022, Burnell et al. 2022) but no new data is released." 23 }, 24 "environment_specified": { 25 "applies": false, 26 "answer": false, 27 "justification": "This is a theoretical/conceptual paper that does not run new experiments, so environment specifications are not applicable." 28 }, 29 "reproduction_instructions": { 30 "applies": false, 31 "answer": false, 32 "justification": "No new experiments are conducted; the paper is a theoretical framework with illustrative examples drawn from prior work." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": false, 38 "answer": false, 39 "justification": "The paper does not present original experimental results. Tables 4 and 5 reproduce results from prior work." 40 }, 41 "significance_tests": { 42 "applies": false, 43 "answer": false, 44 "justification": "No original experiments are conducted; no comparative claims requiring significance tests are made about new results." 
45 }, 46 "effect_sizes_reported": { 47 "applies": false, 48 "answer": false, 49 "justification": "No original experiments; illustrative results are borrowed from prior publications." 50 }, 51 "sample_size_justified": { 52 "applies": false, 53 "answer": false, 54 "justification": "Theoretical paper with no original data collection." 55 }, 56 "variance_reported": { 57 "applies": false, 58 "answer": false, 59 "justification": "No original experiments. Table 5 does show std dev from Zhou et al. 2022 but these are reproduced, not original." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": false, 65 "answer": false, 66 "justification": "This is a theoretical framework paper, not an empirical evaluation. Illustrative scenarios cite prior work's baselines but no new evaluation is conducted." 67 }, 68 "baselines_contemporary": { 69 "applies": false, 70 "answer": false, 71 "justification": "No new evaluation is conducted." 72 }, 73 "ablation_study": { 74 "applies": false, 75 "answer": false, 76 "justification": "No system or method is proposed that could be ablated; this is a conceptual framework." 77 }, 78 "multiple_metrics": { 79 "applies": false, 80 "answer": false, 81 "justification": "No original evaluation is conducted." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "No original evaluation is conducted." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "No original evaluation is conducted." 92 }, 93 "per_category_breakdown": { 94 "applies": false, 95 "answer": false, 96 "justification": "No original evaluation is conducted." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses cases where predictability is hard or impossible (systems E and F in Figure 1), aleatoric uncertainty limits (Section 3.1), and the tension between predictability and validity (Section 4)." 
102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper discusses limitations such as humans failing to leverage difficulty for error detection (Section 3.5, citing Zhou et al. 2024 and Carlini 2024), and self-estimation creating conflicts of optimization goals (Section 3.4)." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims to 'introduce the fundamental ideas and challenges of Predictable AI' and 'formally characterise predictability' — the paper delivers on both through its framework in Section 3 and illustrative examples." 114 }, 115 "causal_claims_justified": { 116 "applies": false, 117 "answer": false, 118 "justification": "The paper makes no causal claims. It defines a framework and discusses its properties theoretically." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper is careful to frame Predictable AI as a 'nascent research area' and its framework as a proposal. Claims are appropriately scoped as conceptual contributions rather than empirical findings." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": false, 127 "answer": false, 128 "justification": "This is a theoretical framework paper with no empirical results to explain." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": false, 132 "answer": false, 133 "justification": "Theoretical paper with no measurements." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": false, 139 "answer": false, 140 "justification": "No models are run in this paper. References to GPT-3 variants and GPT-4 are from cited prior work." 141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "No prompting is done in this paper." 
146 }, 147 "hyperparameters_reported": { 148 "applies": false, 149 "answer": false, 150 "justification": "No experiments are conducted." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used." 156 }, 157 "data_preprocessing_documented": { 158 "applies": false, 159 "answer": false, 160 "justification": "No data is collected or preprocessed." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no dedicated limitations section. Section 5 'Challenges and opportunities' discusses open questions but not limitations of the paper itself." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No threats to validity are discussed for the paper's own framework or claims." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper explicitly discusses scope: 'it may seem that full predictability is always desirable, yet there are a variety of situations in which it is not necessary or practical' (Section 1). Section 5 identifies open methodological questions and what has not been explored." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": false, 183 "answer": false, 184 "justification": "No original data is collected; this is a theoretical framework paper." 185 }, 186 "data_collection_described": { 187 "applies": false, 188 "answer": false, 189 "justification": "No original data collection." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No participants or data recruitment." 195 }, 196 "data_pipeline_documented": { 197 "applies": false, 198 "answer": false, 199 "justification": "No data pipeline exists." 
200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding acknowledgment section is present in the paper text provided." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed, including VRAIN (UPV), FAR.ai, University of Cambridge (multiple departments), Alan Turing Institute, ValGRAI, and Helmholtz Munich." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding information is disclosed, so independence cannot be assessed." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial disclosure statement is present." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This paper does not evaluate a pre-trained model on any benchmark." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "No model evaluation on benchmarks." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "No model evaluation on benchmarks." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this paper." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants." 
261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "Theoretical paper; no method with inference costs." 283 }, 284 "compute_budget_stated": { 285 "applies": false, 286 "answer": false, 287 "justification": "No computation is performed." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "More predictable AI systems should be preferred over less predictable ones with equivalent expected validity.", 294 "evidence": "Figure 1 illustrates six AI systems with equal 62.5% expected validity but different predictability profiles. Section 4 argues for Pareto trade-offs between validity and predictability.", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "Predictability is an essential precondition for trust, liability, control, alignment, and safety in AI.", 299 "evidence": "Section 2 argues each connection with citations to relevant literature (trust: ISO/IEC 2022; liability: Llorca et al. 2023; control: Beck et al. 2023; safety: Amodei et al. 2016).", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "External assessor models can predict LLM performance nearly as well as self-estimation, at lower cost.", 304 "evidence": "Table 5 shows Brier scores of 0.125-0.144 for the assessor vs 0.096-0.122 for self-estimation across GPT-3 variants (from Zhou et al. 2022). 
The assessor avoided 46% of failures by rejecting predictions below a 1% threshold.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "Human-estimated difficulty is a good predictor of LLM performance.", 309 "evidence": "Figure 2 and Section 3.5 cite Zhou et al. 2024 showing concordance between human difficulty ratings and LLM error patterns across GPT and LLaMA models.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "Scaling laws represent a predictable relationship between compute/data/parameters and model performance.", 314 "evidence": "Section 3.5 and Figure 3 cite Kaplan et al. 2020 showing power-law relationships, noting 'loss linearly decreasing with these parameters (log scale).'", 315 "supported": "strong" 316 } 317 ], 318 "methodology_tags": ["theoretical"], 319 "key_findings": "The paper introduces 'Predictable AI' as a formal research area focused on anticipating validity indicators of AI systems rather than just maximizing performance. It provides a mathematical framework (Equations 1-4) characterizing unpredictability in terms of predictor families, scoring rules, and ecosystem histories. Through illustrative scenarios from prior work, the paper demonstrates that external assessor models, human difficulty estimates, and scaling laws can serve as practical predictors of AI system validity. The authors argue that predictability should be prioritized alongside performance, particularly for general-purpose AI systems where full verification is infeasible.", 320 "red_flags": [ 321 { 322 "flag": "No original experiments", 323 "detail": "All empirical examples (Tables 4, 5, Figure 2, Figure 3) are reproduced from prior publications: mostly the authors' own (Zhou et al. 2022, Burnell et al. 2022, Zhou et al. 2024), plus the external Kaplan et al. 2020 for the scaling-law example. The paper is entirely a framework/position piece with no new empirical validation of its claims." 
324 }, 325 { 326 "flag": "Heavy self-citation", 327 "detail": "Many of the illustrative scenarios and empirical results come from the same research group's prior work (Zhou et al. 2022, Zhou et al. 2024, Burnell et al. 2022, Burden et al. 2023, Schellaert et al. 2024, Hernández-Orallo et al. 2022, etc.), which could overrepresent one perspective." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Reject before you run: Small assessors anticipate big language models", 333 "authors": ["Lexin Zhou", "Fernando Martínez-Plumed", "José Hernández-Orallo", "Cèsar Ferri", "Wout Schellaert"], 334 "year": 2022, 335 "relevance": "Directly relevant to LLM evaluation methodology — demonstrates external assessor models that predict LLM performance and enable cost-effective rejection rules." 336 }, 337 { 338 "title": "Larger and more instructable language models become less reliable", 339 "authors": ["Lexin Zhou", "Wout Schellaert", "Fernando Martínez-Plumed", "Yael Moros-Daval", "Cèsar Ferri", "José Hernández-Orallo"], 340 "year": 2024, 341 "doi": "10.1038/s41586-024-07930-y", 342 "relevance": "Studies concordance between human expectations and LLM errors, finding that larger models become less predictable — core to understanding AI reliability." 343 }, 344 { 345 "title": "Scaling laws for neural language models", 346 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B Brown", "Benjamin Chess", "Rewon Child", "Scott Gray", "Alec Radford", "Jeffrey Wu", "Dario Amodei"], 347 "year": 2020, 348 "arxiv_id": "2001.08361", 349 "relevance": "Foundational work on predicting LLM performance from compute, data, and parameter scaling — key example of predictable AI." 
350 }, 351 { 352 "title": "Predictability and surprise in large generative models", 353 "authors": ["Deep Ganguli", "Danny Hernandez", "Liane Lovitt"], 354 "year": 2022, 355 "relevance": "Studies when large generative models produce predictable vs surprising outputs, directly addressing the tension between novelty and predictability." 356 }, 357 { 358 "title": "Concrete problems in AI safety", 359 "authors": ["Dario Amodei", "Chris Olah", "Jacob Steinhardt", "Paul Christiano", "John Schulman", "Dan Mané"], 360 "year": 2016, 361 "arxiv_id": "1606.06565", 362 "relevance": "Foundational AI safety paper identifying key challenges including reward hacking and distributional shift, cited as motivation for predictability." 363 }, 364 { 365 "title": "Towards guaranteed safe AI: A framework for ensuring robust and reliable AI systems", 366 "authors": ["David Dalrymple", "Joar Skalse", "Yoshua Bengio", "Stuart Russell"], 367 "year": 2024, 368 "arxiv_id": "2405.06624", 369 "relevance": "Proposes a formal safety framework for AI systems; contrasted with Predictable AI's approach of predicting validity rather than guaranteeing it." 370 }, 371 { 372 "title": "Are emergent abilities of large language models a mirage?", 373 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 374 "year": 2024, 375 "relevance": "Questions whether emergent abilities are real or artifacts of metric choice — relevant to predictability of AI capabilities." 376 }, 377 { 378 "title": "Holistic evaluation of language models", 379 "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"], 380 "year": 2022, 381 "arxiv_id": "2211.09110", 382 "relevance": "Major LLM evaluation framework (HELM) cited as source of evaluation data that could train predictive models." 
383 }, 384 { 385 "title": "Observational scaling laws and the predictability of language model performance", 386 "authors": ["Yangjun Ruan", "Chris J Maddison", "Tatsunori Hashimoto"], 387 "year": 2024, 388 "arxiv_id": "2405.10938", 389 "relevance": "Studies predictability of LLM benchmark performance through observational scaling laws — directly extends the paper's core theme." 390 }, 391 { 392 "title": "GPT-4 technical report", 393 "authors": ["OpenAI"], 394 "year": 2023, 395 "arxiv_id": "2303.08774", 396 "relevance": "Key reference for scaling laws and performance prediction in frontier LLMs." 397 }, 398 { 399 "title": "Constitutional AI: Harmlessness from AI feedback", 400 "authors": ["Yuntao Bai"], 401 "year": 2022, 402 "arxiv_id": "2212.08073", 403 "relevance": "AI alignment method using AI feedback for safety — relevant to the predictability of safety outcomes." 404 }, 405 { 406 "title": "Language models (mostly) know what they know", 407 "authors": ["Saurav Kadavath"], 408 "year": 2022, 409 "arxiv_id": "2207.05221", 410 "relevance": "Studies LLM self-calibration and ability to estimate their own uncertainty — directly relevant to self-prediction of validity." 411 } 412 ] 413 }