scan.json (24075B)
1 { 2 "paper": { 3 "title": "Benchmarks for Automated Commonsense Reasoning: A Survey", 4 "authors": ["Ernest Davis"], 5 "year": 2023, 6 "venue": "ACM Computing Surveys", 7 "arxiv_id": "2302.04752", 8 "doi": "10.1145/3615355" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No source code or analysis scripts are released. The paper references a companion website (http://cs.nyu.edu/~davise/Benchmarks/) listing benchmarks, but no analysis code is provided." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The companion website provides an annotated list of commonsense benchmarks with descriptions, examples, sizes, construction methods, and links. The survey's data (the catalog of 139 benchmarks) is publicly accessible via this site." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No environment or dependency specifications are provided. This is a survey paper with no computational experiments requiring reproducible environments, but analysis scripts could still benefit from such specification." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No reproduction instructions are provided. The methodology for identifying and cataloging benchmarks is described in general terms but no step-by-step process for reproducing the survey is given." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": false, 36 "answer": false, 37 "justification": "This is a survey paper that does not run experiments or report quantitative results requiring confidence intervals." 38 }, 39 "significance_tests": { 40 "applies": false, 41 "answer": false, 42 "justification": "This is a survey paper that does not make statistical comparisons between systems or methods." 
43 }, 44 "effect_sizes_reported": { 45 "applies": false, 46 "answer": false, 47 "justification": "This is a survey paper that does not report its own experimental effect sizes." 48 }, 49 "sample_size_justified": { 50 "applies": false, 51 "answer": false, 52 "justification": "This is a survey paper without experiments requiring sample size justification." 53 }, 54 "variance_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "This is a survey paper that does not run experiments with multiple runs." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": false, 64 "justification": "The paper does not compare its survey approach against prior surveys. While it cites prior work on AI benchmarks generally (e.g., references [128, 75, 108, 118]), it does not systematically compare its coverage, methodology, or conclusions to these prior surveys." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": false, 69 "justification": "No systematic comparison to prior surveys of commonsense benchmarks is provided. The paper mentions prior surveys in passing but does not evaluate how its coverage or methodology compares." 70 }, 71 "ablation_study": { 72 "applies": false, 73 "answer": false, 74 "justification": "As a survey paper, there are no system components to ablate." 75 }, 76 "multiple_metrics": { 77 "applies": false, 78 "answer": false, 79 "justification": "This is a survey paper that does not evaluate systems using metrics." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "Human evaluation of system outputs is not applicable to a survey paper cataloging benchmarks." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is a survey paper without train/test splits." 
90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper provides detailed breakdowns of benchmarks by category: text-based (102), image-based (18), video-based (12), and simulated physical environments (7), with further subcategorization within BIG-bench (48 tagged as common sense) and tables (2-10) organizing benchmarks by type." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper extensively discusses failures and flaws in existing benchmarks. Section 4.1.1 documents defective examples with a detailed taxonomy (incorrect answers, multiple correct answers, ill-formed questions, etc.). Table 1 provides concrete examples, and Appendix A analyzes 17 flawed items from the first 100 examples of OpenBookQA." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 10 ('Uncovered forms of commonsense reasoning') extensively discusses what existing benchmarks fail to test. The paper's central conclusion is negative: 'we do not currently have any reliable way of measuring to what extent existing AI systems have achieved common sense.'" 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims (1) more than 100 benchmarks exist (paper catalogs 139), (2) benchmarks are often flawed (supported by extensive examples in Section 4, Table 1, Appendix A), and (3) many aspects of common sense remain untested (supported by Section 10). All claims are well-supported by the paper's content." 112 }, 113 "causal_claims_justified": { 114 "applies": false, 115 "answer": false, 116 "justification": "The paper makes no causal claims. It surveys, catalogs, and critically evaluates benchmarks without claiming causal relationships." 
117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper is appropriately scoped to commonsense reasoning benchmarks specifically, and explicitly notes at the start that 'the primary focus in this paper is therefore on issues that are specific to commonsense reasoning, rather than on problems common to AI benchmarks generally.' The scope is well-bounded throughout." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper extensively discusses alternative explanations for AI performance on benchmarks. Section 2.1 discusses artifacts (e.g., SNLI/MultiNLI exploiting annotation artifacts). Section 5.2 discusses how adversarial filtering can create misleading improvement measurements. Section 5.3 discusses issues with interpreting 'human-level' performance." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": false, 132 "answer": false, 133 "justification": "This is a survey paper that does not run experiments with AI models. The ChatGPT examples in Section 9 and Tables 18-19 are anecdotal illustrations, not systematic experiments." 134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "This is a survey paper that does not use prompting for experiments." 139 }, 140 "hyperparameters_reported": { 141 "applies": false, 142 "answer": false, 143 "justification": "This is a survey paper with no model experiments requiring hyperparameters." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used in this survey paper." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper does not document a systematic methodology for identifying and selecting the 139 benchmarks. 
The inclusion criterion is loosely stated: 'a benchmark is included in this collection if the authors describe it as a commonsense benchmarks' with 'a few exceptions.' No search strategy, databases queried, date ranges, or systematic filtering criteria are provided." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section. The paper includes recommendations (Section 11) but does not discuss the limitations of its own survey methodology." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to the validity of the survey itself are discussed. The paper does not acknowledge potential biases in its benchmark selection, the author's own involvement in several of the benchmarks reviewed (e.g., Winograd Schema Challenge), or the completeness of the catalog." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper explicitly states scope boundaries: it focuses on commonsense reasoning benchmarks specifically rather than AI benchmarks generally, and explicitly notes exclusions (e.g., 'datasets that the authors do not characterize as commonsensical and that are certainly only partially so have mostly been excluded'). Section 7.2 explains why mathematical word problem collections are excluded." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The annotated list of benchmarks is available on the companion website (http://cs.nyu.edu/~davise/Benchmarks/), allowing independent verification. Each benchmark entry includes name, description, examples, size, construction method, and links to the original paper and online site." 
178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": false, 182 "justification": "The data collection procedure for identifying the 139 benchmarks is not described in detail. There is no discussion of what databases were searched, what search terms were used, or how comprehensiveness was assessed. The inclusion criterion is stated only loosely." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited for this survey. The paper catalogs existing benchmarks rather than involving human subjects." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": false, 192 "justification": "No data pipeline from initial search to final catalog is documented. It is unclear how the author went from the space of all AI benchmarks to the final set of 139 commonsense benchmarks." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding information is disclosed. The acknowledgements section thanks individuals for feedback but does not mention any funding sources or grants." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "The author's affiliation (Dept. of Computer Science, New York University) is clearly stated at the beginning of the paper." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed, making it impossible to assess funder independence. The absence of a funding disclosure is itself a gap." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is provided. 
The author is a co-creator of the Winograd Schema Challenge (reference [82]) and has published extensively on commonsense reasoning, which represents a potential intellectual conflict of interest that is not disclosed." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "This is a survey paper that does not evaluate pre-trained models on benchmarks. The ChatGPT anecdotes in Section 9 are illustrative, not systematic evaluations." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "This is a survey paper. However, it notably discusses benchmark contamination as a concern for other researchers in Section 5.4, which is one of its strengths." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants were involved in this survey paper." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants were involved in this survey paper." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants were involved in this survey paper." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants were involved in this survey paper." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants were involved in this survey paper." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants were involved in this survey paper." 
264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants were involved in this survey paper." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "This is a survey paper. Cost reporting is not applicable as no method is proposed or evaluated." 276 }, 277 "compute_budget_stated": { 278 "applies": false, 279 "answer": false, 280 "justification": "This is a survey paper with no computational experiments requiring a compute budget." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "More than one hundred benchmarks have been developed to test commonsense knowledge and reasoning abilities of AI systems.", 287 "evidence": "The paper catalogs 139 benchmarks across tables 2-10: 102 text-based, 18 image-based, 12 video-based, and 7 simulated physical environments (Section 7, abstract).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Many commonsense benchmarks are significantly flawed, limiting their usefulness as measures.", 292 "evidence": "Section 4.1.1 and Table 1 provide numerous concrete examples of flaws (incorrect answers, multiple correct answers, encyclopedic rather than commonsense knowledge, ill-formed questions). Appendix A shows 17 of the first 100 OpenBookQA test items are substantially flawed. Sections 8.1-8.3 document quality issues in specific benchmarks including CommonsenseQA 2.0, HellaSwag, and McTaco.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Many significant aspects of commonsense reasoning remain untested by any existing benchmark.", 297 "evidence": "Section 10 identifies untested areas including temporal interval reasoning, hypothetical timelines, commonsense psychological reasoning (beyond ATOMIC), spatial knowledge, physical knowledge, biological knowledge, and social knowledge. 
The paper argues these are 'not covered more than extremely partially and haphazardly.'", 298 "supported": "strong" 299 }, 300 { 301 "claim": "For test sets, small and clean is better than large and noisy.", 302 "evidence": "Section 5.1 presents a detailed argument that statistical precision from large noisy datasets is illusory because the underlying quantity being measured is not defined to high precision, and clean datasets avoid penalizing systems for knowing too much (e.g., recognizing multiple correct answers). The author estimates expert vetting of a 500-example set requires about 5.5 hours.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "Adversarial filtering in benchmark construction can exaggerate measured improvement between AI systems.", 307 "evidence": "Section 5.2 argues that removing questions system X answers correctly biases measurement in favor of any later system Y that answers different questions, even if Y is not overall better. The paper cites concrete evidence of retrogression between GPT-3 versions (Section 5.2, reference [34]).", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["meta-analysis", "qualitative"], 312 "key_findings": "This survey catalogs 139 commonsense reasoning benchmarks (102 text-based, 18 image-based, 12 video-based, 7 simulated environments) and finds that many are significantly flawed, with defective examples including incorrect answers, multiple correct answers, and reliance on encyclopedic rather than commonsense knowledge. The paper identifies substantial gaps in benchmark coverage, particularly in temporal interval reasoning, spatial and physical knowledge, psychological reasoning, and social commonsense. 
The author argues that small, carefully vetted test sets are preferable to large noisy ones, and that no existing benchmark reliably measures whether AI systems have achieved commonsense reasoning abilities.", 313 "red_flags": [ 314 { 315 "flag": "No systematic search methodology", 316 "detail": "The paper does not describe a systematic literature search process (databases, search terms, date ranges, inclusion/exclusion criteria with counts) for identifying the 139 benchmarks. The inclusion criterion is loosely stated as self-description by benchmark authors, with 'a few exceptions' at the surveyor's discretion. This makes the survey's comprehensiveness unverifiable." 317 }, 318 { 319 "flag": "No quality assessment framework for reviewed benchmarks", 320 "detail": "While the paper provides insightful qualitative analysis of individual benchmarks, there is no systematic quality assessment rubric applied consistently across all 139 benchmarks. Most benchmarks get only a brief table entry; 12 get detailed treatment. The selection of which 12 to examine in detail is not justified by explicit criteria." 321 }, 322 { 323 "flag": "Author conflict of interest not disclosed", 324 "detail": "Ernest Davis is a co-creator of the Winograd Schema Challenge (reference [82]), one of the most prominent benchmarks surveyed. While the paper notes 'Having reviewed the Winograd Schema Challenge in detail elsewhere, we deliberately omit it here' (Section 8), the intellectual conflict is not formally disclosed, and the author's own benchmarks and test examples appear throughout the paper." 325 }, 326 { 327 "flag": "No limitations section", 328 "detail": "The survey lacks any discussion of its own limitations, such as potential selection bias in benchmark inclusion, English-language bias in the survey itself, or the completeness of the catalog." 
329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capacities of Language Models", 334 "authors": ["Aarohi Srivastava"], 335 "year": 2022, 336 "arxiv_id": "2206.04615", 337 "relevance": "BIG-bench is a major multi-task benchmark collection (212 datasets) for evaluating LLM capabilities, directly relevant to evaluating LLM evaluation methodology." 338 }, 339 { 340 "title": "Holistic Evaluation of Language Models", 341 "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"], 342 "year": 2022, 343 "arxiv_id": "2211.09110", 344 "relevance": "HELM is a comprehensive LLM evaluation framework that selected commonsense benchmarks as part of its evaluation suite, directly relevant to LLM evaluation methodology." 345 }, 346 { 347 "title": "Language Models are Few-Shot Learners", 348 "authors": ["Tom Brown"], 349 "year": 2020, 350 "relevance": "GPT-3 paper that established the foundation model paradigm affecting how benchmarks are used and the challenge of test set contamination." 351 }, 352 { 353 "title": "On the Opportunities and Risks of Foundation Models", 354 "authors": ["Rishi Bommasani"], 355 "year": 2021, 356 "arxiv_id": "2108.07258", 357 "relevance": "Foundational paper on foundation models that discusses risks and evaluation challenges relevant to understanding LLM capabilities." 358 }, 359 { 360 "title": "Annotation Artifacts in Natural Language Inference Data", 361 "authors": ["Suchin Gururangan", "Swabha Swayamdipta", "Omer Levy"], 362 "year": 2018, 363 "arxiv_id": "1803.02324", 364 "relevance": "Demonstrates how annotation artifacts in SNLI/MultiNLI allow solving problems without seeing premises, directly relevant to benchmark quality and evaluation methodology." 
365 }, 366 { 367 "title": "Reduced, Reused and Recycled: The Life of a Dataset in Machine Learning Research", 368 "authors": ["Bernard Koch", "Emily Denton", "Alex Hanna"], 369 "year": 2021, 370 "arxiv_id": "2112.01716", 371 "relevance": "Studies dataset reuse patterns in ML research, relevant to understanding benchmark methodology and dataset quality issues." 372 }, 373 { 374 "title": "Targeting the Benchmark: On Methodology in Current Natural Language Processing Research", 375 "authors": ["David Schlangen"], 376 "year": 2020, 377 "arxiv_id": "2007.04792", 378 "relevance": "Critical analysis of benchmark-driven NLP research methodology, directly relevant to questions about whether benchmarks measure what they claim to measure." 379 }, 380 { 381 "title": "Data and Its (Dis)contents: A Survey of Dataset Development and Use in Machine Learning Research", 382 "authors": ["Amandalynne Paullada", "Inioluwa Deborah Raji", "Emily M. Bender"], 383 "year": 2021, 384 "relevance": "Survey of dataset development practices in ML research, relevant to understanding data quality and documentation issues in AI evaluation." 385 }, 386 { 387 "title": "Theory of Mind may have Spontaneously Emerged in Large Language Models", 388 "authors": ["Michal Kosinski"], 389 "year": 2023, 390 "arxiv_id": "2302.02083", 391 "relevance": "Controversial claim about emergent LLM capabilities that the survey critiques as an example of overclaiming based on test contamination, relevant to LLM evaluation methodology." 392 }, 393 { 394 "title": "How Not to Test GPT-3", 395 "authors": ["Gary Marcus", "Ernest Davis"], 396 "year": 2023, 397 "relevance": "Critique of methodology for testing LLM capabilities showing that GPT-3's apparent theory of mind reflects memorization rather than understanding." 
398 }, 399 { 400 "title": "Winogrande: An Adversarial Winograd Schema Challenge at Scale", 401 "authors": ["Keisuke Sakaguchi", "Ronan Le Bras", "Chandra Bhagavatula"], 402 "year": 2021, 403 "relevance": "Major adversarially-constructed commonsense benchmark demonstrating adversarial filtering methodology for LLM evaluation." 404 }, 405 { 406 "title": "HellaSwag: Can a Machine Really Finish Your Sentence?", 407 "authors": ["Rowan Zellers", "Ari Holtzman", "Yonatan Bisk"], 408 "year": 2019, 409 "arxiv_id": "1905.07830", 410 "relevance": "Widely-used commonsense benchmark for LLM evaluation, selected as part of HELM, demonstrating adversarial generation methodology." 411 } 412 ] 413 }