scan.json (22819B)
1 { 2 "paper": { 3 "title": "CORE: Comprehensive Ontological Relation Evaluation for Large Language Models", 4 "authors": ["Satyam Dwivedi", "Sanjukta Ghosh", "Shivam Dwivedi", "Nishi Kumari", "Anil Thakur", "Anurag Purushottam", "Deepak Alok", "Praveen Gatla", "Manjuprasad B", "Bipasha Patgiri"], 5 "year": 2026, 6 "venue": "" 7 }, 8 "checklist": { 9 "artifacts": { 10 "code_released": { 11 "applies": true, 12 "answer": true, 13 "justification": "Footnote 1 states 'data and code are hosted at Hugging Face and GitHub' and references core.vaikhari.ai." 14 }, 15 "data_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper states the benchmark and dataset are available at core.vaikhari.ai with data hosted on Hugging Face." 19 }, 20 "environment_specified": { 21 "applies": true, 22 "answer": false, 23 "justification": "No environment specifications, requirements.txt, or dependency details are provided. The paper mentions 'identical hardware specifications' for open-source models but does not specify them." 24 }, 25 "reproduction_instructions": { 26 "applies": true, 27 "answer": false, 28 "justification": "No step-by-step reproduction instructions are provided in the paper. The prompt is given in Appendix A but no instructions for running the evaluation pipeline." 29 } 30 }, 31 "statistical_methodology": { 32 "confidence_intervals_or_error_bars": { 33 "applies": true, 34 "answer": false, 35 "justification": "Results are reported as point estimates (e.g., '48.25–70.9% overall accuracy') with no confidence intervals or error bars." 36 }, 37 "significance_tests": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper claims systematic differences between related and unrelated pair performance but uses no statistical significance tests to support these comparisons." 41 }, 42 "effect_sizes_reported": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper reports effect sizes in context: accuracy on related pairs (86.5–100%) vs unrelated pairs (0–41.35%), ECE increase of 2–4x on unrelated pairs, and semantic collapse rate of 37.6%. These provide magnitude with baseline context." 46 }, 47 "sample_size_justified": { 48 "applies": true, 49 "answer": false, 50 "justification": "The benchmark has 203 questions (102 open, 101 blind). No justification is given for why this sample size is sufficient, nor is any power analysis provided." 51 }, 52 "variance_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "No variance, standard deviation, or spread measures are reported. Each model appears to have been run once with deterministic settings, with no repeated runs." 56 } 57 }, 58 "evaluation_design": { 59 "baselines_included": { 60 "applies": true, 61 "answer": true, 62 "justification": "A human baseline from 1,000+ participants is included (Table 1), achieving 92.6% overall accuracy. 29 LLMs are compared against each other and against this human baseline." 63 }, 64 "baselines_contemporary": { 65 "applies": true, 66 "answer": true, 67 "justification": "The 29 models include contemporary frontier models with cutoff date January 22, 2026, including GPT-5.2, Claude-Opus-4.5, Gemini-3-flash, Llama-4-scout, etc." 68 }, 69 "ablation_study": { 70 "applies": false, 71 "answer": false, 72 "justification": "This is a benchmark evaluation paper, not a system with components to ablate." 73 }, 74 "multiple_metrics": { 75 "applies": true, 76 "answer": true, 77 "justification": "Multiple metrics are used: accuracy, balanced accuracy, ECE, overconfident error rate, and semantic collapse rate (Section 4.3)." 78 }, 79 "human_evaluation": { 80 "applies": true, 81 "answer": true, 82 "justification": "Human baseline evaluation from 1,000+ participants is included, establishing human performance on the benchmark (Section 3.4, Table 1)." 83 }, 84 "held_out_test_set": { 85 "applies": true, 86 "answer": true, 87 "justification": "The benchmark includes a blind subset of 101 questions 'withheld for internal analysis and validation' (Section 3.1), separate from the 102 open questions." 88 }, 89 "per_category_breakdown": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are broken down by related vs. unrelated pairs (Tables 3, 4), by difficulty level (Table 5), and per-relation performance is discussed in Section 5.4." 93 }, 94 "failure_cases_discussed": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section 5.3 provides a qualitative failure example ('Hospital is to flying as wolf is to_?') showing how models construct spurious relational narratives. Section 6.3 discusses confidence-coherence misalignment." 98 }, 99 "negative_results_reported": { 100 "applies": true, 101 "answer": true, 102 "justification": "The entire paper reports a negative result: LLMs systematically fail on unrelated pairs (0–41.35% accuracy). Section 5.5 reports further degradation to ~2% on the 225K dataset." 103 } 104 }, 105 "claims_and_evidence": { 106 "abstract_claims_supported": { 107 "applies": true, 108 "answer": true, 109 "justification": "Abstract claims (48.25–70.9% accuracy, 0–41.35% on unrelated, 37.6% semantic collapse rate, ~2% on 225K dataset) are supported by results in Sections 5.1–5.5 and Tables 3–5." 110 }, 111 "causal_claims_justified": { 112 "applies": true, 113 "answer": false, 114 "justification": "Section 6.2 makes causal claims about architectural and training objective biases causing failures ('The observed failures likely arise from the interaction between model architecture, training objectives, and evaluation formulation'). These are speculative explanations without controlled experiments to support them." 115 }, 116 "generalization_bounded": { 117 "applies": true, 118 "answer": true, 119 "justification": "Section 8 (Limitations) explicitly bounds scope: English-only, multiple-choice format only, text-only evaluation. Claims are hedged appropriately (e.g., 'suggests' rather than definitive statements)." 120 }, 121 "alternative_explanations_discussed": { 122 "applies": true, 123 "answer": true, 124 "justification": "Section 6.2 discusses multiple contributing factors (architecture, training objectives, evaluation formulation). Section 6.3 discusses confidence-coherence misalignment as an alternative interpretation. The paper acknowledges it cannot establish definitive causal mechanisms." 125 } 126 }, 127 "setup_transparency": { 128 "model_versions_specified": { 129 "applies": true, 130 "answer": false, 131 "justification": "Table 2 lists models by marketing names (e.g., 'Claude-Opus-4.5', 'GPT-5.2', 'Gemini-3-flash') without specific API versions or snapshot dates." 132 }, 133 "prompts_provided": { 134 "applies": true, 135 "answer": true, 136 "justification": "Appendix A provides the full CORE_PROMPT template used for model inference, including the complete JSON response format and field specifications. The placeholders {question_text} and {options_block} are filled from the benchmark questions which are also released." 137 }, 138 "hyperparameters_reported": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper states 'deterministic inference with recommended inference settings' for proprietary models and 'default configurations' for open-source models, but does not specify temperature, top-p, or other specific hyperparameter values." 142 }, 143 "scaffolding_described": { 144 "applies": false, 145 "answer": false, 146 "justification": "No agentic scaffolding is used. Models are evaluated with single-turn prompting." 147 }, 148 "data_preprocessing_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 3.4 describes the validation pipeline: 250 initial questions → three-pass expert review → 203 questions with perfect inter-annotator agreement (Cohen's κ = 1.0). The question design and relation type selection process is documented in Sections 3.1–3.3." 152 } 153 }, 154 "limitations_and_scope": { 155 "limitations_section_present": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 8 'Limitations' provides a dedicated limitations discussion covering language scope, format limitations, and text-only evaluation constraints." 159 }, 160 "threats_to_validity_specific": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 8 discusses specific threats: English-only evaluation may reflect language-specific properties, multiple-choice format may scaffold performance differently than open-ended generation, and text-only evaluation excludes multimodal reasoning." 164 }, 165 "scope_boundaries_stated": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 8 explicitly states what was NOT tested: non-English languages, open-ended generation, multimodal settings. These are specific exclusions rather than generic disclaimers." 169 } 170 }, 171 "data_integrity": { 172 "raw_data_available": { 173 "applies": true, 174 "answer": true, 175 "justification": "The benchmark data and 225K dataset are stated to be available on Hugging Face and at core.vaikhari.ai." 176 }, 177 "data_collection_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "Sections 3.1–3.4 describe the dataset construction: 225K MCQs spanning 74 disciplines, 203-question benchmark subset, question format, relation type selection based on ontological frameworks, and expert validation process." 181 }, 182 "recruitment_methods_described": { 183 "applies": true, 184 "answer": false, 185 "justification": "The human baseline used 'over 1,000 participants in India, spanning undergraduate to postdoctoral education levels' but does not describe how participants were recruited, through what channels, or whether recruitment could introduce bias." 186 }, 187 "data_pipeline_documented": { 188 "applies": true, 189 "answer": true, 190 "justification": "The pipeline from 250 initial questions through three-pass expert review to 203 final questions with Cohen's κ = 1.0 is documented in Section 3.4. The benchmark split into open (102) and blind (101) subsets is described." 191 } 192 }, 193 "conflicts_of_interest": { 194 "funding_disclosed": { 195 "applies": true, 196 "answer": false, 197 "justification": "No funding sources are disclosed anywhere in the paper. The Acknowledgments section thanks participants and academicians but does not mention funding." 198 }, 199 "affiliations_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "Author affiliations are listed: Vaikhari AI, IIT BHU, IIT Delhi, BHU, GSSSIETW Mysore, and Tezpur University. The lead author is affiliated with Vaikhari AI, which hosts the benchmark at core.vaikhari.ai." 203 }, 204 "funder_independent_of_outcome": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding is disclosed, so independence cannot be assessed. The lead author is from Vaikhari AI which hosts the benchmark, creating a potential conflict of interest that is not acknowledged." 208 }, 209 "financial_interests_declared": { 210 "applies": true, 211 "answer": false, 212 "justification": "No competing interests statement is present. The lead author's affiliation with Vaikhari AI (which hosts the benchmark) is a potential financial interest that is not declared." 213 } 214 }, 215 "contamination": { 216 "training_cutoff_stated": { 217 "applies": true, 218 "answer": false, 219 "justification": "The paper states a model selection cutoff date of January 22, 2026, but does not state training data cutoff dates for the individual models evaluated." 220 }, 221 "train_test_overlap_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 3.1 discusses contamination mitigation: 'To mitigate evaluation contamination and overfitting risks, different portions serve different purposes.' The blind subset (101 questions) is withheld to reduce contamination risk." 225 }, 226 "benchmark_contamination_addressed": { 227 "applies": true, 228 "answer": true, 229 "justification": "The benchmark is newly created, and the paper takes steps to mitigate contamination by splitting into open/blind subsets and separating training-purpose data from evaluation data (Section 3.1)." 230 } 231 }, 232 "human_studies": { 233 "pre_registered": { 234 "applies": true, 235 "answer": false, 236 "justification": "The human baseline study with 1,000+ participants has no mention of pre-registration." 237 }, 238 "irb_or_ethics_approval": { 239 "applies": true, 240 "answer": false, 241 "justification": "No IRB or ethics approval is mentioned for the human baseline study involving 1,000+ participants." 242 }, 243 "demographics_reported": { 244 "applies": true, 245 "answer": false, 246 "justification": "Participants are described only as 'over 1,000 participants in India, spanning undergraduate to postdoctoral education levels.' No detailed demographics (age, gender, field of study, etc.) are reported." 247 }, 248 "inclusion_exclusion_criteria": { 249 "applies": true, 250 "answer": false, 251 "justification": "No inclusion or exclusion criteria are stated for the human baseline participants beyond education level range." 252 }, 253 "randomization_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is not a between-subjects experimental study; all participants took the same benchmark. Randomization is not applicable." 257 }, 258 "blinding_described": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section 3.4 states participants 'completed the benchmark under blind evaluation conditions.'" 262 }, 263 "attrition_reported": { 264 "applies": true, 265 "answer": false, 266 "justification": "No information on how many participants started vs. finished or whether any were excluded from analysis." 267 } 268 }, 269 "cost_and_practicality": { 270 "inference_cost_reported": { 271 "applies": true, 272 "answer": false, 273 "justification": "The Ethical Considerations section acknowledges 'significant computational cost and associated carbon emissions' but does not quantify inference costs, API costs, or per-model evaluation time." 274 }, 275 "compute_budget_stated": { 276 "applies": true, 277 "answer": false, 278 "justification": "No compute budget, GPU hours, API spend, or hardware details are stated despite evaluating 29 models on 203+ questions." 279 } 280 } 281 }, 282 "claims": [ 283 { 284 "claim": "29 state-of-the-art LLMs achieve 48.25–70.9% overall accuracy on the CORE benchmark, with near-ceiling performance on related pairs (86.5–100%) but severe degradation on unrelated pairs (0–41.35%).", 285 "evidence": "Table 3 and Section 5.1 report these accuracy ranges across 29 models.", 286 "supported": "strong" 287 }, 288 { 289 "claim": "Models maintain similar confidence (~92–94%) on both related and unrelated pairs despite 40–80 percentage point accuracy differences.", 290 "evidence": "Table 3 reports confidence ranges of ~93–95% for related and ~91–94% for unrelated pairs. Section 5.1 and Figure 1 illustrate this.", 291 "supported": "strong" 292 }, 293 { 294 "claim": "Expected Calibration Error increases 2–4x on unrelated pairs compared to related pairs.", 295 "evidence": "Table 4 shows ECE of ~8.0–15.0% for related pairs vs ~24.0–51.0% for unrelated pairs. Section 5.2 and Figure 2.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "The mean semantic collapse rate of 37.6% indicates systematic generation of spurious relations rather than random guessing.", 300 "evidence": "Section 5.3 reports this rate and contrasts it with the 75% expected error from random guessing.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "On the 225K MCQ CORE dataset, accuracy drops to approximately 2%.", 305 "evidence": "Section 5.5 states this but provides no detailed breakdown, model-by-model results, or methodology specifics for this evaluation.", 306 "supported": "weak" 307 }, 308 { 309 "claim": "The failure reflects a shared limitation in how current models handle relation absence, potentially from architectural inductive biases.", 310 "evidence": "Section 6.1–6.2 provides speculative architectural and training-objective explanations without controlled experiments to support them.", 311 "supported": "weak" 312 } 313 ], 314 "methodology_tags": ["benchmark-eval"], 315 "key_findings": "CORE reveals a dramatic asymmetry in LLM reasoning: 29 frontier models achieve 86.5–100% accuracy on related concept pairs but only 0–41.35% on unrelated pairs, while maintaining similar confidence (~92–94%) in both cases. This confidence-accuracy inversion is accompanied by ECE increases of 2–4x on unrelated pairs and a 37.6% mean semantic collapse rate, where models fabricate plausible but spurious relational structures. A human baseline of 1,000+ participants achieves 92.6% overall and 95.1% on unrelated pairs, demonstrating the task is not inherently difficult for humans.", 316 "red_flags": [ 317 { 318 "flag": "Small benchmark size", 319 "detail": "The core benchmark contains only 203 questions (102 open, 101 blind). Performance differences across models on such a small test set may not be statistically meaningful, and no significance tests or confidence intervals are provided." 320 }, 321 { 322 "flag": "Vague 225K dataset evaluation", 323 "detail": "The claim of ~2% accuracy on the 225K dataset (Section 5.5) is stated without detailed methodology, model-by-model results, or explanation of how this evaluation was conducted. This is a dramatic claim with minimal supporting detail." 324 }, 325 { 326 "flag": "Potential conflict of interest", 327 "detail": "The lead author is affiliated with Vaikhari AI, which hosts the benchmark at core.vaikhari.ai. This potential commercial interest in the benchmark's prominence is not disclosed or discussed." 328 }, 329 { 330 "flag": "Missing hyperparameters", 331 "detail": "The paper uses 'recommended inference settings' and 'default configurations' without specifying actual temperature, top-p, or other parameters. Different settings could substantially affect results." 332 }, 333 { 334 "flag": "Human baseline methodology gaps", 335 "detail": "The 1,000+ participant human baseline lacks IRB approval, detailed demographics, recruitment methods, inclusion criteria, and attrition reporting. The participants are all from India, which may limit generalizability of the human baseline." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 341 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 342 "year": 2023, 343 "relevance": "Examines whether emergent abilities in LLMs are measurement artifacts, directly relevant to interpreting benchmark performance discontinuities." 344 }, 345 { 346 "title": "AbstentionBench: Reasoning LLMs Fail on Unanswerable Questions", 347 "authors": ["Polina Kirichenko", "Mark Ibrahim", "Kamalika Chaudhuri", "Samuel J. Bell"], 348 "year": 2025, 349 "arxiv_id": "2506.09038", 350 "relevance": "Evaluates LLM ability to abstain from answering, closely related to CORE's evaluation of unrelatedness recognition." 351 }, 352 { 353 "title": "SimpleQA: Measuring Short-form Factuality in Large Language Models", 354 "authors": ["Jason Wei"], 355 "year": 2024, 356 "arxiv_id": "2411.04368", 357 "relevance": "Evaluates LLM factuality and refusal rates, complementary to CORE's assessment of spurious relation generation." 358 }, 359 { 360 "title": "Towards Understanding Sycophancy in Language Models", 361 "authors": ["Mrinank Sharma", "Meg Tong", "Tomasz Korbak"], 362 "year": 2023, 363 "arxiv_id": "2310.13548", 364 "relevance": "Studies sycophancy in LLMs, relevant to understanding why models fabricate relational structures rather than expressing uncertainty." 365 }, 366 { 367 "title": "Language Models Don't Always Say What They Think: Unfaithful Explanations in Chain-of-Thought Prompting", 368 "authors": ["Miles Turpin", "Julian Michael", "Ethan Perez", "Samuel Bowman"], 369 "year": 2023, 370 "relevance": "Examines unfaithful reasoning in LLMs, relevant to CORE's finding of coherent but incorrect relational explanations." 371 }, 372 { 373 "title": "Holistic Evaluation of Language Models", 374 "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"], 375 "year": 2022, 376 "arxiv_id": "2211.09110", 377 "relevance": "Major LLM evaluation framework; CORE extends evaluation to unrelatedness reasoning as a new dimension." 378 }, 379 { 380 "title": "Language Models (Mostly) Know What They Know", 381 "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"], 382 "year": 2022, 383 "arxiv_id": "2207.05221", 384 "relevance": "Studies LLM calibration and self-knowledge, directly relevant to CORE's finding of systematic miscalibration on unrelated pairs." 385 }, 386 { 387 "title": "The Reversal Curse: LLMs trained on 'A is B' fail to learn 'B is A'", 388 "authors": ["Lukas Berglund", "Meg Tong", "Max Kaufmann"], 389 "year": 2024, 390 "relevance": "Demonstrates systematic reasoning failures in LLMs related to relational knowledge, complementary to CORE's findings." 391 }, 392 { 393 "title": "GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language models", 394 "authors": ["Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi"], 395 "year": 2025, 396 "relevance": "Exposes limitations in LLM reasoning through controlled benchmark variations, similar methodology to CORE's approach." 397 }, 398 { 399 "title": "Emergent Analogical Reasoning with Large Language Models", 400 "authors": ["Taylor Webb", "Keith J. Holyoak", "Hongjing Lu"], 401 "year": 2023, 402 "relevance": "Evaluates LLM analogical reasoning capability, directly related to CORE's analogy-based evaluation format." 403 } 404 ] 405 }