scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18580B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of LLMs as Mathematical Problem Solvers",
      6     "authors": [
      7       "Qintong Li",
      8       "Leyang Cui",
      9       "Xueliang Zhao",
     10       "Lingpeng Kong",
     11       "Wei Bi"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2402.19255",
     16     "doi": "10.48550/arXiv.2402.19255"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims (up to 20% gap, 25 LLMs tested, human performance unaffected) are all directly supported by Table 4 and Figure 6 results.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Claims like 'task-specific optimization… gives better accuracy while robustness depends more on base model and fine-tuning dataset' are stated causally but derived from cross-model comparisons where models differ on multiple confounding dimensions simultaneously.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Limitations section explicitly bounds scope to grade school level math; broader generalizations about LLM robustness are appropriately hedged to math word problems on GSM8K variants.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper acknowledges GPT-4 generation bias (Table 3) but does not discuss whether performance drops reflect increased task difficulty rather than robustness failures, or whether critical thinking is a different task type entirely rather than a harder version of the same task.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly measures answer accuracy on perturbed questions and defines specific metrics (PDR, ASP) that operationalize robustness; the paper does not conflate these with deeper understanding.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated Limitations section lists three explicit limitations: scope limited to elementary-level math, solution chain accuracy not assessed, and no investigation of failure causes.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations are specific but miss key threats: potential data contamination in model training is not addressed, moderate inter-annotator agreement (Krippendorff's α=0.567) is not discussed as a threat, and the prompting experiment's use of only 120 of the 1,319 seed questions is not flagged.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states it covers only grade school math, only the listed perturbation types, and does not investigate solution chain accuracy or underlying failure reasons.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding from NSFC-RGC joint scheme (grant N_HKU714/21) is disclosed in the Acknowledgements section.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations (HKU and Tencent AI Lab) are clearly stated on the first page; the internship relationship of two authors at Tencent is also footnoted.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "NSFC and RGC are government research funding bodies with no stake in LLM benchmark outcomes; Tencent affiliation is disclosed but the paper evaluates third-party models.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present beyond the funding acknowledgment.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Robustness is operationalized through PDR and ASP metrics (defined with formulas), and all 8 perturbation types are precisely defined in Table 6 with formal descriptions.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states it contributes GSM-Plus—an adversarial benchmark dataset with 10,552 question variations across 8 perturbation types—and a systematic evaluation of 25 LLMs.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 provides a structured comparison of GSM-Plus against six prior datasets, explicitly noting which perturbation types existing work covers and where GSM-Plus extends coverage.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper grounds the 5 perturbation perspectives in Polya's four-stage problem-solving framework, arguing these dimensions capture distinct cognitive capabilities required for mathematical reasoning.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper provides GPT-4 pass rates by perturbation type (Table 7) but does not formally characterize difficulty tiers (easy/medium/hard) or measure item-level difficulty for the 10,552 questions.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Floor effects are implicitly visible (near-0% accuracy for most models on critical thinking in Figure 6) but the paper does not explicitly discuss these as a discriminability concern or check ceiling effects for the benchmark overall.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Human performance is reported in Table 4 (96.77% on GSM8K, 98.75% on GSM-Plus, ASP of 92.11%), established using qualified annotators with bachelor's degrees and a qualifying exam.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Exact-match accuracy is the standard for GSM8K; PDR and ASP are formally defined with equations; the critical thinking evaluation provides an explicit list of accepted expressions for the 'no valid answer' case.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper cites contamination concerns (Golchin and Surdeanu, 2023) but implements no anti-contamination measures such as temporal splits, canary strings, or dynamic generation in GSM-Plus itself.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss whether future models could overfit to the fixed 10,552 variations or become obsolete, and provides no update mechanism or versioning plan.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The limitations section notes what GSM-Plus does not measure (solution chains, higher-level math, failure causes), but the paper does not discuss failure modes of the benchmark itself—e.g., that rephrased questions may be gamed by GPT-4-family models.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Full prompts for all four prompting methods (CoT, LTM, COMP, SC) are provided in Appendix C.4; the paper states the dataset and evaluation suite will be released at the project website.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper documents GPT-4 generation prompts, human annotation process (qualifying exam, batch process, 10% cross-annotation, IAA α=0.567), and per-perturbation pass rates in Table 7.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper states the dataset will be released but does not specify licensing terms, usage restrictions, or access conditions beyond the project website URL.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The paper specifies GSM-Plus is intended for evaluating robustness of LLMs on grade school math word problems and explicitly states it should not be used to assess higher-level math reasoning or solution chain quality.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "LLMs exhibit a performance gap of up to 20% between GSM8K accuracy and GSM-Plus accuracy, while human performance is unaffected (PDR of -2.05%)",
    203       "evidence": "Table 4: GPT-4 drops from 93.25% to 85.58% (PDR 8.23%), GPT-3.5 from 73.62% to 61.19% (PDR 16.88%); humans show 96.77% to 98.75%",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Critical thinking and arithmetic variation are the most challenging perturbation types, with most open-source models showing near-complete failure on critical thinking",
    208       "evidence": "Figure 6 shows ~100% PDR for most open-source models on critical thinking; Table 8 shows near-0% accuracy for many models on critical thinking variations",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Math-specific fine-tuning improves benchmark accuracy but does not proportionally improve robustness (PDR remains similar between foundation and SFT models)",
    213       "evidence": "Table 4: LLaMA-2-13B PDR 34.76%, Abel-13B PDR 31.97%, MAmmoTH-13B PDR 34.58% — minimal improvement despite large accuracy gains",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Larger models generally exhibit lower PDR, with base model quality being more important than fine-tuning dataset for robustness",
    218       "evidence": "Abel series: PDR 37.67% (7B) → 31.97% (13B) → 28.52% (70B); MetaMath-Mistral-7B (PDR 27.69%) outperforms larger LLaMA-2-based models",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "The compositional prompting method COMP improves robustness over standard CoT, particularly on critical thinking variations",
    223       "evidence": "Table 5: COMP improves critical thinking from 40.83% (CoT) to 54.17%; COMP+SC to 55.83% on GPT-3.5-Turbo on a 120-question subset",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Program-based models are more robust to numerical variation but less robust to distractor insertion than NL-based models",
    228       "evidence": "Section 5.2: MAmmoTH-Coder-13B PDR 11.80% vs Abel-13B 14.03% for numerical variation; reversed for distractor insertion (23.2% vs 16.1%)",
    229       "supported": "weak"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "benchmark-creation"
    235   ],
    236   "key_findings": "GSM-Plus reveals that all 25 tested LLMs show substantial performance degradation (PDR 8–40%) when GSM8K questions are perturbed across 8 variation types, while human annotators show negative PDR (-2.05%), suggesting LLMs rely on superficial patterns rather than genuine mathematical understanding. Critical thinking—requiring models to recognize missing information—is the most catastrophic failure mode, with most open-source models collapsing to near-0% accuracy. Math fine-tuning consistently improves absolute accuracy but shows minimal benefit for robustness, while base model quality and fine-tuning dataset composition matter more than the SFT process itself. None of the four investigated prompting techniques achieves robust performance across all variation types, though the proposed compositional prompting method COMP modestly improves performance, especially on critical thinking.",
    237   "red_flags": [
    238     {
    239       "flag": "GPT-4 self-preference bias",
    240       "detail": "Question variations are generated by GPT-4 and the primary bias check (Table 3) only compares GPT-4's performance on its own generated questions vs. human-corrected questions — it does not test whether GPT-4-family models systematically outperform on GPT-4-reworded questions vs. true adversarial rephrasing."
    241     },
    242     {
    243       "flag": "Prompting experiments on 9% subset",
    244       "detail": "The prompting analysis (Section 5.4) uses only 120 seed questions (960 variations) out of 1,319, with no justification for why this subset is representative and no power calculation."
    245     },
    246     {
    247       "flag": "Moderate inter-annotator agreement",
    248       "detail": "Krippendorff's α = 0.567 is only moderate reliability; the paper does not discuss implications for annotation quality or how disagreements were resolved beyond 'authors manually review'."
    249     },
    250     {
    251       "flag": "No contamination resistance",
    252       "detail": "The benchmark is a fixed public dataset with no temporal splits or dynamic generation, making it susceptible to future model training on GSM-Plus variations, yet the paper provides no mitigation strategy."
    253     },
    254     {
    255       "flag": "Critical thinking as different task type",
    256       "detail": "The 'critical thinking' perturbation fundamentally changes the answer type (from numerical to 'no valid answer'), making it a different task rather than a variation of the original, which confounds robustness measurement with task-type generalization."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "Training verifiers to solve math word problems (GSM8K)",
    262       "relevance": "Parent dataset that GSM-Plus extends; provides the 1,319 seed questions"
    263     },
    264     {
    265       "title": "Are NLP models really able to solve simple math word problems? (SVAMP)",
    266       "relevance": "Prior robustness evaluation benchmark for grade school math; direct predecessor"
    267     },
    268     {
    269       "title": "Large language models can be easily distracted by irrelevant context (GSM-IC)",
    270       "relevance": "Prior work on distractor insertion perturbations; GSM-Plus extends this with 7 additional perturbation types"
    271     },
    272     {
    273       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    274       "relevance": "Baseline prompting method evaluated across all perturbation types in GSM-Plus"
    275     },
    276     {
    277       "title": "MetaMath: Bootstrap your own mathematical questions for large language models",
    278       "relevance": "Evaluated SFT model; also directly relevant as question bootstrapping approach"
    279     },
    280     {
    281       "title": "MAmmoTH: Building math generalist models through hybrid instruction tuning",
    282       "relevance": "Evaluated SFT model family across 7B/13B/70B scales"
    283     },
    284     {
    285       "title": "Time travel in LLMs: Tracing data contamination in large language models",
    286       "relevance": "Motivates robustness evaluation as alternative to contaminated standard benchmarks"
    287     },
    288     {
    289       "title": "Least-to-most prompting enables complex reasoning in large language models",
    290       "relevance": "Prompting method (LTM) evaluated and compared against proposed COMP method"
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 3,
    296       "justification": "Directly actionable for anyone evaluating LLM math capabilities — provides a drop-in replacement for GSM8K with robustness measurement."
    297     },
    298     "surprise_contrarian": {
    299       "score": 2,
    300       "justification": "Challenges the prevailing narrative that >90% GSM8K accuracy means strong mathematical understanding, showing large gaps under minor perturbations."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "Raises mild concern that LLMs deployed in math-adjacent tasks may fail on slight problem variations, but doesn't connect to safety-critical scenarios."
    305     },
    306     "drama_conflict": {
    307       "score": 2,
    308       "justification": "Directly engages the ongoing community debate about whether LLMs 'truly understand' math or exploit patterns, with concrete evidence for the skeptical side."
    309     },
    310     "demo_ability": {
    311       "score": 3,
    312       "justification": "Dataset and evaluation code are released publicly; anyone can run models on GSM-Plus immediately and reproduce the robustness analysis."
    313     },
    314     "brand_recognition": {
    315       "score": 2,
    316       "justification": "Evaluates GPT-4, GPT-3.5, LLaMA-2, and CodeLlama — well-known models — but the authors are from HKU/Tencent, not a tier-1 AI lab."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "42306766",
    323         "title": "Beyond Language Models: Byte Models Are Digital World Simulators",
    324         "points": 4,
    325         "comments": 0,
    326         "url": "https://news.ycombinator.com/item?id=42306766",
    327         "created_at": "2024-12-03T14:58:15Z"
    328       },
    329       {
    330         "hn_id": "46502322",
    331         "title": "Beyond Language Models: Byte Models Are Digital World Simulators (2024)",
    332         "points": 1,
    333         "comments": 0,
    334         "url": "https://news.ycombinator.com/item?id=46502322",
    335         "created_at": "2026-01-05T18:06:20Z"
    336       }
    337     ],
    338     "top_points": 4,
    339     "total_points": 5,
    340     "total_comments": 0
    341   }
    342 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs