ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25712B)


      1 {
      2   "paper": {
      3     "title": "Evaluating LLM Reasoning Beyond Correctness and CoT",
      4     "authors": ["Soheil Abbasloo"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.18134"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "SIEV, a dialectical evaluation framework based on thesis-antithesis-synthesis interactions, reveals substantial hidden reasoning gaps in state-of-the-art LLMs on saturated benchmarks GSM8K and MMLU. Models with near-identical correctness scores (e.g., GPT-5-chat at 96.4% vs O3 at 97.1% on GSM) diverge dramatically at synthesis (56.2% vs 93.6%), suggesting high static accuracy masks fragile reasoning. Cross-model dialectics show models often benefit from external antitheses more than self-generated ones, raising questions about whether LLM reasoning is a general capability or context-sensitive pattern matching.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper states 'the SIEV source code is publicly available at https://github.com/microsoft/siev' in Section 3."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Uses publicly available GSM8K and MMLU benchmarks. No proprietary data was collected."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency details are provided in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no reproduction guide is described in the text."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Table 1 reports ± values for pT and pS (e.g., '97.1±0.1', '93.6±0.7'), indicating uncertainty across runs."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes extensive comparative claims between models (rankings, performance differences) but uses no statistical significance tests. Differences are assessed by raw score comparison only."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports specific score drops with baseline context, e.g., 'GPT-5-chat loses more than 40 points (out of 100) on GSM' and Δ values showing thesis-to-synthesis change throughout Table 1."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for why 21 models were selected, or why the full GSM8K/MMLU datasets (rather than subsets) were used. No power analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Table 1 reports standard deviation across runs using ± notation for both pT and pS columns."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The thesis score (pT) serves as the conventional correctness baseline, and SIEV metrics (pS, DS, Δ) are compared against it across all 21 models."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Includes very recent models: O3, GPT-5, GPT-5-chat, O4-mini, Kimi-K2, DeepSeek-R1, alongside older models like GPT-3.5 for range."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study of SIEV's components. The λ and γ parameters are set (λ=0.7, γ=1) without exploring alternatives. No ablation of the antithesis stage or synthesis stage independently."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Reports four complementary metrics: Synthesis Score (pS), Dialectic Score (DS), Improvement (Δ), and Opposition Compliance (OC)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Section 4 explicitly acknowledges the absence: 'some aspects of reasoning... may require expert human judgment. Human-in-the-loop extensions could validate and refine SIEV's automated signals.' Evaluation is entirely automated."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Uses the standard test sets of GSM8K and MMLU, which are established benchmarks with defined test splits."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figures 1, 6, and 10 provide per-topic MMLU breakdowns. Figure 4 shows per-pattern breakdowns. Table 2 compares overall vs representative topic performance."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Figures 3 and 8 provide detailed qualitative failure examples. Figure 3 shows DeepSeek-R1 failing at synthesis; Figure 8 shows GPT-5 ignoring a valid antithesis."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Reports that Δ is negative across all models on average (Table 1), meaning synthesis generally degrades performance. Section 3.1: 'Δ is negative on average, indicating limited refinement at synthesis.'"
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claim 'GPT-5-chat loses more than 40 points on GSM' is confirmed in Table 1 (Δ=-40.2). Claims about substantial gaps are supported by the wide pS ranges shown."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper appropriately hedges its causal claims. Section 3.2: 'These patterns do not settle the debate, but they add weight to an existing view.' Section 3.1: 'The performance drops may hint at underlying issues in models training, though diagnosing these is beyond our current scope.'"
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title 'Evaluating LLM Reasoning Beyond Correctness and CoT' and the abstract's claim that SIEV enables 'a clearer foundation for assessing and understanding the reasoning capabilities of LLMs' are broad generalizations from only two benchmarks (GSM8K and MMLU). The limitations section acknowledges this, but the framing exceeds the evidence."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 3.2 'Key Takeaway' extensively discusses whether improvements reflect genuine reasoning or context-sensitive pattern matching, citing prior skeptical work (Dziri et al., Kambhampati, McCoy et al.)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 2.4 acknowledges 'these signals do not necessarily certify authentic reasoning, they offer stronger grounds for interpreting whether and how a model's apparent reasoning reflects a stable, integrative process.' The gap between measurement (synthesis accuracy) and claim (reasoning quality) is explicitly discussed."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are referred to by marketing names only: 'O3', 'GPT-5', 'GPT-4', 'DeepSeek-R1', 'Kimi-K2'. No API versions, snapshot dates, or model IDs are provided. This is critical since model behavior changes across versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix A provides the full prompt text for all three stages (thesis, antithesis, synthesis) with both MMLU and GSM parameter specifications."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No temperature, top-p, max tokens, or other API parameters are reported. Only the DS formula parameters (λ=0.7, γ=1) are stated."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The two-agent pipeline is described in detail in Section 2.2 and Figure 2: Agent A produces thesis and synthesis, Agent B produces antithesis. Thinking token handling for R1 is also documented in Figure 3 notes."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No description of how model outputs were parsed, how answer extraction worked, how edge cases (malformed outputs, refusals) were handled, or whether any filtering was applied to the results."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4 'Brief Discussion and Limitations' provides substantive discussion of multiple specific limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 4 discusses specific threats: OC measures opposition but not semantic quality of antitheses, synthesis evaluation via correctness alone misses multi-dimensional quality, absence of human-judged reasoning traces limits validation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 4: 'it remains to be seen how these findings generalize to emerging benchmarks, multimodal settings, or tasks that demand long-horizon planning or domain-specific symbolic reasoning.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw model outputs, reasoning traces, or per-question results are released. Only aggregate statistics are reported in the paper."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No details on when API calls were made, which API endpoints were used, or how the 21 models were accessed. The pipeline structure is described but not the data collection specifics."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Uses standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The TAS pipeline is described conceptually (Figure 2) but operational details are missing: how outputs were parsed, how correctness was determined, how many items failed parsing, whether any were excluded."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is mentioned. The author is at Microsoft Research but no funding acknowledgment section exists."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliation 'Microsoft Research, Vancouver, Canada' is clearly stated on the first page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "The author works at Microsoft Research. Microsoft has a major partnership with OpenAI whose models (GPT-4, GPT-5, O3, etc.) are extensively evaluated. This financial relationship is not discussed. Several OpenAI models rank highly (O3 ranks #1 on GSM)."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the 21 models evaluated. This is relevant since GSM8K (2021) and MMLU (2020) predate all tested models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper claims SIEV has 'lower susceptibility to contamination' (Section 1) but does not analyze whether thesis scores are inflated by contamination. GSM8K and MMLU are known to be heavily contaminated in modern LLMs."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "GSM8K and MMLU were both published years before any tested model's training. The paper acknowledges these are 'saturated benchmarks' but does not address whether saturation is partly due to contamination, which would undermine the thesis-score baseline."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "SIEV requires 3 inference passes per question across 21 models on two full benchmarks. No API costs, token counts, or latency figures are reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total compute budget, API spend, or hardware information is provided despite what must be substantial inference costs across 21 models."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Table 1 reports ± values across runs for all models (e.g., O3: 97.1±0.1), indicating multiple runs with variance measurement."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The ± values in Table 1 imply multiple runs but the exact number of runs is never stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The DS formula uses λ=0.7 and γ=1 without reporting how these values were selected or what alternatives were tried."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "λ=0.7 and γ=1 are presented without justification. Table 1 note states these values but no sensitivity analysis or selection rationale is provided."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The author proposes SIEV and evaluates it without acknowledging self-comparison bias or seeking independent validation of the framework."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "SIEV triples inference cost (3 passes per question) compared to standard evaluation. This cost overhead is never discussed or quantified relative to the diagnostic value gained."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The entire paper is motivated by questioning construct validity of correctness-based benchmarks. Section 1 argues that accuracy alone 'reveals little about the process' and SIEV is proposed as having better construct validity for measuring reasoning."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "The two-agent scaffold is identical across all model comparisons. Thinking token handling for R1 is explicitly documented: 'we redact thinking tokens when passing them' (Figure 3 notes), addressing a potential confound."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "GSM8K (2021) and MMLU (2020) both predate all 21 tested models. The paper does not discuss whether training on these benchmarks affects the models' thesis scores, which serve as the baseline for all SIEV comparisons."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "In the synthesis stage, models receive the antithesis as input, which may contain answer information. This structural feature leakage is not discussed as a potential confound."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether GSM8K and MMLU items share structural patterns that could advantage certain models."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection or prevention method is used despite evaluating on heavily contaminated benchmarks."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "GPT-5-chat loses more than 40 points on GSM when evaluated through SIEV's process-oriented lens",
    364       "evidence": "Table 1 shows GPT-5-chat pT=96.4 vs pS=56.2, Δ=-40.2 on GSM8K",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "SIEV reveals substantial hidden reasoning gaps (often exceeding 20%) on MMLU, a benchmark often treated as broadly solved",
    369       "evidence": "Table 1 shows multiple models with >20% drops from pT to pS on MMLU (e.g., GPT-5-chat: 88.2→50.5, GPT-4.1-mini: 84.9→54.7, Llama-3.3-70B: 85.1→58.3)",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Models with similar static correctness can mask very different reasoning trajectories",
    374       "evidence": "Table 1: GPT-5-chat (pT=96.4) and O3 (pT=97.1) have similar thesis scores but diverge at synthesis (56.2 vs 93.6 on GSM)",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Models' reasoning is strongly topic-dependent rather than a uniform general capability",
    379       "evidence": "Figure 6 shows wide pS variation across MMLU domains. Llama-3.3-70B-Inst performs well in Elementary Math but poorly in Moral Disputes. Section 3.1 discusses topic-dependent patterns.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Cross-model antitheses can improve reasoning performance compared to self-dialectics",
    384       "evidence": "Figure 7 (middle plot) shows GPT-5 improves +5.4 to +14 points in pS when paired with different antithesis models on GSM",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "SIEV has lower susceptibility to contamination compared to correctness-based evaluation",
    389       "evidence": "Claimed in Section 1 as a key contribution but no empirical test of this claim is provided",
    390       "supported": "unsupported"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Company evaluating related products",
    396       "detail": "Author is at Microsoft Research. Microsoft has a major partnership with OpenAI. Multiple OpenAI models (O3, GPT-5, O1) rank highly on SIEV metrics, particularly on GSM. This conflict is not acknowledged."
    397     },
    398     {
    399       "flag": "No model version specificity",
    400       "detail": "21 models evaluated using only marketing names (O3, GPT-5, etc.) without API versions or snapshot dates. Results are not reproducible since model behavior changes across versions."
    401     },
    402     {
    403       "flag": "Unsupported contamination resistance claim",
    404       "detail": "The paper claims SIEV has 'lower susceptibility to contamination' as a key contribution but provides no empirical evidence. Meanwhile, thesis scores on heavily contaminated benchmarks serve as the baseline for all SIEV comparisons."
    405     },
    406     {
    407       "flag": "Missing hyperparameters",
    408       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 21 models. These significantly affect LLM output quality and could influence synthesis/antithesis quality."
    409     },
    410     {
    411       "flag": "Unjustified metric parameters",
    412       "detail": "DS formula uses λ=0.7 and γ=1 without justification or sensitivity analysis. These parameters directly affect model rankings."
    413     },
    414     {
    415       "flag": "No cost reporting for expensive evaluation",
    416       "detail": "Evaluating 21 models × 2 benchmarks × 3 passes each represents massive inference cost that is never quantified, making practical adoption assessment impossible."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    422       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    423       "year": 2022,
    424       "arxiv_id": "2201.11903",
    425       "relevance": "Foundational work on CoT prompting; SIEV goes beyond it by evaluating the quality of the reasoning process."
    426     },
    427     {
    428       "title": "Measuring massive multitask language understanding",
    429       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    430       "year": 2020,
    431       "arxiv_id": "2009.03300",
    432       "relevance": "MMLU benchmark used as primary evaluation testbed; relevant to benchmark evaluation methodology."
    433     },
    434     {
    435       "title": "Training verifiers to solve math word problems",
    436       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    437       "year": 2021,
    438       "arxiv_id": "2110.14168",
    439       "relevance": "GSM8K benchmark used as primary evaluation testbed for SIEV."
    440     },
    441     {
    442       "title": "Faith and fate: Limits of transformers on compositionality",
    443       "authors": ["Nouha Dziri", "Ximing Lu"],
    444       "year": 2023,
    445       "arxiv_id": "2305.10403",
    446       "relevance": "Key prior work questioning whether LLMs genuinely reason, providing symbolic/graph-based reasoning analysis."
    447     },
    448     {
    449       "title": "GSM-Plus: A comprehensive benchmark for evaluating the robustness of LLMs as mathematical problem solvers",
    450       "authors": ["Qintong Li", "Leyang Cui", "Xueliang Zhao"],
    451       "year": 2024,
    452       "arxiv_id": "2402.19255",
    453       "relevance": "Related work on probing LLM reasoning robustness through benchmark perturbations."
    454     },
    455     {
    456       "title": "GSM-Symbolic: Understanding the limitations of mathematical reasoning in large language models",
    457       "authors": ["Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi"],
    458       "year": 2024,
    459       "arxiv_id": "2410.05229",
    460       "relevance": "Introduces symbolic perturbations to test LLM mathematical reasoning sensitivity."
    461     },
    462     {
    463       "title": "Embers of autoregression: Understanding large language models through the problem they are trained to solve",
    464       "authors": ["R. Thomas McCoy", "Shunyu Yao", "Dan Friedman"],
    465       "year": 2023,
    466       "arxiv_id": "2309.13638",
    467       "relevance": "Foundational work questioning whether LLMs reason or imitate reasoning through pattern matching."
    468     },
    469     {
    470       "title": "Alice in wonderland: Simple tasks showing complete reasoning breakdown in state-of-the-art large language models",
    471       "authors": ["Marianna Nezhurina", "Lucia Cipolina-Kun", "Mehdi Cherti"],
    472       "year": 2024,
    473       "arxiv_id": "2406.02061",
    474       "relevance": "Demonstrates reasoning failures in frontier LLMs on simple tasks, supporting SIEV's motivation."
    475     },
    476     {
    477       "title": "A peek into token bias: Large language models are not yet genuine reasoners",
    478       "authors": ["Bowen Jiang", "Yangxinyu Xie", "Zhuoqun Hao"],
    479       "year": 2024,
    480       "arxiv_id": "2406.11050",
    481       "relevance": "Shows LLM reasoning fragility under token-level perturbations, complementary to SIEV's process-level evaluation."
    482     },
    483     {
    484       "title": "MMLU-Pro: A more robust and challenging multi-task language understanding benchmark",
    485       "authors": ["Yubo Wang", "Xueguang Ma", "Ge Zhang"],
    486       "year": 2024,
    487       "arxiv_id": "2406.01574",
    488       "relevance": "Addresses benchmark saturation in MMLU with harder variant; relevant to LLM evaluation methodology."
    489     }
    490   ]
    491 }

Impressum · Datenschutz