calibration.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

calibration.json (20137B)
      1 {
      2   "paper_slug": "advancing-language-model-2025",
      3   "calibration_date": "2026-02-28",
      4   "calibration_model": "opus",
      5   "total_questions": 50,
      6   "agreement_count": 48,
      7   "disagreement_count": 2,
      8   "agreement_rate": 0.96,
      9   "disagreements": [
     10     {
     11       "category": "evaluation_design",
     12       "question": "multiple_metrics",
     13       "sonnet_applies": true,
     14       "sonnet_answer": true,
     15       "opus_applies": true,
     16       "opus_answer": false,
     17       "direction": "sonnet_generous",
     18       "explanation": "The paper explicitly states 'Accuracy (Pass@1) is used as the primary evaluation metric' and reports only this single metric across all four benchmarks (MATH500, AIME, Omni-MATH-500, GPQA). The schema requires 'at least two different metrics (e.g., accuracy AND F1, or Pass@1 AND Pass@10).' Using the same metric (accuracy) on multiple datasets is not the same as using multiple metrics. Sonnet appears to have conflated multiple benchmarks with multiple metrics. The paper also tracks training-time metrics (response length, KL divergence, reward) but these are process metrics, not evaluation metrics of the system's output quality."
     19     },
     20     {
     21       "category": "setup_transparency",
     22       "question": "model_versions_specified",
     23       "sonnet_applies": true,
     24       "sonnet_answer": true,
     25       "opus_applies": true,
     26       "opus_answer": false,
     27       "direction": "sonnet_generous",
     28       "explanation": "While T1's own base models are specified (GLM-4-9B, Qwen2.5-14B, Qwen2.5-32B — identifiable checkpoints), the comparison baselines in Table 1 use marketing names without version specification: 'GPT-4o' and 'Claude-3.5-sonnet' lack snapshot dates or API versions. The schema explicitly states: 'Marketing names like \"Gemini-2.5\" or \"GPT-4o\" without a snapshot date or API version do NOT count as specified versions.' Since these baseline model results appear in the main comparison table and their exact versions affect reproducibility, this criterion is not satisfied."
     29     }
     30   ],
     31   "opus_checklist": {
     32     "artifacts": {
     33       "code_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper states 'The model weights and the data for SFT and RL training are publicly available at https://github.com/THUDM/T1.' A working GitHub URL is provided."
     37       },
     38       "data_released": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper states SFT and RL training data are publicly available at the GitHub repository. Training data sourced from publicly available datasets (MATH-train, NuminaMath). Evaluation benchmarks (AIME2024, MATH, GPQA, Omni-MATH) are also public."
     42       },
     43       "environment_specified": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper mentions SGLANG as the inference engine and provides training hyperparameters, but does not provide a requirements.txt, Dockerfile, conda environment, or detailed list of library versions sufficient to recreate the environment."
     47       },
     48       "reproduction_instructions": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Appendix A.1 provides training hyperparameters (learning rates, K, temperature, etc.) but does not include step-by-step reproduction instructions, README-level commands, or scripts to replicate the main experiments."
     52       }
     53     },
     54     "statistical_methodology": {
     55       "confidence_intervals_or_error_bars": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "All results in Table 1 are reported as single accuracy values (e.g., '92.4') with no confidence intervals, error bars, or uncertainty quantification. AIME is evaluated 32 times and averaged, but no variance or CI is reported for these repeated evaluations."
     59       },
     60       "significance_tests": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper claims T1 outperforms QwQ-32B-Preview and other baselines based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported."
     64       },
     65       "effect_sizes_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper reports absolute percentage improvements with baseline context: 'over a 10% improvement on Omni-MATH-500' and 'over a 20% improvement on AIME over its T1-SFT version,' with both baseline and T1 scores provided in Table 1 for comparison."
     69       },
     70       "sample_size_justified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "AIME uses only 30 problems. Omni-MATH-500 uses 500 examples described as 'efficient yet comprehensive evaluation' without statistical justification. No power analysis or formal sample size justification is provided for any benchmark."
     74       },
     75       "variance_reported": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "AIME is evaluated 32 times and averaged 'to get stable results,' but the standard deviation or other spread measure across these 32 runs is not reported. All other benchmarks appear to be single-run results. No variance, std dev, or IQR is reported anywhere."
     79       }
     80     },
     81     "evaluation_design": {
     82       "baselines_included": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 1 includes multiple baselines: GPT-4o, Claude-3.5-sonnet, Llama-3.3-70B-Instruct, Qwen2.5-Math-7B-Instruct, o1-preview, QwQ-32B-preview, and the original base models plus T1-SFT ablations."
     86       },
     87       "baselines_contemporary": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Baselines include contemporary systems from late 2024/early 2025: QwQ-32B-Preview, o1-preview, Claude-3.5-Sonnet, and GPT-4o, representing the state of the art at the time."
     91       },
     92       "ablation_study": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper includes systematic ablation studies: number of sampled responses K (Figure 3, Figure 4), sampling temperature (Table 2), penalty mechanism (Table 3), and SFT vs. SFT+RL (Table 1). Individual components are isolated and their contribution measured."
     96       },
     97       "multiple_metrics": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The paper explicitly states 'Accuracy (Pass@1) is used as the primary evaluation metric' and this is the only evaluation metric reported. The same metric (accuracy) is applied across four benchmarks (MATH500, AIME, Omni-MATH-500, GPQA). Using the same metric on multiple datasets is not the same as using multiple different metrics (e.g., Pass@1 AND Pass@10, or accuracy AND F1). Training metrics like response length and KL divergence are process metrics, not evaluation metrics."
    101       },
    102       "human_evaluation": {
    103         "applies": false,
    104         "answer": false,
    105         "justification": "This is a benchmark evaluation paper for mathematical reasoning where correctness is determined by matching against ground-truth answers. Human evaluation of model outputs is not applicable."
    106       },
    107       "held_out_test_set": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Training data uses MATH-train and NuminaMath; evaluation uses MATH-test/MATH500 (a separate split), AIME2024 (competition problems), Omni-MATH-500, and GPQA. Appendix A.1 describes how the training data is divided between stages: '12k for the SFT stage and the others for RL training.'"
    111       },
    112       "per_category_breakdown": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Results are broken down across four benchmarks of different difficulty levels (competition math AIME, olympiad math Omni-MATH, school math MATH500, and science GPQA). Figure 7 shows inference scaling separately for each dataset."
    116       },
    117       "failure_cases_discussed": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper presents a successful case study (Figure 9, AIME 2024 Question 15) but does not discuss failure cases, error patterns, or provide qualitative analysis of where T1 fails. No error analysis or examples of incorrect reasoning are shown."
    121       },
    122       "negative_results_reported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper reports several negative findings: min-p sampling causes training collapse (Table 2, MATH500 drops from 86.4 to 78.8), low temperatures (<=1.0) often cause training collapse, training without penalty leads to degradation and eventually fails (Table 3), and excessively high temperature (1.3) hurts performance."
    126       }
    127     },
    128     "claims_and_evidence": {
    129       "abstract_claims_supported": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract claims T1 'achieves superior performance on challenging math reasoning benchmarks' — supported by Table 1 showing SOTA on MATH500 and Omni-MATH-500. The abstract claims T1 'exhibits inference scaling behavior' — supported by Figures 6-7 showing monotonic improvement with more thinking tokens."
    133       },
    134       "causal_claims_justified": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Causal claims (e.g., 'sampling more responses encourages exploration,' 'penalty prevents collapse') are supported by controlled single-variable ablation studies: varying only K (Figure 3), only temperature (Table 2), only penalty (Table 3). This controlled manipulation design is adequate for the causal claims made."
    138       },
    139       "generalization_bounded": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The title says 'Advancing Language Model Reasoning' without bounding to math. The abstract claims the approach enables 'effective test-time scaling' and the conclusion says it enhances 'reasoning capabilities' broadly. But evaluation is limited to math benchmarks (MATH, AIME, Omni-MATH) and GPQA science. No code generation, commonsense reasoning, or other reasoning domains are tested. The paper notes OOD generalization to GPQA but does not explicitly bound its broader claims."
    143       },
    144       "alternative_explanations_discussed": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper does not discuss alternative explanations for the observed improvements. It does not consider whether improvements could be due to data overlap, whether the reward signal design alone (vs. exploration strategies) accounts for the gains, or whether the summarization model used in inference scaling analysis introduces confounds."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "T1's base models are specified as GLM-4-9B, Qwen2.5-14B, and Qwen2.5-32B (identifiable checkpoints). However, comparison baselines in Table 1 use marketing names: 'GPT-4o' and 'Claude-3.5-sonnet' without snapshot dates or API versions. The schema explicitly states 'Marketing names like \"Gemini-2.5\" or \"GPT-4o\" without a snapshot date or API version do NOT count as specified versions.' Some baseline results are reported 'from previous works if available,' but the versions remain unspecified."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Section 2.2.1 describes the prompting procedure in natural language: 'we prompt an LLM to thoroughly examine each attempt' and 'we further prompt an LLM to incorporate these refined attempts into a single output.' The actual prompt text used is not provided. The schema requires actual prompt text, not natural language descriptions."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Appendix A.1 reports: SFT learning rate 1e-5 with cosine decay, RL learning rate 1.5e-6, KL coefficient 2e-4, K=64 responses, temperature=1.2, top-p=0.95, max generation length 10240/16384, reward function (binary correctness), gradient updates every 32 prompts."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "T1 is a training methodology (SFT + RL), not an agentic scaffolding system. The model generates reasoning chains end-to-end in a single autoregressive generation. No external tool use, memory, or multi-step scaffolding is involved."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Appendix A.1 documents data preprocessing: answer extraction from NuminaMath using an LLM, data filtering to retain instances with pass rate in (0, 0.3) from 16 generated responses per question, resulting in 30k RL training examples and 12k SFT examples. Filtering criteria and resulting sizes are stated."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper has an 'Impact Statement' section but no dedicated Limitations or Threats to Validity section. The Impact Statement is entirely generic boilerplate: 'The ethical aspects and societal implications of our work align with those commonly associated with advancing the field of machine learning.'"
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No specific threats to validity are discussed anywhere in the paper. The Impact Statement contains no methodological discussion. The Conclusion also lacks any discussion of limitations or threats."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper does not explicitly state what the results do NOT show. While it mentions T1 is 'primarily optimized for math-related tasks,' it does not explicitly enumerate scope boundaries or disclaim broader generalization of the claimed 'reasoning capabilities.'"
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Model weights and training data are released at https://github.com/THUDM/T1. Evaluation benchmarks (AIME2024, MATH, GPQA, Omni-MATH) are publicly available, enabling independent verification and re-evaluation."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Appendix A.1 describes data collection: training data from MATH-train (Hendrycks et al., 2021) and NuminaMath (Li et al., 2024b), with answer extraction and filtering procedures. Evaluation datasets and their sources are specified."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants are involved. All training and evaluation data come from publicly available mathematical datasets. This criterion is not applicable."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Appendix A.1 documents the data pipeline: raw data (NuminaMath + MATH-train) → answer extraction via LLM → data filtering (pass rate in (0, 0.3) from 16 generated responses) → 12k SFT split + 30k RL split. Transformation steps and resulting sizes are stated at each stage."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Acknowledgements section discloses funding from NSFC 62495063, Tsinghua University-Siemens Joint Research Center (JCIIOT), and the New Cornerstone Science Foundation (XPLORER PRIZE)."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are listed in the paper header: Tsinghua University (Hou, Lu, Zhang, Li, Yao, Li, Tang, Dong) and ZhipuAI (Lv). The ZhipuAI affiliation of co-author Xin Lv is disclosed."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Funders (NSFC, Tsinghua-Siemens joint research center, New Cornerstone Science Foundation) are academic/government funding bodies without a direct financial stake in T1's performance on math benchmarks. Although co-author Xin Lv is from ZhipuAI (maker of GLM models), ZhipuAI is not listed as a funder."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement is present in the paper. Co-author Xin Lv is affiliated with ZhipuAI, whose GLM-4-9B model is one of three base models evaluated, but this potential conflict of interest is not explicitly declared."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper does not state the training data cutoff dates for the base models (Qwen2.5, GLM-4) or the comparison models (GPT-4o, Claude-3.5-Sonnet, o1-preview, QwQ-32B-Preview). This information is needed to assess contamination risk."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of potential train/test overlap. The paper does not analyze whether MATH test examples, AIME2024 problems, or Omni-MATH problems could have appeared in the pre-training data of the base models (Qwen2.5, GLM-4)."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "The MATH benchmark (Hendrycks et al., 2021) was published in 2021, so the base models (Qwen2.5, GLM-4) were likely trained on data that includes MATH problems. AIME2024 is newer, but competition problems circulate online. No contamination analysis or discussion is provided."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. Pre-registration is not applicable."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study. IRB approval is not applicable."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study. Demographics are not applicable."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study. Inclusion/exclusion criteria are not applicable."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study. Randomization of participants is not applicable."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study. Blinding is not applicable."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study. Attrition is not applicable."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper reports token counts for inference scaling analysis (up to ~6000 thinking tokens per response) and max generation lengths (10240-16384 tokens), but does not report wall-clock time, API costs, or cost per example for running T1."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not report GPU hours, hardware used (GPU type/count), total training time, or total compute budget. Training a 32B model with K=64 sampled responses per prompt is expensive, but the compute requirements are not quantified."
    302       }
    303     }
    304   }
    305 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs