scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26681B)
      1 {
      2   "paper": {
      3     "title": "GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of LLMs as Mathematical Problem Solvers",
      4     "authors": ["Qintong Li", "Leyang Cui", "Xueliang Zhao", "Lingpeng Kong", "Wei Bi"],
      5     "year": 2024,
      6     "venue": "Annual Meeting of the Association for Computational Linguistics",
      7     "arxiv_id": "2402.19255",
      8     "doi": "10.48550/arXiv.2402.19255"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "GSM-PLUS reveals that LLMs exhibit significant performance drops (performance drop rate, PDR, of up to roughly 40%) when math questions are perturbed, even when they correctly solve the original GSM8K questions. Critical thinking and arithmetic variation perturbations cause the largest drops, while models are more resilient to numerical substitution and problem understanding perturbations. Task-specific fine-tuning improves accuracy but does not substantially improve robustness (PDR). A compositional prompting method (COMP) shows modest improvements but does not close the gap between standard and adversarial benchmarks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides a project URL (qtli.github.io/GSM-Plus/) and states 'Dataset and evaluation suits will be released' in the conclusion. The abstract also links to the project page."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The GSM-PLUS dataset of 10,552 question variations is built on the public GSM8K dataset. The paper states 'Dataset and evaluation suits will be released' and provides a project page URL."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the experimental setup but does not provide scripts or commands to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 4, 5, 8, and Figure 6 are reported as point estimates without confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes numerous comparative claims (e.g., 'GPT-4 exhibits the highest level of robustness') based solely on comparing numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports PDR (performance drop rate) as a relative measure with baseline context, e.g., 'GPT-4 exhibits the highest level of robustness with the smallest PDR of 8.23%' and provides accuracy on both GSM8K and GSM-PLUS for context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for the subset size of 120 seed questions used in the prompting experiments (§5.4). The full dataset size of 10,552 is described but not justified."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Temperature is set to 0 for deterministic predictions, which means single-run results. No variance across runs or seeds is reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares 25 LLMs including closed-source (GPT-4, GPT-3.5-Turbo), open-source foundation models, and math-specific SFT models, plus a human baseline."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include GPT-4, Mistral-7B, LLaMA-2 variants, and recent math SFT models (MetaMath, ToRA, MAmmoTH, SEGO), which were contemporary at time of writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper effectively ablates by comparing performance across 8 perturbation types, different model sizes, foundation vs SFT models, and different prompting techniques. Table 5 compares COMP components."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses accuracy, PDR (performance drop rate), and ASP (accurately solved pairs) as distinct evaluation metrics (§3.3)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Human performance baseline was established using 3 qualified annotators on 450 questions (50 seed + 400 variations), reported in Table 4 and Appendix C.1."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "GSM-PLUS is built from the GSM8K test set (1,319 questions). The dataset is used purely for evaluation, not for training or tuning any of the models tested."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Extensive per-perturbation breakdowns are provided in Figure 2, Figure 6, and Tables 8-9 across all 8 perturbation types."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Detailed failure case examples are provided in §5.2 (Figure 1) and Appendix C.3 (Examples C.1-C.8) covering critical thinking, adding operation, distractor insertion, reversing operation, and integer-decimal-fraction (IDF) conversion failures."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that math SFT does not significantly improve robustness (§5.1), that no prompting technique is sufficient to close the performance gap (§5.4), and that compositional prompting has limited impact."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about performance drops, sensitivity to statement additions and target alterations, and limited prompting improvements are all supported by results in Tables 4-5 and Figures 2-4."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claims are largely about perturbation effects on performance, which are tested through controlled single-variable manipulations (each perturbation type applied independently). The ablation of SFT datasets in §5.1 uses controlled comparisons."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper explicitly bounds results to grade school math (§Limitations: 'we mainly focus on the robustness of math reasoning at the elementary school level') and states that other education levels are future work."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses training data leakage as an alternative explanation for high GSM8K performance (§1, citing Golchin and Surdeanu 2023), and GPT-4's potential preference bias in dataset construction (Table 3, §3.2)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly frames accuracy as measuring robustness of math reasoning rather than making broader claims. PDR and ASP are well-defined metrics that directly measure what is claimed (robustness to perturbations), and the paper acknowledges it does not investigate accuracy of solution chains (Limitations)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper refers to 'GPT-4' and 'GPT-3.5-Turbo' without specific version strings or snapshot dates. Open-source models are identified by name and size (e.g., 'Mistral-7B', 'LLaMA-2-7B') but without exact checkpoint versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompts for the COMP method are provided in Appendix C.4 (Examples C.9-C.14), including premise extraction, subgoal generation, calculation, and verification prompts. The paper also describes 8-shot prompting for open-source models."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper states 'The decoding temperature is set to 0 for deterministic predictions' (§4) and specifies 8 demonstrations for open-source base models."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The COMP method's scaffolding is described in detail in §5.4 and Figure 5: iterative subgoal generation, calculation, and verification with specific prompts for each step."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The dataset construction pipeline is well-documented in §3.2: GPT-4 generates variations, and human annotators who passed a qualifying exam verify and correct them in batched workloads, with cross-annotation as a quality check (10% with 3+ annotators, 90.02% inter-annotator agreement). Table 7 shows pass rates per perturbation type."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present after the conclusion, listing three specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations are specific to this study: (1) focus only on elementary school level math, (2) comparing only answer accuracy without evaluating solution chains, (3) not investigating underlying reasons for failures."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope boundaries: 'we mainly focus on the robustness of math reasoning at the elementary school level' and identifies evaluating other education levels, solution chain accuracy, and failure root causes as out of scope."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The GSM-PLUS dataset is stated to be released via the project page (qtli.github.io/GSM-Plus/), which would include the raw question-answer pairs."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Detailed data collection procedure is described in §3.2 and Appendix B: GPT-4 generates variations from 8 perturbation types on 1,319 GSM8K seed questions, followed by human verification with qualifying exams and batched annotation."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Appendix B.2 describes annotator recruitment: a qualifying exam of 24 variations, individual review of submissions, and selection of 5 evaluators with at least a bachelor's degree. Appendix C.1 describes the recruitment of annotators for the human performance baseline via the Tencent crowdsourcing platform, with qualification requirements."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: GSM8K seed questions → GPT-4 variation generation → GPT-4 answer generation → human annotator verification/correction (18.85% revised) → cross-annotation quality check (10% with 3+ annotators). Table 7 provides pass rates per perturbation type."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section states: 'This research was supported in part by the joint research scheme of the National Natural Science Foundation of China (NSFC) and the Research Grants Council (RGC) under grant number N_HKU714/21.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: University of Hong Kong and Tencent AI Lab. No models from these organizations are being evaluated as primary subjects."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "NSFC and RGC are government research funding bodies with no financial stake in the benchmark evaluation outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the 25 LLMs evaluated, despite this being critical for assessing whether models may have seen GSM8K solutions during training."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "The paper explicitly discusses contamination: 'merely solve math word problems based on superficial patterns or even due to training data leakage (Golchin and Surdeanu, 2023)' (§1). The numerical variation perturbation is specifically designed to test overfitting. Table 3 analyzes GPT-4's potential preference bias."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The entire paper is motivated by contamination concerns. GSM-PLUS is designed to test whether models truly understand math or have memorized GSM8K solutions. The perturbation approach itself serves as a contamination detection method — if models solve originals but not perturbations, memorization is implicated."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The human annotation is for dataset construction and baseline performance, not a human subjects study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The human involvement is for annotation/evaluation tasks, not a human subjects research study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study; annotators are hired for data quality tasks."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, API costs, or latency figures are reported despite evaluating 25 models on 10,552+ questions and using GPT-4 for dataset construction."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated for either the dataset construction (GPT-4 calls) or the evaluation experiments."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Temperature is set to 0 for deterministic predictions, producing single-run results. No seed sensitivity analysis is reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states temperature=0 for deterministic predictions (§4), implying single runs. For COMP+SC, it states 'sampling 5 predictions' (Table 5)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The paper does not describe how the 8-shot examples were selected or whether any prompt tuning was performed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The selection of 8-shot demonstrations and 120-question subset for prompting experiments is not justified. No description of how configurations were selected."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes many comparisons across 25 models and 8 perturbation types without any correction for multiple comparisons. No statistical tests are performed at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The COMP prompting method is proposed and evaluated by the same authors without acknowledgment of self-comparison bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No discussion of compute differences between models or methods. The COMP method requires multiple LLM calls per question (premise extraction + iterative subgoal/calculation/verification) but cost comparison with simpler methods is not provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper explicitly discusses what GSM-PLUS measures vs. what is claimed, referencing Polya's principles (§3.1) and acknowledging limitations (e.g., not evaluating solution chain accuracy, focusing only on grade school level). Table 3 validates that GPT-4's dataset creation doesn't introduce bias."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved. The COMP method is a prompting strategy, not a scaffold, and models are compared using the same prompting approach within each experiment."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The entire paper is designed to address temporal leakage: GSM8K (2021) may be in training data of later models, so GSM-PLUS creates new perturbations to test whether models memorized solutions. The paper cites Golchin and Surdeanu (2023) on data contamination."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks answer information through context (e.g., the 8-shot demonstrations could provide structural hints)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "GSM-PLUS variations are derived from GSM8K questions, creating structural dependencies between test items. This non-independence is not discussed — performance on a variation may be correlated with performance on the seed question by construction."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The perturbation approach itself functions as a leakage detection method: if a model solves originals but fails on semantically equivalent perturbations, this suggests memorization rather than understanding. The numerical substitution perturbation specifically tests overfitting to training data."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "LLMs show substantial performance drops on GSM-PLUS compared to GSM8K, with PDR up to 40%",
    365       "evidence": "Table 4 shows PDR ranging from 8.23% (GPT-4) to 40.56% (CodeLlama-7B) across 25 models. All models show lower accuracy on GSM-PLUS than GSM8K.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Math SFT improves accuracy but does not significantly improve robustness (PDR)",
    370       "evidence": "§5.1: LLaMA-2-13B PDR 34.76% vs Abel-13B PDR 31.97%; LLaMA-2-70B PDR 29.40% vs MAmmoTH-70B PDR 29.62%. SFT models show similar or only slightly lower PDR than foundation models.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Critical thinking and arithmetic variation cause the largest performance drops",
    375       "evidence": "Figure 2 and Figure 6 show near-100% PDR for critical thinking on most open-source models, and >40% PDR for adding operation on most models.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The compositional prompting method COMP improves robustness over standard COT",
    380       "evidence": "Table 5: COMP achieves 65.52% on GSM-PLUS vs COT's 62.92%; COMP+SC achieves 69.47% vs COT+SC's 66.88%. Critical thinking improves from 40.83% to 54.17%.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Human performance is unaffected by the perturbations in GSM-PLUS",
    385       "evidence": "Table 4: Human accuracy is 96.77% on GSM8K and 98.75% on GSM-PLUS, with PDR of -2.05% (slight improvement).",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No statistical significance tests",
    392       "detail": "All comparative claims across 25 models and 8 perturbation types are based solely on point estimate comparisons without any statistical tests, despite making numerous claims about which models are more robust."
    393     },
    394     {
    395       "flag": "No error bars or variance reporting",
    396       "detail": "Using temperature=0 produces deterministic outputs, but this means results reflect a single decoding path. No assessment of sensitivity to prompt selection, demonstration examples, or other sources of variance."
    397     },
    398     {
    399       "flag": "Small subset for prompting experiments",
    400       "detail": "The prompting comparison (§5.4, Table 5) uses only 120 seed questions and 960 variations — a small subset that may not be representative. No justification for this sample size."
    401     },
    402     {
    403       "flag": "GPT-4 used for both dataset creation and evaluation",
    404       "detail": "GPT-4 generated the question variations and is also one of the evaluated models. While Table 3 shows similar performance on self-generated vs human-corrected variants, this creates a potential circular evaluation concern."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Training verifiers to solve math word problems",
    410       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    411       "year": 2021,
    412       "arxiv_id": "2110.14168",
    413       "relevance": "Introduces GSM8K, the foundational benchmark that GSM-PLUS extends for robustness evaluation."
    414     },
    415     {
    416       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    417       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    418       "year": 2022,
    419       "relevance": "Core prompting technique evaluated for robustness in mathematical reasoning."
    420     },
    421     {
    422       "title": "Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks",
    423       "authors": ["Wenhu Chen", "Xueguang Ma", "Xinyi Wang"],
    424       "year": 2022,
    425       "arxiv_id": "2211.12588",
    426       "relevance": "Program-of-thought prompting evaluated as a baseline prompting technique for math reasoning."
    427     },
    428     {
    429       "title": "Self-consistency improves chain of thought reasoning in language models",
    430       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    431       "year": 2022,
    432       "relevance": "Self-consistency technique used in COMP+SC ensemble for improving robustness."
    433     },
    434     {
    435       "title": "MetaMath: Bootstrap your own mathematical questions for large language models",
    436       "authors": ["Longhui Yu", "Weisen Jiang", "Han Shi"],
    437       "year": 2023,
    438       "arxiv_id": "2309.12284",
    439       "relevance": "Math SFT model evaluated for robustness; demonstrates question bootstrapping for math training."
    440     },
    441     {
    442       "title": "Large language models can be easily distracted by irrelevant context",
    443       "authors": ["Freda Shi", "Xinyun Chen", "Kanishka Misra"],
    444       "year": 2023,
    445       "relevance": "Prior work on distractor insertion in math reasoning (GSM-IC), which GSM-PLUS extends with additional perturbation types."
    446     },
    447     {
    448       "title": "Are NLP models really able to solve simple math word problems?",
    449       "authors": ["Arkil Patel", "Satwik Bhattamishra", "Navin Goyal"],
    450       "year": 2021,
    451       "relevance": "SVAMP benchmark for math robustness evaluation; early evidence that models rely on shortcuts."
    452     },
    453     {
    454       "title": "Time travel in LLMs: Tracing data contamination in large language models",
    455       "authors": ["Shahriar Golchin", "Mihai Surdeanu"],
    456       "year": 2023,
    457       "arxiv_id": "2308.08493",
    458       "relevance": "Directly relevant to benchmark contamination concerns that motivate GSM-PLUS."
    459     },
    460     {
    461       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    462       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    463       "year": 2023,
    464       "arxiv_id": "2305.10601",
    465       "relevance": "Advanced prompting technique for reasoning that is related to the compositional prompting approach explored."
    466     },
    467     {
    468       "title": "MAmmoTH: Building math generalist models through hybrid instruction tuning",
    469       "authors": ["Xiang Yue", "Xingwei Qu", "Ge Zhang"],
    470       "year": 2023,
    471       "arxiv_id": "2309.05653",
    472       "relevance": "Math SFT model evaluated for robustness; demonstrates hybrid training approach for math reasoning."
    473     }
    474   ]
    475 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs