scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26467B)
      1 {
      2   "paper": {
      3     "title": "Is GPT-OSS Good? A Comprehensive Evaluation of OpenAI's Latest Open Source Models",
      4     "authors": ["Ziqian Bi", "Keyu Chen", "Chiung-Yi Tseng", "Danyang Zhang", "Tianyang Wang", "Hongying Luo", "Lu Chen", "Junming Huang", "Jibin Guan", "Junfeng Hao", "Xinyuan Song", "Junhao Song"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.12461",
      8     "doi": "10.48550/arXiv.2508.12461"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "GPT-OSS models (20B and 120B MoE) show mid-tier performance among contemporary open-source LLMs. The 20B model outperforms the 120B on several benchmarks (MMLU, SciQ), contradicting scaling expectations with statistical significance (p<0.01, Cohen's d=0.73). Both models are weak on multilingual tasks (C-Eval: 20-28%). The 20B variant offers superior cost-performance, requiring 5x less GPU memory with comparable or better accuracy.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract mentions 'evaluation scripts are available at the Project Webpage' but no actual URL is provided in the paper text. No repository link found."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All ten benchmarks used (MMLU, GSM8K, HumanEval, FinQA, PIQA, SciQ, MedQA, LegalQA, DialogSum, C-Eval) are publicly available standard benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section IV-A mentions '8 NVIDIA H100 80GB GPUs, Ubuntu 22.04 LTS, vLLM' but no software versions, dependency files, or library versions are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the pipeline at a high level (Section IV-C) but provides no commands, scripts, or README."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Fig 3 shows error bars representing 95% bootstrap confidence intervals. Section V-H states bootstrap CIs with max width ±2.1%."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "McNemar's test with Benjamini-Hochberg correction is used for pairwise comparisons (Section III-D). Section V-H confirms p-values below 0.05 for all reported differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Cohen's d effect sizes are reported: d=0.73 for the inverse scaling finding (Section V-A), and range from d=0.52 to d=1.84 across comparisons (Section V-H)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No power analysis or sample size justification is provided. Section III-C mentions 'power analysis guidelines from Card et al.' but does not actually report a power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations or variance across runs are reported in the results tables. Results appear to be single-run numbers. Only bootstrap CIs on the aggregated metrics are shown."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Six contemporary open-source models serve as baselines: Qwen 3 235B, DeepSeek-R1 70B, Phi-4 Reasoning, Llama 4 Scout, Llama 3.3 70B, Gemma 3 27B (Table I)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All baseline models are contemporary (2024-2025): Qwen 3, DeepSeek-R1, Phi-4 Reasoning, Llama 4 Scout, Llama 3.3, Gemma 3. These represent state-of-the-art open-source models."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is a benchmark evaluation comparing existing models, not a system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Ten benchmarks across five capability domains are used: MMLU, GSM8K, HumanEval, FinQA, PIQA, SciQ, MedQA, LegalQA, DialogSum, C-Eval (Table II)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section V-E presents a case study with multi-dimensional quality assessment including human readability, answer clarity, and conciseness ratings (Table IV). A blind evaluation protocol is described in Section III-C."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard benchmark test sets are used. Section IV-B describes data quality assurance with 5% manual inspection of samples. The benchmarks have established train/test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Detailed per-benchmark breakdowns in Table II and Fig 4. Table III provides mathematical reasoning sub-category breakdown (basic arithmetic, multi-step algebra, word problems). Section V-B discusses per-subject MMLU breakdowns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section V-C discusses failure patterns: 'both GPT-OSS models consistently fail on problems requiring numerical precision maintenance, particularly those involving unit conversions.' Section V-D discusses code generation failure modes."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The inverse scaling finding is itself a negative result — the larger model underperforms the smaller one. GPT-OSS's severe weakness in multilingual tasks (C-Eval: 20-28%) is prominently reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about mid-tier performance, inverse scaling between 20B and 120B, strength in code generation, and weakness in multilingual tasks are all supported by Tables II and IV and Fig 7."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims like 'scaling in sparse architectures may not yield proportional performance gains' and attributes inverse scaling to 'potential inefficiencies in the MoE routing mechanism or suboptimal training configuration' (Section V-K) without evidence to support these causal mechanisms. The study is observational — it compares models but cannot isolate the cause of the scaling anomaly."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section VI (Conclusion) explicitly bounds claims: 'this study relies on established benchmarks that, though widely adopted, do not fully capture emerging capabilities or real-world robustness, and we did not exhaustively optimise prompting or decoding strategies for any single model.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section V-K discusses multiple explanations for inverse scaling: MoE routing inefficiencies and a suboptimal training configuration for the larger model. Section V-E acknowledges that response length differences 'may represent presentation choices rather than fundamental capability differences.'"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match the granularity of its measurements — it reports benchmark accuracy scores and frames results in terms of those specific benchmarks rather than making broader capability claims. The abstract says 'mid-tier overall performance' based on the specific benchmarks tested."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Table I lists model names and parameter counts (e.g., 'GPT-OSS 120B', 'Qwen 3 235B', 'DeepSeek-R1 70B') but no specific version hashes, snapshot dates, or checkpoint identifiers are provided for any model."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompt text is provided. Section III-C mentions 'prompt engineering incorporates insights from Brown et al., Wei et al., and Kojima et al.' but the actual prompts used for each benchmark are not shown."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-A reports: temperature 0.7/0.1 by task type, top-p 0.95, top-k 50, max tokens 2000, repetition penalty 1.1."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models are evaluated directly on benchmarks via standard inference."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section IV-B documents a multi-stage validation: 5% stratified manual inspection (~2,100 examples), character encoding validation, quality filtering with 98.7% retention rate and documentation of removed examples."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section exists. The conclusion contains a single sentence acknowledging benchmark limitations and prompting non-optimization, but there is no substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The conclusion mentions generic limitations ('benchmarks do not fully capture emerging capabilities') without study-specific threats."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section VI states: 'this study relies on established benchmarks that...do not fully capture emerging capabilities or real-world robustness, and we did not exhaustively optimise prompting or decoding strategies for any single model.' Section V-J specifically notes GPT-OSS weakness in non-English domains."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw evaluation outputs, model responses, or per-example results are made available. Only aggregated accuracy numbers are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section IV-B describes the data preparation pipeline in detail: manual inspection of 5% samples, character encoding validation, quality filtering. Section III-B describes benchmark selection criteria."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section IV-C describes a five-stage evaluation pipeline (initialization, data loading, inference, post-processing, aggregation). Section IV-B documents filtering with 98.7% retention rate."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed. The acknowledgments section thanks the open source community and mentions 'computational resources provided by our institution' but names no specific funder or grant."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Purdue, Georgia Tech, ByteDance, LuxMuse AI, University of Minnesota, Emory, Imperial College London, plus 'AI Agent Lab, Vokram Group' for most authors."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funder is disclosed, making it impossible to assess independence. The paper cannot demonstrate funder independence without first disclosing the funder."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is included. Several authors are affiliated with commercial entities (ByteDance, LuxMuse AI, Vokram Group) but no financial interest declaration is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the eight evaluated models. This is critical for assessing whether benchmark data appeared in training."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "Section II-C mentions contamination as a general concern and cites Sainz et al. and Magar & Schwartz, but no actual analysis of train/test overlap is performed for the models tested."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Section IV-E mentions 'data contamination detection employs methods from Sainz et al. and Magar & Schwartz' but no results of contamination detection are reported. HumanEval (2021), MMLU (2021), GSM8K (2021) etc. all predate these models' training — high contamination risk is unaddressed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this benchmark evaluation study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section V-I reports GPU memory per device (80GB vs 16GB), throughput (128 vs 178 tokens/s), and energy consumption (2.6x difference). Token count distributions shown in Fig 6."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is described (8x H100 80GB GPUs) but total compute time, GPU hours, or wall-clock time for the full evaluation is not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No results across multiple random seeds are reported. Section IV-E mentions 'fixed random seeds' for reproducibility but no seed sensitivity analysis is shown."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs per benchmark per model is never stated. Results appear to be single-run."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Temperature and sampling parameters are stated but no search process is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper uses a single configuration per task type (temperature 0.7 for creative, 0.1 for factual) without justifying why these specific values were chosen or whether alternatives were tested."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section III-D explicitly applies Benjamini-Hochberg correction for multiple comparisons among 8 models. Section V-H confirms Bonferroni correction was also used."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No discussion of author-evaluation bias. The paper does not acknowledge that running all models through their own pipeline could introduce systematic biases."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section V-G and V-I explicitly compare efficiency vs performance: GPU memory (5x difference), energy per response (2.6x), throughput. Fig 2 plots parameter count vs accuracy."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the chosen benchmarks actually measure the claimed capabilities. The paper uses standard benchmarks without questioning their construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated directly via standard inference."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage despite all benchmarks (MMLU 2021, HumanEval 2021, GSM8K 2021, etc.) predating the evaluated 2024-2025 models by years."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setups leak information through context or formatting."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark problems share structural similarities with training data or whether independence was verified."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Section IV-E claims contamination detection methods were used citing Sainz et al. but no results are reported. No canary strings, membership inference, or overlap analysis results are shown."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "GPT-OSS-20B consistently outperforms GPT-OSS-120B on several benchmarks including MMLU (69% vs 66%) and SciQ (87% vs 82%), demonstrating inverse scaling.",
    365       "evidence": "Table II, Fig 7, Section V-A. Statistical validation: p<0.01, Cohen's d=0.73.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Both GPT-OSS models demonstrate mid-tier overall performance within the current open-source landscape.",
    370       "evidence": "Table II shows GPT-OSS-20B averaging 67.7% and GPT-OSS-120B averaging 64.8%, placing them 7th and 8th among 8 models. Fig 3 confirms rankings.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "GPT-OSS-20B achieves comparable or superior accuracy with 5x less GPU memory and 2.6x lower energy per response.",
    375       "evidence": "Section V-I reports 80GB (120B) vs 16GB (20B) peak GPU memory and a 2.6x energy reduction. Throughput: 128 (120B) vs 178 (20B) tokens/s.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Both GPT-OSS models show severe weakness in multilingual tasks with C-Eval scores of 28% (20B) and 20% (120B).",
    380       "evidence": "Fig 4 C-Eval panel and Table II. Well below the 45% threshold stated in Section V-J.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Scaling laws may be task-dependent rather than universal, as GSM8K shows conventional scaling (120B: 88% vs 20B: 82%) while MMLU shows inverse scaling.",
    385       "evidence": "Table II and Section V-C compare task-specific scaling patterns. Statistical testing confirms differences.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Inconsistent data between tables and figures",
    392       "detail": "Table II shows 'averaged benchmark results with published results for a more moderate view' — mixing the paper's own measurements with externally published numbers without clear separation. Fig 4 caption says 'collected from published benchmark results' while the text implies independent evaluation. It is unclear which numbers are the authors' measurements and which are borrowed from published sources."
    393     },
    394     {
    395       "flag": "Contamination claims without evidence",
    396       "detail": "Section IV-E claims 'data contamination detection employs methods from Sainz et al. and Magar & Schwartz' but no contamination analysis results are reported anywhere. All ten benchmarks predate the models by years, making contamination a serious concern that is never actually addressed."
    397     },
    398     {
    399       "flag": "No variance or multi-run reporting",
    400       "detail": "Despite claiming rigorous statistical methodology, the paper appears to report single-run results without variance across seeds. Bootstrap CIs on aggregated scores are not a substitute for multi-run variance assessment."
    401     },
    402     {
    403       "flag": "Qualitative quality ratings presented as objective",
    404       "detail": "Table IV assigns 'Excellent', 'Good', 'Medium', 'Poor' quality ratings to model responses on a single logic puzzle, but the rating methodology is subjective and the sample size is N=1 task."
    405     },
    406     {
    407       "flag": "Vague reference to 'Project Webpage'",
    408       "detail": "The abstract says 'evaluation scripts are available at the Project Webpage' but no URL is actually provided in the paper, making reproducibility claims unverifiable."
    409     },
    410     {
    411       "flag": "Author affiliations raise questions",
    412       "detail": "Most authors share affiliation with 'AI Agent Lab, Vokram Group' — an entity that does not appear to be a well-known research institution. Combined with ByteDance affiliation but no conflict-of-interest statement, this warrants scrutiny."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Scaling laws for neural language models",
    418       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan"],
    419       "year": 2020,
    420       "arxiv_id": "2001.08361",
    421       "relevance": "Foundational scaling laws paper — this paper's inverse scaling finding directly challenges its predictions."
    422     },
    423     {
    424       "title": "Training compute-optimal large language models",
    425       "authors": ["J. Hoffmann", "S. Borgeaud"],
    426       "year": 2022,
    427       "arxiv_id": "2203.15556",
    428       "relevance": "Chinchilla scaling approach, relevant to understanding compute-optimal model sizing."
    429     },
    430     {
    431       "title": "Are emergent abilities of large language models a mirage?",
    432       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    433       "year": 2024,
    434       "relevance": "Challenges emergent abilities narrative; relevant to understanding non-monotonic scaling."
    435     },
    436     {
    437       "title": "Holistic evaluation of language models",
    438       "authors": ["P. Liang", "R. Bommasani"],
    439       "year": 2023,
    440       "relevance": "HELM framework for LLM evaluation methodology — survey-relevant evaluation infrastructure."
    441     },
    442     {
    443       "title": "With little power comes great responsibility",
    444       "authors": ["D. Card", "P. Henderson"],
    445       "year": 2020,
    446       "relevance": "Statistical validity in NLP evaluation — directly relevant to methodology quality assessment."
    447     },
    448     {
    449       "title": "The hitchhiker's guide to testing statistical significance in natural language processing",
    450       "authors": ["R. Dror", "G. Baumer"],
    451       "year": 2018,
    452       "relevance": "Guidelines for statistical testing in NLP; relevant to evaluation methodology rigor."
    453     },
    454     {
    455       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    456       "authors": ["O. Sainz", "J. A. Campos"],
    457       "year": 2023,
    458       "arxiv_id": "2310.18018",
    459       "relevance": "Benchmark contamination detection methodology — directly relevant to data leakage assessment."
    460     },
    461     {
    462       "title": "Inverse scaling can become u-shaped",
    463       "authors": ["J. Wei", "N. Kim"],
    464       "year": 2023,
    465       "arxiv_id": "2211.02011",
    466       "relevance": "Documents inverse scaling phenomena in LLMs, directly relevant to this paper's main finding."
    467     },
    468     {
    469       "title": "Show your work: Improved reporting of experimental results",
    470       "authors": ["J. Dodge", "S. Gururangan", "D. Card"],
    471       "year": 2019,
    472       "relevance": "Best practices for reporting ML experimental results — methodology quality reference."
    473     },
    474     {
    475       "title": "Improving reproducibility in machine learning research",
    476       "authors": ["J. Pineau", "P. Vincent-Lamarre"],
    477       "year": 2021,
    478       "relevance": "NeurIPS reproducibility program report — directly relevant to our assessment of reproducibility practices."
    479     },
    480     {
    481       "title": "Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity",
    482       "authors": ["W. Fedus", "B. Zoph", "N. Shazeer"],
    483       "year": 2022,
    484       "relevance": "Foundational MoE architecture paper relevant to understanding sparse model scaling."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs