scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28827B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Is GPT-OSS Good? A Comprehensive Evaluation of OpenAI's Latest Open Source Models",
      6     "authors": [
      7       "Ziqian Bi",
      8       "Keyu Chen",
      9       "Chiung-Yi Tseng",
     10       "Danyang Zhang",
     11       "Tianyang Wang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2508.12461",
     16     "doi": "10.48550/arXiv.2508.12461"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims code generation is a 'relative strength' but Table II shows GPT-OSS ranks last in HumanEval (73%/71%), and there are major numerical inconsistencies between Table II (which 'averages with published results') and the actual reported results in the text — e.g., Section F reports FinQA as 120B:84% vs 20B:58%, directly contradicting Table II's 120B:65% vs 20B:68%.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper attributes inverse scaling to 'potential inefficiencies in the MoE routing mechanism or suboptimal training configuration' without any ablation studies or architectural analysis to support these causal attributions.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper draws broad conclusions that 'scaling laws may not universally apply across all cognitive domains' based on only two GPT-OSS variants from a single release, and the title claims 'Comprehensive Evaluation' while covering only 10 benchmarks with no real-world task testing.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes inverse scaling primarily to MoE routing inefficiency or training configuration, without discussing alternative explanations such as differences in training data composition, fine-tuning approach, or alignment tuning that could also explain the 20B vs 120B performance gap.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The conclusion explicitly acknowledges that 'established benchmarks... do not fully capture emerging capabilities or real-world robustness,' and the paper avoids conflating benchmark scores with general intelligence.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; limitation discussion is confined to two sentences in the conclusion paragraph.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The conclusion mentions 'established benchmarks that do not fully capture emerging capabilities' and 'we did not exhaustively optimise prompting,' which are generic disclaimers rather than specific threats such as benchmark saturation, sample size adequacy, or model API versioning uncertainty.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated regarding what the results do NOT show; the paper claims to be 'comprehensive' without bounding applicability to, e.g., non-English settings, instruction-following tasks, or safety evaluations.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The acknowledgments vaguely reference 'computational resources provided by our institution' but no funding source or grant number is named.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed on the title page, including Purdue University, Georgia Tech, ByteDance, Emory University, Imperial College London, and 'AI Agent Lab, Vokram Group.'",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "'Mid-tier performance,' 'inverse scaling,' and 'comprehensive evaluation' are used without precise definitions; 'inverse scaling' is borrowed from citations but not formally defined in context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction clearly states the paper's purpose: empirically evaluate GPT-OSS 20B and 120B against six contemporary open-source LLMs across ten benchmarks to examine performance, efficiency, and scaling behavior.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II covers LLM evaluation, model architectures/scaling laws, and benchmarking practices with substantial references, positioning this work relative to HELM, BIG-bench, scaling law literature, and prior MoE evaluations.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The abstract mentions evaluation scripts 'available at the Project Webpage' but no URL is provided in the paper text, making independent access impossible.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All ten benchmarks used (MMLU, GSM8K, HumanEval, FinQA, PIQA, SciQ, MedQA, LegalQA, DialogSum, C-Eval) are standard publicly available datasets.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions Ubuntu 22.04 LTS, 8 NVIDIA H100 GPUs, and vLLM, but no requirements file, Dockerfile, or software version list is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are included; the paper references a project webpage URL that is never provided.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Figure 3 shows 95% bootstrap confidence interval error bars on performance rankings, and Section V.H reports maximum CI width of ±2.1%.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "McNemar's test with multiple comparison correction (Benjamini-Hochberg in Section III.D, Bonferroni in Section V.H — inconsistently) is applied for pairwise model comparisons, with p < 0.05 for all reported differences.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Cohen's d effect sizes are reported (range d=0.52 to d=1.84 in Section V.H, d=0.73 cited for the inverse scaling finding).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper claims to follow 'power analysis guidelines from Card et al.' but never reports the actual number of examples used per benchmark or demonstrates that sample sizes are adequate for the comparisons made.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Bootstrap confidence intervals are reported with a stated maximum width of ±2.1%, and inter-rater reliability κ=0.87 is reported for human quality evaluation.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Six contemporary open-source LLMs are used as baselines: Qwen3 235B, DeepSeek-R1 70B, Phi-4 Reasoning 14B, Llama 4 Scout, Llama 3.3 70B, and Gemma 3 27B.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baseline models were released in 2024-2025, representing the state-of-the-art at the time of evaluation.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": false,
    193           "answer": false,
    194           "justification": "This is a pure evaluation study of existing models; no novel components are introduced that would require ablation.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Ten benchmarks across five capability domains are used, plus efficiency metrics (memory, throughput, energy) and a qualitative multi-dimensional quality assessment.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "A blind human evaluation protocol is used for quality assessment in the logic reasoning case study (Section III.C), with three raters evaluating anonymized model outputs across four quality dimensions.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Standard benchmark test sets are used for all evaluations, with 1.3% of examples removed during quality filtering.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per benchmark, and Table III provides sub-category breakdown for mathematical reasoning (basic arithmetic, multi-step algebra, word problems, CoT gain).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Failure cases are discussed including GPT-OSS numerical precision failures on unit conversions, multilingual weakness (C-Eval below 30%), and Qwen3 235B generating 132,000 characters of repetitive output.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The inverse scaling result (20B outperforming 120B) and multilingual failures (both models below 30% on C-Eval) are explicitly reported as negative/unexpected findings.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are identified by name only (e.g., 'GPT-OSS 20B,' 'Qwen3 235B') without snapshot dates, commit hashes, or version tags that would allow exact reproduction.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper references prompting strategies from Brown et al. and Wei et al. but does not provide the actual prompt templates used for any benchmark.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature (0.7 creative/0.1 factual), top-p=0.95, top-k=50, max tokens=2000, repetition penalty=1.1 are all reported in Section IV.A.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; this is direct inference evaluation.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section IV.B documents a 5% stratified manual inspection, UTF-8 encoding validation, and quality filtering resulting in 98.7% example retention, with removed examples documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw model outputs or per-example results are made publicly available; the paper only reports aggregate statistics.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The data collection procedure is described: standard benchmark test sets used with documented preprocessing steps, 5% manual inspection, and quality filtering criteria.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Standard public benchmarks are used; no participant recruitment was involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "A high-level 5-stage pipeline is described but the full chain from benchmark loading to final aggregated statistics is not documented with sufficient detail for independent replication.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The GPT-OSS models are described as released in August 2025 but their training data cutoff is not stated, nor are the cutoffs of any of the six baseline models.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Section IV.E claims contamination detection methods from Sainz et al. and Magar & Schwartz were employed, but no contamination analysis results are actually presented in the paper.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "All benchmarks (MMLU, GSM8K, HumanEval, etc.) predate August 2025 and could plausibly be in GPT-OSS training data; despite citing contamination detection methods, no overlap analysis or contamination scores are reported.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participant study was conducted.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participant study was conducted.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participant study was conducted.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participant study was conducted.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participant study was conducted.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participant study was conducted.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participant study was conducted.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section V.I reports peak GPU memory (80GB vs 16GB), throughput (128 vs 178 tokens/s), and energy per response (2.6x less for 20B) for the two GPT-OSS variants.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The total compute cost (GPU-hours, dollar cost) for running all evaluations across eight models and ten benchmarks is not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "GPT-OSS 20B outperforms GPT-OSS 120B on multiple benchmarks including MMLU (69% vs 66%) and SciQ (87% vs 82%)",
    375       "evidence": "Table II and Fig 7 show 20B ahead of 120B on most benchmarks; McNemar's test p<0.01, Cohen's d=0.73",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Both GPT-OSS models occupy a mid-tier position in the current open-source landscape, consistently outperformed by Qwen3-235B, DeepSeek-R1-70B, and Phi-4 Reasoning",
    380       "evidence": "Table II shows GPT-OSS averages of 67.7% and 64.8% vs top models at 82.4-85.2%; Fig 3 rankings confirm this ordering",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "GPT-OSS shows severe multilingual weakness with C-Eval scores of 45% (20B) and 42% (120B)",
    385       "evidence": "Table II C-Eval column; text Section V.J reports 28%/20% for 20B/120B — inconsistent with Table II's 45%/42%",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "GPT-OSS 20B requires 5x less GPU memory and 2.6x less energy per response compared to GPT-OSS 120B",
    390       "evidence": "Section V.I: peak memory 80GB vs 16GB, energy savings stated as 2.6x per completed response",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "GPT-OSS 120B shows superior response quality in logic reasoning compared to DeepSeek-R1 and Phi-4 Reasoning, which expose verbose internal chains",
    395       "evidence": "Table IV logic reasoning case study: GPT-OSS 120B rated 'Excellent' overall vs 'Poor' for DeepSeek-R1 and Phi-4; 2,399 vs 26,000+ characters",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Chain-of-thought prompting improved GSM8K accuracy by 15% for 20B and 14% for 120B",
    400       "evidence": "Table III CoT Gain column; no baseline vs CoT breakdown table provided, only summary numbers",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "GPT-OSS 20B consistently matches or outperforms the larger 120B variant across most benchmarks, contradicting conventional scaling law predictions, with p<0.01 statistical confirmation. Both models occupy a mid-tier position in the open-source landscape — outperformed by Qwen3-235B, DeepSeek-R1-70B, and Phi-4 Reasoning — with particular strength in structured reasoning and notable weakness in multilingual tasks (C-Eval below 30%). GPT-OSS 20B delivers comparable or superior accuracy with 5x lower GPU memory and 2.6x lower energy per response than the 120B model. However, the paper contains major numerical inconsistencies between Table II (which averages experimental and published results) and the results text, substantially undermining confidence in the reported findings.",
    408   "red_flags": [
    409     {
    410       "flag": "Numerical inconsistencies throughout",
    411       "detail": "Table II (which 'averages with published results') and the paper's own text report drastically different numbers for the same models on the same benchmarks. E.g., Section F reports FinQA as 120B:84% vs 20B:58%, while Table II shows 65% vs 68%; C-Eval text reports 20%/28% while Table II shows 42%/45%. This makes the primary claims unverifiable."
    412     },
    413     {
    414       "flag": "Methodologically unsound averaging of own results with published results",
    415       "detail": "Table II caption states results are averaged with 'published results for a more moderate view' — this conflates different testing conditions, prompts, and preprocessing pipelines into a single number, making comparisons uninterpretable."
    416     },
    417     {
    418       "flag": "Statistical test inconsistency",
    419       "detail": "Section III.D specifies Benjamini-Hochberg correction for multiple comparisons, but Section V.H reports using Bonferroni correction — these are different procedures with different statistical properties, and it is unclear which was actually applied."
    420     },
    421     {
    422       "flag": "No contamination analysis actually performed",
    423       "detail": "The paper claims contamination detection methods from Sainz et al. and Magar & Schwartz were used, but no contamination results, overlap rates, or exclusion decisions are reported anywhere in the paper."
    424     },
    425     {
    426       "flag": "No model version snapshots",
    427       "detail": "Models are identified by marketing name only (e.g., 'GPT-OSS 20B') with no snapshot dates, API version tags, or commit hashes — the models may have been updated between evaluation and publication."
    428     },
    429     {
    430       "flag": "Project webpage URL never provided",
    431       "detail": "The abstract promises evaluation scripts 'available at the Project Webpage' but no URL appears anywhere in the paper, making this a non-verifiable promise."
    432     },
    433     {
    434       "flag": "Qualitative quality evaluation on a single example",
    435       "detail": "The 'optimal character range' of 1,000-3,000 for response quality is stated without empirical basis, and the entire logic reasoning case study evaluates one puzzle — insufficient evidence to support claims about response quality differences."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "Measuring Massive Multitask Language Understanding",
    441       "relevance": "MMLU benchmark used as primary general knowledge evaluation metric"
    442     },
    443     {
    444       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    445       "relevance": "GSM8K benchmark used for mathematical reasoning evaluation"
    446     },
    447     {
    448       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    449       "relevance": "HumanEval benchmark used for code generation evaluation"
    450     },
    451     {
    452       "title": "Holistic Evaluation of Language Models (HELM)",
    453       "relevance": "Evaluation framework whose best practices the paper claims to follow"
    454     },
    455     {
    456       "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    457       "relevance": "MoE architecture that GPT-OSS is compared to in architectural discussion"
    458     },
    459     {
    460       "title": "Scaling Laws for Neural Language Models (Kaplan et al.)",
    461       "relevance": "Scaling law predictions that the paper claims GPT-OSS violates"
    462     },
    463     {
    464       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    465       "relevance": "Refined scaling law predictions used as baseline expectations for model performance"
    466     },
    467     {
    468       "title": "NLP Evaluation in Trouble: On the Need to Measure LLM Data Contamination for Each Benchmark",
    469       "relevance": "Contamination detection methodology claimed to be used in evaluation"
    470     },
    471     {
    472       "title": "With Little Power Comes Great Responsibility (Card et al.)",
    473       "relevance": "Statistical validity and effect size reporting guidelines followed in evaluation design"
    474     },
    475     {
    476       "title": "C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite",
    477       "relevance": "Multilingual benchmark used to expose GPT-OSS weakness in Chinese comprehension"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 2,
    483       "justification": "Practitioners choosing between GPT-OSS variants for deployment will find the efficiency comparison (5x memory, 2.6x energy, 20B vs 120B) directly actionable."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "The finding that a 20B MoE model outperforms its 120B sibling on most benchmarks directly challenges scaling law intuitions."
    488     },
    489     "fear_safety": {
    490       "score": 0,
    491       "justification": "No AI safety or risk content; purely a capability/efficiency benchmark study."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Mild interest as OpenAI's first open-weight release since GPT-2, but the paper's conclusion that GPT-OSS is mid-tier dampens hype."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "GPT-OSS models are publicly available as open weights, so readers can immediately run their own comparisons."
    500     },
    501     "brand_recognition": {
    502       "score": 3,
    503       "justification": "OpenAI's first open-weight model release in six years is a high-interest event; the GPT brand drives significant attention independent of results quality."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "42807387",
    510         "title": "A Faster Quantum Fourier Transform",
    511         "points": 89,
    512         "comments": 6,
    513         "url": "https://news.ycombinator.com/item?id=42807387",
    514         "created_at": "2025-01-23T19:49:59Z"
    515       },
    516       {
    517         "hn_id": "44792686",
    518         "title": "Language Models Improve When Pretraining Data Matches Target Tasks",
    519         "points": 7,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=44792686",
    522         "created_at": "2025-08-04T23:48:16Z"
    523       },
    524       {
    525         "hn_id": "43091208",
    526         "title": "Show HN: Fray: A controlled concurrency testing framework for the JVM",
    527         "points": 4,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=43091208",
    530         "created_at": "2025-02-18T16:11:24Z"
    531       },
    532       {
    533         "hn_id": "44982554",
    534         "title": "Is GPT-OSS Good? A Comprehensive Evaluation",
    535         "points": 2,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=44982554",
    538         "created_at": "2025-08-22T09:36:04Z"
    539       },
    540       {
    541         "hn_id": "37315357",
    542         "title": "Are ChatGPT and GPT-4 Good Poker Players? – A Pre-Flop Analysis",
    543         "points": 2,
    544         "comments": 1,
    545         "url": "https://news.ycombinator.com/item?id=37315357",
    546         "created_at": "2023-08-29T23:05:56Z"
    547       },
    548       {
    549         "hn_id": "44658474",
    550         "title": "Scalable Chrysopoeia via (N,2n) Reactions Driven by Deuterium-Tritium Fusion",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=44658474",
    554         "created_at": "2025-07-23T12:30:10Z"
    555       },
    556       {
    557         "hn_id": "37656429",
    558         "title": "Are ChatGPT and GPT-4 Good Poker Players? – A Pre-Flop Analysis",
    559         "points": 2,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=37656429",
    562         "created_at": "2023-09-26T09:03:09Z"
    563       },
    564       {
    565         "hn_id": "37409091",
    566         "title": "Are ChatGPT and GPT-4 Good Poker Players? Yes but Not Game Theory Optimal",
    567         "points": 2,
    568         "comments": 0,
    569         "url": "https://news.ycombinator.com/item?id=37409091",
    570         "created_at": "2023-09-06T18:27:32Z"
    571       },
    572       {
    573         "hn_id": "45319150",
    574         "title": "A Qualitative Study of Co-Creation, Communication, Flow, Trust in Vibe Coding",
    575         "points": 1,
    576         "comments": 0,
    577         "url": "https://news.ycombinator.com/item?id=45319150",
    578         "created_at": "2025-09-21T01:14:55Z"
    579       },
    580       {
    581         "hn_id": "43175795",
    582         "title": "Probing Non-Equilibrium Topological Order on a Quantum Processor",
    583         "points": 1,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=43175795",
    586         "created_at": "2025-02-25T18:53:09Z"
    587       }
    588     ],
    589     "top_points": 89,
    590     "total_points": 112,
    591     "total_comments": 9
    592   }
    593 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs