scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29039B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On The Fragility of Benchmark Contamination Detection in Reasoning Models",
      6     "authors": [
      7       "Han Wang",
      8       "Haoyu Li",
      9       "Brian Ko",
     10       "Huan Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2510.02386",
     15     "doi": "10.48550/arXiv.2510.02386"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All major abstract claims — that GRPO conceals SFT contamination, that CoT contamination on LRMs evades detection near random guess, and that PPO-style clipping is the root cause — are directly supported by Tables 1–5 and Theorem 3.1.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The causal claim that PPO-style importance sampling/clipping causes concealment is supported by controlled ablations removing the clipping term (Table 3) and a formal mathematical proof in Appendix C; both independently reach the same conclusion.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper claims 'a broad class of RL methods may inherently exhibit similar concealment capability' based solely on GRPO and RAFT++; the generalization is left unbounded, as the paper does not specify which methods share the PPO clipping objective and which do not.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 3.1 explicitly addresses and rules out the alternative that GRPO simply causes models to forget contamination, using two experiments: RL on contaminated data showing comparable AUROC drops, and further clean SFT failing to conceal contamination (Figure 2, Table 14).",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "AUROC measures detection performance and pass@1 measures performance inflation; the paper distinguishes these throughout, and Section 3.1 explicitly discusses that performance inflation persists even when contamination is undetectable.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Appendix A is titled 'LIMITATIONS' and discusses that no new detection algorithm is proposed and that current detectors' failure stems from log-probability assumptions.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The limitations section does not discuss specific threats to experimental validity, such as the single base model used (Qwen2.5-7B-Instruct) for Stage I, the extremely small member/non-member sets (15 samples each for AIME), or the limited 156 RL training steps relative to real-world scales.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what its results do NOT show; there is no discussion of whether findings generalize beyond the tested model sizes, model families, or contamination ratios, and the limitations section focuses on what was not built rather than what was not proven.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment section appears anywhere in the paper; no grants, industry support, or other funding sources are mentioned.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors' institutional affiliations (University of Illinois Urbana-Champaign and University of Washington) are clearly disclosed on the first page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms including 'benchmark contamination,' 'SFT contamination,' 'RL contamination,' 'member/non-member sets,' and 'LRMs' are all explicitly defined in Section 1 and Section 3's contamination/detection setups.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly states it presents 'the first systematic study of benchmark contamination in LRMs' structured around two contamination scenarios, contributing empirical evidence of detection fragility and a theoretical explanation of the mechanism.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 provides dedicated related work covering LRMs, contamination detection methods (5 categories, 10+ methods), and prior concealment work, explicitly positioning this as the first algorithmic-level concealment analysis.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Source code is explicitly stated as available at https://github.com/ASTRAL-Group/LRM_Conta_Detection_Arena.git in the abstract.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All evaluation benchmarks used (AIME 2024/2025, GPQA Diamond, OlympiadBench, Minerva Math, AMC 2023) are standard publicly available datasets used without modification.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Appendix F describes hardware and names frameworks (LLaMA-Factory, Verl, vLLM, FlashAttention-2) but provides no requirements.txt, Dockerfile, or pinned dependency versions; exact software environment is not reproducible from this information alone.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Detailed hyperparameters are scattered across Appendix D.4 but no step-by-step reproduction instructions exist; a reader would need to reverse-engineer the pipeline from the code and appendix tables without a clear entry point.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All AUROC values in Tables 2, 3, 5, 11, and 12 are reported as point estimates only; no confidence intervals or error bars are reported despite results being averaged over 8 rollouts.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are performed for any comparative claims; AUROC differences between training conditions are presented without formal testing.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Delta (∆) columns in Tables 2, 3, 11, and 12 report absolute AUROC differences from baseline, providing effect size context for all comparative claims across detection methods and benchmarks.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The choice of 8 rollouts and 50/50 member/non-member split is not justified; the AIME benchmarks each have only 30 problems, yielding 15 members and 15 non-members per benchmark — too small for reliable AUROC estimation and not discussed as a concern.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No standard deviations or variance are reported across the 8 rollouts used to average detection scores; all result tables report only mean AUROC values.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Ten representative detection methods spanning generation-based, perturbation-based, reference-based, and reference-free approaches serve as baselines, compared before and after RL training.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Detection baselines span 2021–2025 including recent methods Min-K%++ (2024), CDD (2024), LLM Dataset Inference (2024), and Verbatim (2025), covering the current state of the art.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Table 3 ablates the clipping term in RAFT++ and GRPO, directly isolating its contribution to contamination concealment and confirming the theoretical prediction.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Both AUROC (contamination detection performance) and pass@1 (benchmark performance inflation) are reported throughout, measuring detection ability and practical impact separately.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "The evaluation involves automated scoring on math and science reasoning benchmarks; human evaluation is not relevant.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Each benchmark is split 50/50 into member (training) and non-member (held-out) sets; AUROC is computed by comparing detection scores between these disjoint sets.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "All result tables provide per-benchmark breakdowns across six benchmarks (OlympiadBench, GPQA Diamond, AIME 2025, AIME 2024, Minerva Math, AMC 2023), allowing assessment of variability across task types.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 4's discussion analyzes why detection fails on LRMs — LRMs generalize broadly from member to non-member distributions after CoT contamination, undermining the memorization assumption — providing mechanistic understanding of failure modes.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "The paper reports that RL contamination provides no performance benefit (Table 1), RAFT cannot conceal contamination (Tables 3, 12), and embedding-based methods fail to distinguish members from non-members even before contamination (Figures 10–12).",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model names are provided throughout: Qwen2.5-7B-Instruct, DeepSeek-R1-Distill-Llama-8B, DeepSeek-R1-Distill-Qwen-7B, OpenThought3-7B (15K), QwQ-32B (for distillation), and Bespoke-Stratos-7B (reference model).",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix D.4 provides exact prompt templates with full text for both math reasoning and multiple-choice formats, including complete worked examples.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Appendix D.4 provides complete hyperparameter tables for SFT (batch size, LR, context length, epochs, scheduler, warmup ratio) and RL (batch size, prompt/response length, ε, LR, rollout number, temperature) stages.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Appendix D.2 provides precise formulas for all 10 detection methods with implementation choices documented (e.g., α=0.5 for CDD, k=20 for Min-K%, why response tokens are used over question tokens).",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Appendix D.4 documents 13-gram overlap deduplication against benchmarks; Appendix D.1 describes the contamination pipeline including QwQ-32B distillation settings (temperature 0.6, top-p 0.95, 64 rollouts) and 3x question replication.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The contaminated training datasets (member sets with QwQ-32B-distilled reasoning chains) are not released as standalone artifacts; reproducing them requires running the full distillation pipeline.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Appendices D.1 and D.3 describe the contamination pipelines and benchmark sources in detail, including exact split procedures, benchmark sizes, and distillation parameters.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; standard public benchmarks are used without any participant recruitment.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The complete pipeline from benchmark selection → 50/50 member/non-member split → SFT/RL contamination training → detection evaluation is documented across Sections 3–4 and Appendices D.1–D.4.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs for the base models (Qwen2.5-7B-Instruct, DeepSeek-R1 distillations) are not stated; prior exposure to these benchmarks in pretraining is not discussed.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Train-test overlap is the paper's central subject; the authors use 13-gram deduplication of clean training data against benchmarks and explicitly control contamination through the experimental design.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Benchmark contamination is the core focus; the paper directly controls and measures it throughout, studying both deliberate contamination and RL-induced concealment in systematic experiments.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference cost or latency is reported; vLLM is mentioned and 8 rollouts per question are used, but no runtime or cost estimates are provided.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Appendix F specifies the hardware platform (9x NVIDIA L40S GPUs, 48 GiB each) but does not state total GPU-hours, wall-clock training time, or dollar cost for any experiment.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GRPO training with clean samples markedly conceals SFT contamination signals, with AUROC dropping 8–20pp across all detection methods",
    374       "evidence": "Table 2 shows systematic AUROC drops after 156 GRPO steps (Max-K%: −19.84pp, Loss: −16.68pp, Min-K%: −16.42pp); Figure 2 shows monotonic decline with each additional step",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "PPO-style importance sampling and clipping are the root cause of contamination concealment, not generic further training",
    379       "evidence": "Table 3 ablation: removing clipping from RAFT++ reduces its concealment effect from Δ−17.91 to Δ−1.09; RAFT (no clipping) shows no concealment (Δ+2.03); supported by Theorem 3.1",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Contamination inflation comes primarily from SFT, with RL contamination providing negligible additional benefit",
    384       "evidence": "Table 1: SFT contamination adds 8.82pp pass@1 on average; adding RL contamination shows no significant difference vs. clean RL training",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Extensive CoT contamination of advanced LRMs at the final training stage leaves barely any detectable evidence",
    389       "evidence": "Table 5: all 10 detection methods achieve near-random AUROC (~50%) across three LRM models and six benchmarks after extensive SFT CoT contamination",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "LRMs generalize from member to non-member distributions after CoT contamination, invalidating the memorization assumption behind all current detectors",
    394       "evidence": "Figure 4 shows log-prob of non-members increases at similar margin as members after contamination; Section 4 provides mechanistic analysis",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Further SFT training on clean data cannot make models forget contamination or suppress performance inflation",
    399       "evidence": "Table 14: after 4 additional epochs of clean SFT on the contaminated model, pass@1 continues to rise by 0.25pp and the contamination remains detectable",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "theoretical"
    406   ],
    407   "key_findings": "Existing contamination detection methods are fundamentally fragile against Large Reasoning Models. In Stage I, GRPO training with clean samples after SFT contamination systematically reduces detection AUROC by 8–20pp, with the PPO-style importance sampling/clipping mechanism identified as the root cause via controlled ablation (removing clipping eliminates concealment) and formal proof. In Stage II, contaminating advanced LRMs with CoT data reduces all detection methods to near-random guessing (~50% AUROC) because LRMs generalize broadly from member to similar non-member distributions, invalidating the memorization assumption underlying all current detection methods. Together these findings reveal that developers could deliberately contaminate LRMs to inflate leaderboard scores while leaving minimal detectable traces.",
    408   "red_flags": [
    409     {
    410       "flag": "Single base model for Stage I",
    411       "detail": "All Stage I experiments use only Qwen2.5-7B-Instruct; generalization to other model families, scales, or architectures is asserted but not empirically tested."
    412     },
    413     {
    414       "flag": "Tiny member/non-member sets for AIME",
    415       "detail": "AIME 2024 and AIME 2025 each have only 30 problems, yielding 15 members and 15 non-members per benchmark — insufficient for reliable AUROC estimation, yet used for primary claims."
    416     },
    417     {
    418       "flag": "No confidence intervals or significance tests",
    419       "detail": "All AUROC values are point estimates across 8 rollouts with no error bars or significance testing; it is impossible to assess whether reported differences are statistically meaningful."
    420     },
    421     {
    422       "flag": "Limited RL training steps vs. real-world scales",
    423       "detail": "Maximum 156 GRPO steps used; real-world LRM training uses orders of magnitude more steps. The paper acknowledges this gap but extrapolates that 'extensive GRPO training would render all existing detection methods to near-random performance.'"
    424     },
    425     {
    426       "flag": "Training cutoffs of base models unstated",
    427       "detail": "The paper does not state whether AIME, GPQA, or other benchmarks appeared in the pretraining data of Qwen2.5-7B-Instruct or DeepSeek distillation models, potentially confounding the controlled contamination setup."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Extracting training data from large language models (Carlini et al., 2021)",
    433       "relevance": "Foundational membership inference work providing Loss, Ref, and Zlib detection methods — three of the ten baselines directly evaluated in this paper"
    434     },
    435     {
    436       "title": "Detecting pretraining data from large language models (Shi et al., 2023)",
    437       "relevance": "Min-K% detection method — one of the primary detection baselines evaluated across all experiments"
    438     },
    439     {
    440       "title": "Min-K%++: Improved baseline for detecting pre-training data from large language models (Zhang et al., 2024)",
    441       "relevance": "Min-K%++ detection method — evaluated as the improved version of Min-K%, showing similar fragility under RL concealment"
    442     },
    443     {
    444       "title": "LLM dataset inference: Did you train on my dataset? (Maini et al., 2024)",
    445       "relevance": "Max-K% detection method and dataset inference framing — directly evaluated and shown to be fragile"
    446     },
    447     {
    448       "title": "An empirical analysis of memorization in fine-tuned autoregressive language models (Mireshghallah et al., 2022)",
    449       "relevance": "LiRA detection method — the highest-performing baseline in Stage I (89.13% AUROC) that still drops significantly after GRPO"
    450     },
    451     {
    452       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models (Dong et al., 2024)",
    453       "relevance": "CDD detection method and key prior work on contamination in non-reasoning models; this paper extends that work to the LRM setting"
    454     },
    455     {
    456       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning (Guo et al., 2025)",
    457       "relevance": "Source of DeepSeek-R1 distillation models used in Stage II experiments and the GRPO algorithm central to Stage I analysis"
    458     },
    459     {
    460       "title": "Does data contamination detection work (well) for LLMs? A survey and evaluation on detection assumptions (Fu et al., 2024)",
    461       "relevance": "Survey of contamination detection methods and their underlying assumptions — directly relevant to the paper's finding that those assumptions fail for LRMs"
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 3,
    467       "justification": "Directly threatens the integrity of all LLM leaderboards and benchmark evaluations used for model selection, procurement, and safety assessments."
    468     },
    469     "surprise_contrarian": {
    470       "score": 3,
    471       "justification": "Shows that standard RL training inadvertently (or deliberately) erases contamination traces, upending the assumption that contamination is detectable if you look hard enough."
    472     },
    473     "fear_safety": {
    474       "score": 2,
    475       "justification": "Raises concrete concerns about systematic overstatement of capabilities in frontier AI evaluations, with direct implications for AI governance and capability assessments."
    476     },
    477     "drama_conflict": {
    478       "score": 2,
    479       "justification": "Implies model developers can game leaderboards with near-zero detection risk, framing the LLM evaluation ecosystem as a fundamentally adversarial arms race developers are currently winning."
    480     },
    481     "demo_ability": {
    482       "score": 1,
    483       "justification": "Code is released but requires SFT and RL fine-tuning of LLMs on 9x NVIDIA L40S GPUs, making live demonstration impractical for most readers."
    484     },
    485     "brand_recognition": {
    486       "score": 1,
    487       "justification": "From UIUC and University of Washington — respected institutions but not top commercial AI labs with immediate leaderboard authority."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "45693591",
    494         "title": "ChunkLLM: A Lightweight Pluggable Framework for Accelerating LLMs Inference",
    495         "points": 96,
    496         "comments": 8,
    497         "url": "https://news.ycombinator.com/item?id=45693591"
    498       },
    499       {
    500         "hn_id": "45235119",
    501         "title": "Instruction-Following Pruning for Large Language Models",
    502         "points": 5,
    503         "comments": 0,
    504         "url": "https://news.ycombinator.com/item?id=45235119"
    505       },
    506       {
    507         "hn_id": "37921371",
    508         "title": "Quantum Computing: Principles and Applications",
    509         "points": 5,
    510         "comments": 0,
    511         "url": "https://news.ycombinator.com/item?id=37921371"
    512       },
    513       {
    514         "hn_id": "45843183",
    515         "title": "Mathematical Exploration and Discovery at Scale",
    516         "points": 4,
    517         "comments": 2,
    518         "url": "https://news.ycombinator.com/item?id=45843183"
    519       },
    520       {
    521         "hn_id": "45862519",
    522         "title": "Mathematical exploration and discovery at scale – Terence Tao et al.",
    523         "points": 4,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=45862519"
    526       },
    527       {
    528         "hn_id": "45721820",
    529         "title": "The Fragility of Benchmark Contamination Detection in Reasoning Models",
    530         "points": 2,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=45721820"
    533       },
    534       {
    535         "hn_id": "45837025",
    536         "title": "Mathematical Exploration and Discovery at Scale",
    537         "points": 2,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=45837025"
    540       },
    541       {
    542         "hn_id": "41801438",
    543         "title": "Comprehensive Survey of Mamba Architectures for Medical Image Analysis,Beyond",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=41801438"
    547       },
    548       {
    549         "hn_id": "45607220",
    550         "title": "Conceptualizing/Modeling Communication-Based Cyberattacks on Automated Vehicles",
    551         "points": 1,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=45607220"
    554       }
    555     ],
    556     "top_points": 96,
    557     "total_points": 121,
    558     "total_comments": 11
    559   }
    560 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs