scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29217B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
      6     "authors": [
      7       "Chunyuan Deng",
      8       "Yilun Zhao",
      9       "Xiangru Tang",
     10       "Mark Gerstein",
     11       "Arman Cohan"
     12     ],
     13     "year": 2023,
     14     "venue": "arXiv",
     15     "arxiv_id": "2311.09783",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims 52%/57% EM rates for ChatGPT/GPT-4 on MMLU are confirmed by Table 3. Claim about TruthfulQA commercial model performance is supported by Table 2.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper uses high TS-Guessing EM rates as evidence of contamination but cannot establish that contamination caused the performance—alternative explanations like statistical priors or reasoning ability are not fully ruled out despite filtering. The controlled contamination experiment validates the probe but not the original inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are largely bounded to the specific benchmarks tested (MMLU, TruthfulQA, etc.) and specific models; the paper uses hedged language like 'may suspect' and 'raises concerns' rather than asserting contamination as fact.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper applies filtering to reduce reasoning-based guessing but does not systematically discuss whether models could achieve high EM rates through statistical priors about common wrong answers rather than memorization; the only alternative considered is direct reasoning, which is partially controlled.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly discusses TS-Guessing as an indicator of potential contamination rather than proof, and Section 5 explicitly notes TS-Guessing is 'less reliable since it relies on inferred knowledge rather than direct retrieval.'",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Limitations' is a dedicated section listing BM25-only indexing, 2-3 minute computation time per data point, superficial nature of text generation scores, and LLM instruction comprehension dependency.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Limitations are reasonably specific: BM25 retrieval may miss semantic overlap, TS-Guessing depends on models following instructions (open-source models often predict correct answer regardless), and the 0.65 Rouge-L threshold was empirically chosen without principled justification.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 5 explicitly compares retrieval vs TS-Guessing trade-offs, stating retrieval is 'generally more reliable' but requires training data access, while TS-Guessing is 'less reliable' and may not work for reasoning benchmarks.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The acknowledgements section mentions colleagues and anonymous reviewers but no funding source is disclosed anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated on the first page: Georgia Institute of Technology, Yale University, and Allen Institute for AI.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, making independence of funder unverifiable; authors are from academic institutions with no apparent commercial stake in the evaluated models.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interests declaration appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Data contamination is operationally defined as benchmark data appearing in pretraining corpora; TS-Guessing is formally defined in Section 3.2 with mathematical formulations for both Question-based and Question-Multichoice settings.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states it contributes two methods: a retrieval-based IR system for open-source models and the TS-Guessing protocol applicable to both open and closed-source models.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 'Related Work' systematically situates the contribution against n-gram matching approaches (GPT-3, PaLM, LLaMA methods), corpus indexing tools (Dodge et al., Elazar et al.), and recent contamination detection methods (Golchin & Surdeanu, Oren et al., Shi et al.).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository or release link is mentioned anywhere in the paper; only the Pyserini toolkit is cited as a dependency.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All benchmark datasets used (MMLU, TruthfulQA, PIQA, etc.) and pretraining corpora (The Pile, C4) are publicly available; the paper does not release custom data.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions using Pyserini but provides no requirements.txt, Dockerfile, or specific version information for any dependencies.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The methodology is described at a conceptual level but no step-by-step instructions sufficient to reproduce the experiments are provided.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1-3 are reported as point estimates with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Comparative claims (e.g., GPT-4 57% vs ChatGPT 52% on MMLU, commercial vs open-source model differences) are made without any statistical significance testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Exact match rates (e.g., 52%, 57%) and Rouge-L F1 scores provide interpretable effect sizes with clear baselines (near-zero for open-source models).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 100-example human evaluation sample is not statistically justified; no power analysis is provided for any experiment in the paper.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviations, variance, or repeated run statistics are reported for any result.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Open-source models (LLaMA 2-13B, Mistral-7B) serve as implicit baselines showing near-zero contamination signals; the controlled contamination experiment (clean vs deliberately contaminated ChatGPT) provides an explicit comparison.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "LLaMA 2 (2023) and Mistral-7B (2023) are contemporary models at time of writing.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The retrieval system is ablated across query types (question-only, label-only, question+label) in Table 4; TS-Guessing is ablated across hint variants (no hint, type-hint, category-hint, URL-hint) in Table 2.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The retrieval system uses BM25, SacreBLEU, Rouge-L, BLEURT, and GPTscore; TS-Guessing reports both EM rate and Rouge-L F1.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "17 NLP volunteers evaluated 100 data points to validate IR system metrics, with inter-annotator agreement measured by Krippendorff's alpha (0.8673).",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "The paper's task is contamination detection methodology validation, not prediction; held-out test set is not applicable.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are reported separately for each benchmark (MMLU, TruthfulQA, HellaSwag, PIQA, etc.) across all tables.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses cases where the retrieval system fails (n-gram matching misses contaminated data found by retrieval), examples that are filtered out (Table 5), and why open-source models fail at TS-Guessing.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "LLaMA 2-7B and 13B show near-zero EM rates on TS-Guessing (0.00-0.04), and the paper reports that stronger models (GPT-4) do not necessarily outperform weaker ones (ChatGPT) on the probe.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are identified as 'ChatGPT (GPT-3.5-turbo)', 'GPT-4', 'Claude-instant-1-100k', 'Claude-2', 'LLaMa 2-13B', 'Mistral-7B' but no API snapshot dates are provided for any model.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figures 2a and 2b show the actual prompt templates for Question-based and Question-Multichoice settings with example questions and masking instructions.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other generation hyperparameters are reported for any model.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; models are prompted directly.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Filtering criteria are documented: removing Yes-No/True-False options, mathematical symbols, options with Rouge-L F1 > 0.65 between pairs, and TruthfulQA-specific filters for short questions and Indexical Error category.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The filtered benchmark subsets, retrieval results, and model outputs are not released; only the public source benchmarks are available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described: publicly available benchmarks are used and the filtering pipeline is documented in Section 4.2.1.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Human annotators are described as '17 volunteers with backgrounds in NLP' compensated at $9/hour, but recruitment method (colleagues, crowdsourcing, etc.) is not specified.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from benchmark selection → query construction → BM25 retrieval → 13-gram tokenization → scoring is documented in Section 3.1; TS-Guessing pipeline from data → filtering → keyword selection → masking → model querying is documented in Section 3.2.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The paper states 'According to OpenAI, their training data is current up to September 2021' and uses this to analyze TruthfulQA contamination risk.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "This is the central topic of the paper; both methods directly measure or infer train-test overlap.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The paper directly tests whether MMLU and other widely-used benchmarks were available before training cutoff and whether models exhibit memorization signatures.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "The Ethics Statement discusses compensation and public data use but mentions no IRB or ethics board approval.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Annotators are described only as '17 volunteers with backgrounds in NLP'; no demographics (age, gender, institution, experience level) are reported.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "NLP background is mentioned implicitly but no formal inclusion/exclusion criteria are stated.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "No randomization of annotation assignments or conditions is described.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding procedures are described; annotators appear to have been aware of the task framing.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No attrition is applicable for a one-shot annotation task.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper mentions API calls to ChatGPT/GPT-4 for TS-Guessing but reports no dollar cost or token counts; latency of 2-3 minutes per data point is noted only for the retrieval system.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Disk space (~2-4TB) is mentioned but no GPU hours, cloud compute budget, or total cost is reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ChatGPT and GPT-4 can guess missing incorrect options in MMLU at 52% and 57% exact match rates respectively",
    375       "evidence": "Table 3 shows MMLU EM rates: ChatGPT=0.52, GPT-4=0.57, compared to near-zero rates for LLaMA 2-13B (0.00) and Mistral-7B (0.01)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Deliberately contaminating ChatGPT with the MMLU test set causes EM rate to approach 100%",
    380       "evidence": "Figure 4 shows fine-tuned ChatGPT achieves near-100% EM rate in both Question-based and Question-Multichoice settings, validating the probe's sensitivity",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Stronger models do not show significantly higher TS-Guessing performance than weaker models",
    385       "evidence": "GPT-4 outperforms ChatGPT by only 5 percentage points on MMLU EM (0.57 vs 0.52); similar patterns hold for Claude-2 vs Claude-instant-1 (Table 2, Table 3)",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "TruthfulQA exhibits significant contamination overlap with pretraining corpora",
    390       "evidence": "Appendix C shows concrete example of TruthfulQA question substantially overlapping with C4 document (BM25 score 50.24); GPTscore shows highest scores for TruthfulQA-C4 overlap (Table 1)",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "GPTscore aligns more closely with human judgment than traditional metrics (SacreBLEU, Rouge-L, BLEURT) for contamination detection",
    395       "evidence": "Figure 3 shows GPTscore with highest Spearman correlation to human evaluation scores across 100 examples; exact correlation values not reported",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Open-source models (LLaMA, Mistral) show minimal contamination signals on MMLU via TS-Guessing",
    400       "evidence": "Table 3 shows LLaMA 2-13B EM=0.00 and Mistral-7B EM=0.01 on MMLU, versus 0.52 and 0.57 for ChatGPT and GPT-4",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "n-gram matching is insufficient for detecting all contaminated data in pretraining corpora",
    405       "evidence": "The retrieval-based system identifies contaminated examples that evaded n-gram tokenization detection, and Appendix C shows a high-overlap TruthfulQA example",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "empirical"
    412   ],
    413   "key_findings": "The paper proposes two contamination detection methods: a BM25-based retrieval system for open training corpora and a novel TS-Guessing protocol for black-box models. The main finding is that ChatGPT and GPT-4 can guess missing incorrect options in MMLU at 52% and 57% exact match rates—far above open-source model baselines (near 0%)—suggesting MMLU may be contaminated in commercial model training data. A controlled experiment validates the probe: deliberately fine-tuning ChatGPT on the MMLU test set pushes EM rates to ~100%, confirming the method's sensitivity. TruthfulQA shows substantial lexical overlap with the C4 pretraining corpus, and GPTscore is identified as a better proxy for human-judged contamination than traditional NLP metrics.",
    414   "red_flags": [
    415     {
    416       "flag": "Internal inconsistency in reported EM rates",
    417       "detail": "The abstract states ChatGPT achieves 52% and GPT-4 57% EM on MMLU; Table 3 confirms this. However, Section 4.2.2 body text states 'ChatGPT demonstrated...achieving a 57% Exact Match (EM) rate'—attributing GPT-4's score to ChatGPT."
    418     },
    419     {
    420       "flag": "Correlation conflated with contamination",
    421       "detail": "High TS-Guessing scores are interpreted as evidence of contamination but alternative explanations (e.g., language models having statistical priors about common wrong answers in well-known datasets) are not rigorously ruled out beyond simple filtering."
    422     },
    423     {
    424       "flag": "No code or filtered data released",
    425       "detail": "Neither the implementation of the retrieval system nor the filtered benchmark subsets are released, making it impossible to replicate exact results."
    426     },
    427     {
    428       "flag": "No statistical significance tests",
    429       "detail": "All comparisons between models (ChatGPT 52% vs GPT-4 57%, commercial vs open-source) are made without significance tests despite the paper making strong comparative claims."
    430     },
    431     {
    432       "flag": "Empirically chosen threshold without validation",
    433       "detail": "The Rouge-L 0.65 threshold for filtering correlated options is chosen 'based on initial experiments' (footnote 1) without cross-validation or sensitivity analysis."
    434     },
    435     {
    436       "flag": "Underpowered human evaluation",
    437       "detail": "The correlation between automatic metrics and human judgment is assessed on only 100 data points (rated by 17 annotators), and only 23 of those data points were judged contaminated—too few for reliable metric ranking."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models",
    443       "relevance": "Complementary contamination detection method at dataset-level granularity; directly compared and contrasted with this paper's approach"
    444     },
    445     {
    446       "title": "Proving Test Set Contamination in Black Box Language Models",
    447       "relevance": "Prior black-box contamination detection using canonical ordering; limitation noted as dataset-level only"
    448     },
    449     {
    450       "title": "Detecting Pretraining Data from Large Language Models",
    451       "relevance": "Min-k% probability method for contamination detection; requires model internals access, contrasting with TS-Guessing"
    452     },
    453     {
    454       "title": "What's In My Big Data?",
    455       "relevance": "Analysis of contamination in GLUE/SuperGLUE benchmarks in open training corpora; foundation for this paper's retrieval approach"
    456     },
    457     {
    458       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    459       "relevance": "Motivating work documenting the severity of the contamination problem for NLP evaluation"
    460     },
    461     {
    462       "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks",
    463       "relevance": "Mitigation strategies for the contamination problem this paper diagnoses"
    464     },
    465     {
    466       "title": "Measuring Massive Multitask Language Understanding",
    467       "relevance": "MMLU benchmark—primary dataset used to demonstrate high contamination signals in commercial models"
    468     },
    469     {
    470       "title": "Data Contamination Through the Lens of Time",
    471       "relevance": "Temporal approach to contamination detection using pre/post-training data release dates"
    472     },
    473     {
    474       "title": "Data contamination: From memorization to exploitation",
    475       "relevance": "Studies correlation between pretraining memorization and downstream task performance, directly relevant to this paper's thesis"
    476     },
    477     {
    478       "title": "Investigating data contamination for pre-training language models",
    479       "relevance": "Concurrent work investigating contamination-performance correlation, cited as related approach"
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 3,
    485       "justification": "Directly challenges validity of widely-used benchmark scores; any practitioner using MMLU results to compare models should know about potential contamination."
    486     },
    487     "surprise_contrarian": {
    488       "score": 3,
    489       "justification": "52-57% EM rate for guessing wrong options in MMLU—a task with many possible answers—is genuinely surprising and undermines trust in published LLM benchmarks."
    490     },
    491     "fear_safety": {
    492       "score": 1,
    493       "justification": "Raises concerns about misleading capability assessments but does not address direct AI safety risks."
    494     },
    495     "drama_conflict": {
    496       "score": 2,
    497       "justification": "Implicates OpenAI's flagship models (ChatGPT, GPT-4) in potential training data leakage from widely-used benchmarks, with contrast against transparent open-source alternatives."
    498     },
    499     "demo_ability": {
    500       "score": 2,
    501       "justification": "TS-Guessing can be replicated by anyone with ChatGPT/GPT-4 API access using the prompt templates shown in Figure 2."
    502     },
    503     "brand_recognition": {
    504       "score": 3,
    505       "justification": "Evaluates ChatGPT, GPT-4 (OpenAI), Claude-2/Claude-instant (Anthropic), LLaMA 2 (Meta), Mistral—all high-profile models with significant public recognition."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [
    510       {
    511         "hn_id": "40229022",
    512         "title": "When can transformers reason with abstract symbols?",
    513         "points": 3,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=40229022",
    516         "created_at": "2024-05-01T20:34:43Z"
    517       },
    518       {
    519         "hn_id": "29493664",
    520         "title": "Training Neural Networks with Fixed Sparse Masks",
    521         "points": 2,
    522         "comments": 1,
    523         "url": "https://news.ycombinator.com/item?id=29493664",
    524         "created_at": "2021-12-09T03:58:40Z"
    525       },
    526       {
    527         "hn_id": "42299972",
    528         "title": "Modeling AdaGrad, RMSProp, and Adam with Integro-Differential Equations",
    529         "points": 2,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=42299972",
    532         "created_at": "2024-12-02T20:17:28Z"
    533       },
    534       {
    535         "hn_id": "42171440",
    536         "title": "Modeling AdaGrad, RMSProp, and Adam with Integro-Differential Equations",
    537         "points": 2,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=42171440",
    540         "created_at": "2024-11-18T11:19:17Z"
    541       },
    542       {
    543         "hn_id": "39313991",
    544         "title": "Information content of note transitions in the music of J. S. Bach",
    545         "points": 2,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=39313991",
    548         "created_at": "2024-02-09T12:09:43Z"
    549       },
    550       {
    551         "hn_id": "33625737",
    552         "title": "Optimal sizing of seasonal renewable energy storage considering degradation",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=33625737",
    556         "created_at": "2022-11-16T16:25:26Z"
    557       },
    558       {
    559         "hn_id": "38576071",
    560         "title": "Large Language Models on Graphs: A Comprehensive Survey",
    561         "points": 1,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=38576071",
    564         "created_at": "2023-12-08T23:15:13Z"
    565       },
    566       {
    567         "hn_id": "37771757",
    568         "title": "Can LLMs provide useful feedback on research papers? A broad empirical analysis",
    569         "points": 1,
    570         "comments": 0,
    571         "url": "https://news.ycombinator.com/item?id=37771757",
    572         "created_at": "2023-10-04T21:03:08Z"
    573       },
    574       {
    575         "hn_id": "35284995",
    576         "title": "Self Supervision Does Not Help Natural Language Supervision at Scale",
    577         "points": 1,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=35284995",
    580         "created_at": "2023-03-24T04:12:16Z"
    581       },
    582       {
    583         "hn_id": "29320363",
    584         "title": "ClipClap: Image Captioning with Clip Encoder and GPT2 [pdf]",
    585         "points": 1,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=29320363",
    588         "created_at": "2021-11-23T17:15:20Z"
    589       }
    590     ],
    591     "top_points": 3,
    592     "total_points": 17,
    593     "total_comments": 1
    594   }
    595 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs