scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31083B)
      1 {
      2   "paper": {
      3     "title": "Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle",
      4     "authors": ["Hui Dai", "Ryan Teehan", "Mengye Ren"],
      5     "year": 2025,
      6     "venue": "International Conference on Machine Learning",
      7     "arxiv_id": "2411.08324",
      8     "doi": "10.48550/arXiv.2411.08324"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "LLM forecasting accuracy degrades significantly over time, with average declines of 21.55% on True/False and 11.33% on Multiple Choice questions between Jan 2020 and Dec 2024. Degradation accelerates after models' knowledge cutoff dates. RAG partially mitigates this but the declining trend persists, and even gold article access (reading comprehension) shows performance decline, suggesting outdated internal representations compound the missing-knowledge problem.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract states 'Code and data are available at https://agenticlearning.ai/daily-oracle.' A URL is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Same URL provides dataset access. The paper describes 31,510 QA pairs and states code and data are available."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or specific library versions in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions found in the paper. Prompts are provided in the appendix but no instructions for running the full evaluation pipeline."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Table 3 and all figures report point estimates only (accuracy percentages and YoY changes). No confidence intervals, error bars, or uncertainty measures on any results."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims like 'performance degradation' and model comparisons are made by comparing raw accuracy numbers. No statistical significance tests (p-values, t-tests, etc.) are used anywhere in the paper."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage changes with baseline context: 'performance declines by 21.55% on TF questions (from 64.68% to 50.74%) and by 11.33% on MC questions (from 58.30% to 51.69%)' (Section 4.2). Table 3 provides per-model YoY changes with absolute yearly accuracies."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The dataset has 31,510 QA pairs (16,783 TF + 14,727 MC) and the human evaluation uses 60 pairs, but no justification for why these sizes are sufficient and no power analysis is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results appear to be single-run evaluations (each model answers each question once). No variance, standard deviation, or spread measures across runs are reported. 5-month moving averages are used for smoothing but this is not variance reporting."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Eight LLMs are compared against each other across three evaluation settings (closed-book, constrained open-book, gold article). The closed-book setting serves as baseline for the RAG settings."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Models include Claude-3.5-Sonnet, GPT-4, Llama-3-8B, Qwen-2-7B, Gemma-2-2B — all recent models at time of writing. Also includes older models (GPT-3.5, Mistral-7B) for temporal contrast."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Three evaluation settings (closed-book, constrained open-book with varying RAG cutoffs, gold article) systematically ablate the effect of external knowledge access. Multiple RAG cutoff dates isolate the effect of information recency."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Results reported separately for TF accuracy and MC accuracy, plus YoY accuracy change (pre-cutoff and post-cutoff), refusal rates, and temporal regression slopes (Figure 6)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The human evaluation (Section 3.3) validates the quality of generated QA pairs, not the LLM forecasting outputs. No human evaluation of the models' forecasting performance is performed — evaluation is entirely automated accuracy scoring."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Models are pre-trained and not fine-tuned on any part of the Daily Oracle dataset. The entire benchmark functions as a held-out test set. Temporal structure provides natural separation between training data and evaluation data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by question type (TF vs MC), per model, per year (Table 3), pre-cutoff vs post-cutoff, and by evaluation setting. Category distribution shown in Figure 1b and Figure 8. Appendix provides per-model constrained open-book results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Refusal behavior is analyzed in detail (Section 4.3, Appendix B.2, Figures 9-11). Figure 20 shows a concrete failure case in which Mixtral-8x7B gives a wrong answer in the closed-book setting but a correct one in the open-book setting. Performance dropping below the random baseline is discussed for Mistral-7B and Mixtral-8x7B."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "RAG sometimes hurts performance: 'Llama-3-8B may perform worse than the closed-book setting when the RAG cutoff is prior to the knowledge cutoff dates' (Section 4.2). Claude-3.5-Sonnet open-book 'lags behind its closed-book performance' (Appendix B.4). Gold article setting still shows declining trends despite providing the answer."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of performance degradation (21.55% TF, 11.33% MC) are directly supported by Table 3 and Section 4.2. Claims that RAG helps but degradation persists are supported by Figure 4 and Section 4.2. All abstract claims have corresponding results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper claims outdated pre-training data causes performance degradation. The study design uses temporal structure as a natural quasi-experiment, with three evaluation settings (closed-book, open-book, gold article) to disentangle missing knowledge from outdated representations. The pre/post-cutoff comparison and gold article finding provide reasonable causal support."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title asks 'Are LLMs Prescient?' and abstract claims about 'LLMs' temporal generalization and forecasting abilities' broadly, but the benchmark covers only English news from 5 specific sources (CBS, CNBC, CNN, Forbes, NPR) with LLM-generated questions. Results may not generalize to other languages, domains, or question formats. These scope boundaries are not explicitly stated."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 4.3 discusses multiple alternative explanations: missing knowledge vs outdated representations, overrepresentation of pre-September 2021 data in training corpora, websites restricting web crawlers after ChatGPT. The Limitations section adds question generation bias and event selection bias."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures accuracy on LLM-generated news QA pairs and frames this as 'forecasting ability' and 'temporal generalization.' It does not discuss whether answering auto-generated True/False and Multiple Choice questions about news events is a valid proxy for actual forecasting ability, which involves reasoning under uncertainty, probability calibration, and handling of novel scenarios."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 5 (Appendix B.1) lists exact model versions: claude-3-5-sonnet-20240620, gpt-4-1106-preview, gpt-3.5-turbo-0125, Mixtral-8x7B-Instruct-v0.1, Mistral-7B-Instruct-v0.3, Meta-Llama-3-8B-Instruct, Qwen2-7B-Instruct, gemma-2-2b-it."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt texts for all steps are provided in Appendix D (Figures 23-37): article summary, QA generation, misleading choices generation, QA filtering, and all evaluation prompts for closed-book, open-book, and gold article settings for both TF and MC."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No inference hyperparameters (temperature, top-p, max tokens) are reported for any of the evaluated models. RAG uses BM25 with top-5 retrieval and 512-word truncation, but LLM sampling parameters are not stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models are evaluated via direct prompting without tool use, retry logic, or multi-step reasoning frameworks."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Article selection (3 random + 3 hot topic via DBSCAN clustering) and the subsequent 4-step QA construction pipeline are documented in detail (Section 3.1, Figure 7): article summarization, QA generation with few-shot prompting, misleading choice generation, and QA filtering with 7 principles and a 13-point threshold. Article corpus details are given (1,246,973 articles from Jan 2019 to Dec 2024)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' subsection at the end of Section 4.3 discusses data generation biases and evaluation limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: generated questions may contain biases from an outdated LLM generator, questions only cover events that definitively occurred (not non-events that never made the news), limited time horizon after each model's cutoff date prevents thorough post-cutoff analysis, and the RAG/knowledge cutoff interaction is weak within available data."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While the Limitations section mentions specific issues, it does not explicitly state what the results do NOT show. The paper does not explicitly bound claims to English news from 5 sources, does not state that results may not generalize to other forecasting domains, languages, or question formats."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states code and data are available at the provided URL, and the dataset of 31,510 QA pairs is described as released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection described in detail: Common Crawl News Dataset + Newspaper3k scraping, filtered to 5 mainstream sources (CBS, CNBC, CNN, Forbes, NPR), 1,246,973 English articles from Jan 2019 to Dec 2024. Article selection strategy and QA generation pipeline documented."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Four human annotators are used for quality evaluation (Section 3.3) but no information is provided about who they are, how they were recruited, or their qualifications. This could introduce bias in the quality assessment."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full pipeline documented: news corpus collection → article selection (random + hot topic) → 4-step QA construction (summarization → generation → misleading choices → filtering with 7 criteria and 13-point threshold). The dataset size at each stage is not fully specified (e.g., how many articles were filtered out at each step), but the pipeline stages and criteria are clearly described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgment section lists: IITP grants from Republic of Korea (No. RS-2024-00469482 & RS-2024-00509279), Microsoft Accelerating Foundation Models Research program for Azure credits, and NYU HPC."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors listed as affiliated with New York University. They evaluate third-party models (GPT, Claude, Llama, etc.) — no direct affiliation with evaluated model companies."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Microsoft provided Azure cloud compute credits for LLM APIs, and the paper evaluates GPT models (Microsoft/OpenAI products). Microsoft has a financial interest in GPT model performance. This potential conflict is not acknowledged."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement found in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Table 3 lists knowledge cutoff dates for models: Claude-3.5-Sonnet (Apr 2024), GPT-4 (Apr 2023), GPT-3.5 (Sept 2021), Llama-3-8B (Mar 2023), Gemma-2-2B (Jul 2024). Models with unknown cutoffs are marked as 'Unknown.'"
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "This is central to the paper. The entire experimental design is built around studying how performance changes before and after training cutoffs. Section 4.3 discusses pre-September 2021 data overrepresentation in training corpora and its effects."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The benchmark is designed to mitigate contamination by construction: questions are generated continuously from daily news, so newly added questions postdate models' training cutoffs. Section 4.3 discusses that earlier questions may be contaminated by training data and analyzes the performance cliff around September 2021. The pre/post-cutoff analysis explicitly addresses this."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects study. The 4 human annotators validate dataset quality but are not experimental subjects."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study. The paper evaluates LLMs on a benchmark, not human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, API costs, or per-example costs reported. The paper mentions switching to GPT-4o-mini/GPT-4o for cost-effectiveness and 'budget constraints' preventing some experiments, but actual costs are not quantified."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget stated. Acknowledgments mention NYU HPC and Microsoft Azure credits, but no GPU hours, total API spend, or hardware specifications are provided."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or repeated runs. Results appear to be single evaluations per model per question. No seed sensitivity analysis is performed."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated. It appears each model answered each question once, but this is not explicitly confirmed."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "RAG parameters (top-5 retrieval, 512-word truncation, BM25) and QA filtering threshold (13 points) are stated but no justification for these choices or search budget is reported. LLM inference hyperparameters are not mentioned at all."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Fixed configurations are used without justification: BM25 with top-5, 512-word truncation, 13-point QA filter threshold. No explanation of how these specific values were selected."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The paper evaluates existing third-party models on a benchmark rather than proposing a new model to compare against baselines. No self-comparison bias in the traditional sense."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Models of vastly different sizes are compared (Gemma-2-2B vs GPT-4, Claude-3.5-Sonnet) without discussing compute budget differences. No performance-vs-compute analysis is provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper engages with construct validity through multiple evaluation settings: closed-book tests forecasting, gold article tests reading comprehension, and open-book tests RAG-augmented forecasting. Section 4.3 disentangles missing knowledge from outdated representations. Appendix C compares with forecasting market datasets to argue their approach better captures temporal trends."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding involved. Models are evaluated via direct prompting without any agentic scaffold."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Central to the paper's design. Knowledge cutoff dates are used to separate pre-training data from evaluation data. RAG cutoff dates prevent information leakage from future articles. The entire paper studies how temporal information boundaries affect performance."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "QA filtering principle 3 (absence of information leakage) explicitly checks that questions don't reveal post-publication information. The RAG cutoff formally limits accessible articles to before min(d_res − 1, d_R-Cutoff). The construction ensures questions don't contain answer-leaking temporal information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper does not discuss whether QA pairs from the same news article or topic cluster are correlated. Figure 2 shows topic frequency is temporally autocorrelated, suggesting questions across days are not independent. This non-independence could affect the reliability of temporal trend estimates."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Temporal splits based on knowledge cutoff dates are used as the primary leakage prevention method. The QA filtering pipeline includes an automated LLM-based check for information leakage (principle 3), and the RAG cutoff mechanism formally prevents future information access."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "LLM performance on forecasting questions degrades over time, with average declines of 21.55% on TF and 11.33% on MC questions between January 2020 and December 2024.",
    365       "evidence": "Table 3 and Figure 3 show yearly accuracy for 8 models. Average accuracy drops from 64.68% to 50.74% (TF) and 58.30% to 51.69% (MC) across all models (Section 4.2).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Performance degradation accelerates after models' knowledge cutoff dates, with post-cutoff YoY declines steeper than pre-cutoff declines.",
    370       "evidence": "Table 3 shows a GPT-4 MC post-cutoff YoY change of -18.54% vs -4.23% pre-cutoff. Claude-3.5-Sonnet's post-cutoff YoY changes are -12.41% (TF) and -11.78% (MC) vs -4.77% and -6.26% pre-cutoff. Figure 6 shows the regression slope becoming more negative after cutoffs.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "RAG improves prediction accuracy but the performance degradation pattern persists across all RAG cutoff settings.",
    375       "evidence": "Figure 4 shows Mixtral-8x7B TF accuracy improves with later RAG cutoffs, but declining trends persist in all curves. Llama-3-8B MC shows similar patterns. Section 4.2 notes RAG can even hurt performance when cutoff is before knowledge cutoff.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Even with gold article access (reading comprehension), LLM performance declines over time, suggesting outdated internal representations beyond just missing knowledge.",
    380       "evidence": "Figure 5 shows most models approach ~90% accuracy with gold articles but still exhibit declining trends. Section 4.3 discusses this as evidence that 'the models' internal representations are outdated.'",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Daily Oracle is the largest and most up-to-date continuously updated forecasting benchmark.",
    385       "evidence": "Table 1 compares with 10 prior benchmarks. Daily Oracle has 31,510 questions with daily updates, larger and more frequently updated than ForecastBench (1,000, biweekly), FreshBench (2,769), and ForecastQA (10,382, static).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The LLM-based QA filtering achieves 85% accuracy compared to human consensus on final QA pair acceptance.",
    390       "evidence": "Section 3.3 and Table 4: 4 human annotators evaluate 60 QA pairs. Average accuracy across 7 principles is 89.52%, and final acceptance accuracy is 85.00%. Fleiss' Kappa of 0.26 indicates fair agreement among annotators.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No statistical significance testing",
    397       "detail": "All performance comparisons and degradation claims are based on comparing raw accuracy numbers without any statistical tests. With thousands of questions per time period, significance tests would be straightforward and would strengthen the claims."
    398     },
    399     {
    400       "flag": "Single-run evaluation with no variance estimation",
    401       "detail": "Each model appears to answer each question once. No multiple runs, no seed sensitivity, no bootstrap confidence intervals. The smoothing via 5-month moving averages is not a substitute for proper uncertainty quantification."
    402     },
    403     {
    404       "flag": "Weak human evaluation agreement",
    405       "detail": "The human annotator agreement is only 'fair' (Fleiss' Kappa = 0.26). Some principles have near-zero agreement (Non-answerability: 0.02). This raises questions about the reliability of the QA filtering criteria and the benchmark quality validation."
    406     },
    407     {
    408       "flag": "LLM-generated benchmark evaluated on LLMs",
    409       "detail": "GPT-3.5/GPT-4 generate the QA pairs, and GPT-3.5/GPT-4 are among the evaluated models. This creates potential circular biases — questions may be easier or harder for the generating model family. This is not discussed."
    410     },
    411     {
    412       "flag": "Missing inference hyperparameters",
    413       "detail": "Temperature, top-p, and other sampling parameters are not reported for any evaluated model. These significantly affect output and could explain some performance variation."
    414     },
    415     {
    416       "flag": "Undisclosed Microsoft conflict",
    417       "detail": "Microsoft provided Azure compute credits and the paper evaluates GPT models (Microsoft/OpenAI). This potential conflict is not acknowledged, though results don't appear to favor GPT models."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    423       "authors": ["O. Sainz", "J. Campos", "I. García-Ferrero", "J. Etxaniz", "O. L. de Lacalle", "E. Agirre"],
    424       "year": 2023,
    425       "relevance": "Directly relevant to benchmark contamination — argues every LLM benchmark must assess data contamination, which is a core concern this paper addresses."
    426     },
    427     {
    428       "title": "Benchmark data contamination of large language models: A survey",
    429       "authors": ["C. Xu", "S. Guan", "D. Greene", "M.-T. Kechadi"],
    430       "year": 2024,
    431       "arxiv_id": "2406.04244",
    432       "relevance": "Comprehensive survey of benchmark contamination in LLMs, directly relevant to the contamination and temporal evaluation themes."
    433     },
    434     {
    435       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    436       "authors": ["P. Lewis", "E. Perez", "A. Piktus", "F. Petroni"],
    437       "year": 2020,
    438       "relevance": "Foundational RAG paper — the constrained open-book evaluation setting in this paper directly builds on RAG methodology."
    439     },
    440     {
    441       "title": "Approaching human-level forecasting with language models",
    442       "authors": ["D. Halawi", "F. Zhang", "C. Yueh-Han", "J. Steinhardt"],
    443       "year": 2024,
    444       "relevance": "Evaluates LLM forecasting capabilities using forecasting market questions — directly comparable to Daily Oracle's approach."
    445     },
    446     {
    447       "title": "Inadequacies of large language model benchmarks in the era of generative artificial intelligence",
    448       "authors": ["T. R. McIntosh", "T. Susnjak", "N. Arachchilage"],
    449       "year": 2025,
    450       "relevance": "Critiques LLM benchmarks for lacking temporal dimensions and becoming outdated — the same problems Daily Oracle aims to address."
    451     },
    452     {
    453       "title": "Task contamination: Language models may not be few-shot anymore",
    454       "authors": ["C. Li", "J. Flanigan"],
    455       "year": 2024,
    456       "relevance": "Demonstrates task contamination in few-shot LLM evaluation, supporting the need for dynamic benchmarks."
    457     },
    458     {
    459       "title": "ForecastBench: A dynamic benchmark of AI forecasting capabilities",
    460       "authors": ["E. Karger", "H. Bastani", "C. Yueh-Han", "Z. Jacobs", "D. Halawi", "F. Zhang", "P. E. Tetlock"],
    461       "year": 2025,
    462       "relevance": "Dynamic forecasting benchmark that Daily Oracle directly compares against — biweekly updates with 1,000 questions from forecasting markets."
    463     },
    464     {
    465       "title": "Mind the gap: Assessing temporal generalization in neural language models",
    466       "authors": ["A. Lazaridou", "A. Kuncoro", "E. Gribovskaya"],
    467       "year": 2021,
    468       "relevance": "Defines temporal generalization for language models and demonstrates performance deterioration over time — foundational work for this paper's research question."
    469     },
    470     {
    471       "title": "Is your LLM outdated? A deep look at temporal generalization",
    472       "authors": ["C. Zhu", "N. Chen", "Y. Gao", "Y. Zhang", "P. Tiwari", "B. Wang"],
    473       "year": 2025,
    474       "relevance": "Introduces FreshBench and studies LLM temporal generalization — closest comparable work using forecasting market questions."
    475     },
    476     {
    477       "title": "Consent in crisis: The rapid decline of the AI data commons",
    478       "authors": ["S. Longpre", "R. Mahari", "A. Lee"],
    479       "year": 2024,
    480       "relevance": "Documents increasing restrictions on web crawlers post-ChatGPT, which the paper cites as a potential explanation for training data imbalance across time periods."
    481     },
    482     {
    483       "title": "The Llama 3 herd of models",
    484       "authors": ["A. Dubey", "A. Jauhri", "A. Pandey"],
    485       "year": 2024,
    486       "arxiv_id": "2407.21783",
    487       "relevance": "Llama-3-8B is one of the eight evaluated models in the benchmark experiments."
    488     },
    489     {
    490       "title": "GPT-4 technical report",
    491       "authors": ["OpenAI"],
    492       "year": 2023,
    493       "arxiv_id": "2303.08774",
    494       "relevance": "GPT-4 is one of the evaluated models and used for QA generation in the benchmark construction pipeline."
    495     }
    496   ]
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs