scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20319B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Episodic Memories Generation and Evaluation Benchmark for Large Language Models",
      6     "authors": [
      7       "Alexis Huet",
      8       "Zied Ben-Houidi",
      9       "Dario Rossi"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Learning Representations",
     13     "arxiv_id": "2501.13121",
     14     "doi": "10.48550/arXiv.2501.13121"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Key abstract claims — that LLMs struggle with episodic tasks especially for multiple related events, and that the benchmark is contamination-free — are backed by Table 3 (F1 ≤ 0.60 for 2+ events across all models) and the synthetic generation design. The '10k-100k token' framing is slightly overstated for single-event tasks on the short book but holds for multi-event queries.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal-ish claims (RAG outperforms in-context for most models; naive fine-tuning fails on multi-event generalization) are tested with Wilcoxon signed-rank tests and ablation comparisons across three memory strategies, which is adequate for the controlled benchmark setting.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper draws broad conclusions about LLM 'episodic memory' capabilities from a synthetic fictional benchmark with explicit temporal markers and controlled ground truth; the gap between benchmark performance and genuine episodic memory in real-world settings is not carefully bounded in the conclusions.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Poor model performance could reflect sensitivity to prompt framing, distributional quirks of synthetic text, or retrieval granularity rather than a fundamental episodic memory deficit; the paper does not systematically consider these alternatives, though it does briefly note RAG granularity as a factor.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper uses F1 on templated Q&A over synthetic text as a proxy for 'episodic memory capability' and draws the cognitive science parallel extensively, but does not explicitly discuss the measurement gap between benchmark F1 and the broader cognitive construct being claimed.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 'Summary and Limitations' contains a dedicated multi-paragraph discussion with four named limitations: temporal representation, event independence, limited domain scope, and training limitations.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Limitations are specific: 'relies on explicit temporal markers, which may not fully capture nuanced ways time is expressed'; 'independent generation of chapters does not capture the interconnected and causal nature of real-world events'; 'primarily involves human-like protagonists within fictional contexts'.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states the benchmark does not cover implicit temporal references, causal event chains, or non-NYC/non-fictional domains, and Section 2 lists what existing benchmarks lack as a mirror of what this benchmark also does not claim to measure.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment section appears in the paper; only institutional affiliation (Huawei Technologies) is listed, without any explicit statement of whether or how the work was funded.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are clearly identified as employees of Huawei Technologies Co., Ltd., Paris, France on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper evaluates GPT-4o, Claude, Llama, and o1-mini — none of which are Huawei products — so the employer/funder has no direct commercial stake in the performance outcomes reported.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or patent/equity declaration is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Episodic memory, entities, episodic events, cue-based recall, and the world model are all formally defined in Section 3 with notation (e.g., event_i = (t_i, s_i, ent_i, c_i)) and grounded in Tulving's cognitive science framework.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly lists three contributions in the introduction: (1) modeling framework for episodic memory, (2) benchmark code and 11 datasets, (3) baseline evaluation of state-of-the-art LLMs under three memory strategies.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 extensively engages with needle-in-a-haystack (Kamradt 2023), bAbI (Weston 2015), bAbILong (Kuratov 2024), Michelangelo/LSQ (Vodrahalli 2024), and temporal QA benchmarks, explaining specifically how each falls short and how this work differs.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Section 3 and Appendix A systematically argue why the benchmark measures episodic memory specifically (not just retrieval): tasks require temporal/spatial context tracking, entity state monitoring, and cue-based recall, all grounded in Tulving's encoding specificity principle and human memory test design.",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Difficulty is explicitly operationalized by number of matching events (0, 1, 2, 3-5, 6+), controlled via truncated geometric sampling distribution, and verified in Table 6/9; questions are balanced across bins as shown in Table 12.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Results in Table 3 show near-ceiling performance for 0-event confabulation detection by some models (o1-mini: 0.97) and floor-like behavior for multi-event retrieval (≤0.60 for all models at 2+ events), and this is discussed explicitly in Section 5.2.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No human baseline is provided; the paper explicitly defers this to future work: 'A comparison with human performance would be interesting for future work' (Appendix E.4).",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "The paper describes a lenient F1 computation (Appendix B.3.2) with a justified rationale for the leniency rule (#pred = min(#iditems, #gt)), uses Kendall's τ for chronological ordering, and provides examples of partial match scoring (Appendix B.4) to demonstrate validity.",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Contamination resistance is a central design goal: the benchmark generates synthetic fictional narratives with controlled ground truth, explicitly stated as 'free from contamination' and distinguished from benchmarks using Freebase/Wikidata or real books.",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper does not discuss what happens when models improve enough to solve these tasks, whether the benchmark will be regenerated with new parameters, or any update/refresh mechanism — scalability is mentioned but not maintenance over time.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Section 6 and Section 2 discuss specific failure modes: shortcut exploitation, synthetic/artificial nature opening the door to pattern exploitation, event independence limiting ecological validity, and explicit temporal markers being unrepresentative of natural language.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Code and all 11 datasets are released at the cited GitHub repository (Huet et al. 2025); prompts, generation scripts, evaluation code, and all hyperparameters (RAG K, fine-tuning epochs/batch size/LR) are documented in the appendices.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The paper provides exhaustive documentation across Appendix B: universe construction (B.1.1-B.1.2), event generation distribution (B.1.3), meta-data generation (B.1.4), chapter generation prompts (B.1.5), verification procedures (B.1.6-B.1.7), secondary entities (B.1.8), assembly (B.1.9), and generation statistics (B.1.12).",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The paper states 'open source code and datasets' and provides a GitHub URL, but no specific license (MIT, Apache, CC-BY, etc.) is mentioned in the paper text.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The intended use (evaluating LLM episodic memory under in-context, RAG, and fine-tuning strategies) is clearly stated; limitations on generalization to real-world episodic tasks and implicit temporal language are specified in Section 6.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "All tested state-of-the-art LLMs show consistent F1 decline as the number of matching events increases, with performance dropping to ≤0.60 for 2+ events on the long book.",
    201       "evidence": "Table 3 shows F1 scores across all models (GPT-4o, Claude, Llama, o1-mini) for 0–6+ event bins; every model degrades substantially from 1-event to 2+ event queries.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "RAG generally outperforms in-context memory for most models, except GPT-4o, which performs comparably or worse with RAG.",
    206       "evidence": "Figure 3 Critical Difference plot shows RAG variants cluster above in-context for Claude/GPT-mini/Llama, but GPT-4o in-context achieves the best rank overall; the exception is explicitly noted in Section 5.2.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Naive fine-tuning catastrophically fails for multi-event generalization, overfitting to single-event facts without learning relational episodic structure.",
    211       "evidence": "Table 3 shows fine-tuned GPT-4o-mini achieves F1=0.83 for 1-event questions but drops to F1=0.37/0.28/0.19 for 2/3-5/6+ events; Table 4 shows 0% exact match on all events for fine-tuning.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Performance degrades systematically by cue type: content cues are easiest, then entity, space, and time cues are hardest.",
    216       "evidence": "Figure 4 shows a clear top-to-bottom gradient across all models for cue types (c > ent > s > t), with time-based cues consistently yielding the lowest F1 scores.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Models evaluated on Claude-authored books show different performance patterns than on GPT-4o-authored books, with GPT models showing statistical dominance on GPT books.",
    221       "evidence": "Table 21 Mann-Whitney U tests show GPT models outperform Claude models with p<0.01 on GPT book but not on Claude book (p=0.11 for GPT-4o vs Claude-3.5-sonnet).",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "Even with very limited context (10k tokens), all models show suboptimal performance on multi-event episodic tasks.",
    226       "evidence": "Table 13 shows short-book F1 for 2-event questions ranges from 0.59–0.97; o1-mini and GPT-4o perform well, but Claude-3-Haiku and GPT-4o-mini show clear degradation — the claim holds selectively, not universally.",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval",
    232     "observational"
    233   ],
    234   "key_findings": "The paper introduces a synthetic episodic memory benchmark grounded in cognitive science, where events are characterized by (time, space, entity, content) tuples and questions vary in cue specificity and number of matching events. All tested LLMs (GPT-4o, Claude 3.5 Sonnet, Llama 3.1 405B, o1-mini) show consistent F1 degradation as the number of cue-matching events increases, from ~0.80–0.96 for single-event queries to ≤0.60 for two or more events, even at 100k tokens. Performance also degrades systematically by cue type (content > entity > space > time), and naive fine-tuning severely overfits to single-event recall while collapsing on multi-event generalization. No tested model or strategy comes close to solving the benchmark, suggesting fundamental gaps in LLM temporal and spatial event tracking.",
    235   "red_flags": [
    236     {
    237       "flag": "No human baseline",
    238       "detail": "The paper provides no human performance data on the benchmark tasks, making it impossible to calibrate how difficult the tasks are relative to human episodic memory performance or whether the benchmark successfully captures human-challenging aspects."
    239     },
    240     {
    241       "flag": "Circular evaluation: Claude-generated benchmark evaluated with Claude",
    242       "detail": "The default benchmark books are generated using Claude 3.5 Sonnet, and Claude models are among the primary evaluated models. The ablation (Appendix E.5) shows Claude models perform better on Claude-generated books than on GPT-generated books, suggesting potential evaluation bias."
    243     },
    244     {
    245       "flag": "LLM-as-a-judge for evaluation scoring",
    246       "detail": "The F1 scoring relies on an LLM to extract items and assign matching scores; the lenient F1 rule (#pred = min(#items, #gt)) is somewhat arbitrary and could mask systematic over-generation errors."
    247     },
    248     {
    249       "flag": "Synthetic benchmark / real capability gap unaddressed",
    250       "detail": "The paper claims to measure 'episodic memory' but the benchmark uses explicitly marked synthetic dates, names, and locations in fictional narratives — the relationship between performance on these controlled tasks and genuine episodic memory capability is asserted via cognitive science analogy but not empirically validated."
    251     },
    252     {
    253       "flag": "No license specified",
    254       "detail": "Despite claiming open-source release, the paper does not specify a license for the code or datasets, creating ambiguity about permissible reuse."
    255     }
    256   ],
    257   "cited_papers": [
    258     {
    259       "title": "Michelangelo: Long context evaluations beyond haystacks via latent structure queries",
    260       "relevance": "Most closely related prior work; introduces LSQ framework that the authors explicitly compare against as sharing design philosophy but narrower scope"
    261     },
    262     {
    263       "title": "Towards AI-complete question answering: A set of prerequisite toy tasks (bAbI)",
    264       "relevance": "Baseline synthetic reasoning benchmark the authors distinguish from by adding narrative coherence and spatio-temporal grounding"
    265     },
    266     {
    267       "title": "BabiLong: Testing the limits of LLMs with long context reasoning-in-a-haystack",
    268       "relevance": "Long-context extension of bAbI; compared as lacking complexity and cue differentiation"
    269     },
    270     {
    271       "title": "Needle In A Haystack – Pressure Testing LLMs",
    272       "relevance": "Paradigmatic retrieval benchmark the authors position against as lacking temporal/spatial awareness"
    273     },
    274     {
    275       "title": "InfiniteBench: Extending long context evaluation beyond 100k tokens",
    276       "relevance": "Long-context QA benchmark compared as not probing entity state tracking or temporal relationships"
    277     },
    278     {
    279       "title": "RULER: What's the real context size of your long-context language models?",
    280       "relevance": "Multi-needle retrieval extension benchmark; compared as lacking cue differentiation and episodic structure"
    281     },
    282     {
    283       "title": "Human-like episodic memory for infinite context LLMs",
    284       "relevance": "Concurrent work on incorporating episodic memory architecture into LLMs; cited as future baseline to evaluate on this benchmark"
    285     },
    286     {
    287       "title": "Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context",
    288       "relevance": "State-of-the-art long-context model work; cited for multi-needle extension approaches"
    289     }
    290   ],
    291   "engagement_factors": {
    292     "practical_relevance": {
    293       "score": 2,
    294       "justification": "Open-source benchmark with 11 datasets and full generation code enables practitioners to test their own models and extend to new domains."
    295     },
    296     "surprise_contrarian": {
    297       "score": 2,
    298       "justification": "Finding that even o1-mini scores near 0 on single-event recall (F1=0.05 for 1-event in-context) despite strong confabulation avoidance challenges assumptions about reasoning model capabilities."
    299     },
    300     "fear_safety": {
    301       "score": 1,
    302       "justification": "Confabulation/hallucination evaluation is a component, but the paper frames this as capability research rather than safety risk."
    303     },
    304     "drama_conflict": {
    305       "score": 0,
    306       "justification": "No controversy angle; straightforward benchmark paper with cooperative comparisons across model families."
    307     },
    308     "demo_ability": {
    309       "score": 2,
    310       "justification": "Full code and datasets are released on GitHub, allowing immediate replication or extension by other researchers."
    311     },
    312     "brand_recognition": {
    313       "score": 1,
    314       "justification": "Huawei is a known technology company but not a recognized AI research lab; however, the paper evaluates GPT-4o, Claude, and o1-mini, lending brand-name visibility."
    315     }
    316   },
    317   "hn_data": {
    318     "threads": [
    319       {
    320         "hn_id": "43067948",
    321         "title": "A Model for French Voters",
    322         "points": 2,
    323         "comments": 0,
    324         "url": "https://news.ycombinator.com/item?id=43067948"
    325       },
    326       {
    327         "hn_id": "42974556",
    328         "title": "IServe: An Intent-Based Serving System for LLMs",
    329         "points": 1,
    330         "comments": 0,
    331         "url": "https://news.ycombinator.com/item?id=42974556"
    332       }
    333     ],
    334     "top_points": 2,
    335     "total_points": 3,
    336     "total_comments": 0
    337   }
    338 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs