scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (19609B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Episodic Memories Generation and Evaluation Benchmark for Large Language Models",
      6     "authors": [
      7       "Alexis Huet",
      8       "Zied Ben Houidi",
      9       "Dario Rossi"
     10     ],
     11     "year": 2025,
     12     "venue": "International Conference on Learning Representations",
     13     "arxiv_id": "2501.13121",
     14     "doi": "10.48550/arXiv.2501.13121"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims that 'even the most advanced LLMs struggle with episodic memory tasks, particularly when dealing with multiple related events' are directly supported by Tab. 3 (F1 ≤0.60 for 2+ events) and Fig. 3 (model rankings with statistical tests).",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims are generally supported through controlled comparisons. 'RAG generally outperforms in-context counterparts' is supported by comparing the same model across memory strategies (Fig. 3). Ablation studies (Appendix E) systematically vary single factors.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Claims are bounded to the tested models and benchmark. The paper specifies which models were evaluated, acknowledges limitations including 'limited domain scope' (Section 6), and explicitly notes the benchmark covers 'human-like protagonists within fictional contexts.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Multiple alternative explanations are investigated: Claude vs. GPT book generation bias (Appendix E.5), chronological vs. unordered presentation (E.6), realistic vs. non-realistic events (E.7), and paragraph vs. chapter RAG granularity (E.2).",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly draws parallels between their benchmark and established human episodic memory tests (AMI, Autobiographical Interview in Section 2 and Appendix A.3.1), and clearly defines what F1 score on cue-based recall measures. Section 3 articulates the cognitive science framework underlying the measurements.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 'Summary and Limitations' provides dedicated subsections addressing temporal representation, event independence, limited domain scope, and training limitations.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are identified: 'Our benchmark relies on explicit temporal markers, which may not fully capture the nuanced ways time is expressed in natural language'; 'The independent generation of chapters... does not capture the interconnected and causal nature of real-world events' (Section 6).",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 6 states specific scope boundaries: benchmark is limited to explicit temporal markers (not implicit references like 'yesterday'), events are independently generated (no causal chains), and domain is restricted to 'human-like protagonists within fictional contexts.'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding disclosure, acknowledgments section, or grant information is present in the paper. Authors are listed as Huawei Technologies employees but no explicit funding statement is provided.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliation is clearly stated: 'Huawei Technologies Co., Ltd., Paris, France' on the title page.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The implicit funder (Huawei) does not have a direct financial interest in the specific outcomes — the paper evaluates third-party models (GPT-4o, Claude, Llama, o1-mini), none of which are Huawei products.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Core terms are formally defined: 'episodic memory' from Tulving et al., 'entities' as subjects with time-evolving states, 'events' as tuples (t,s,ent,c), and 'cue' as a retrieval key in a key-value system (Sec. 3).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three explicit contributions are enumerated in the introduction: an episodic memory model for LLMs, a benchmark dataset and code release, and an empirical evaluation of state-of-the-art LLMs across memory strategies.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 and Appendix A systematically compare the benchmark to needle-in-a-haystack, bAbI, LSQ, and temporal QA benchmarks, identifying five specific limitations of prior work that motivate the new benchmark.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "benchmark-creation": {
    118       "construct_design": {
    119         "construct_validity_argued": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper explicitly argues that cue-based recall over structured (t,s,ent,c) event tuples operationalizes Tulving's definition of episodic memory, drawing on hippocampal indexing theory, encoding specificity principle, and cue overload phenomena (Sec. 3).",
    123           "source": "haiku"
    124         },
    125         "difficulty_distribution_characterized": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Difficulty is explicitly varied by number of matching events per cue (0, 1, 2, 3-5, 6+ bins) using a controlled truncated geometric distribution, and results are broken out by bin throughout Tables 3, 4, 12-19.",
    129           "source": "haiku"
    130         },
    131         "ceiling_floor_effects_checked": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The authors note models start struggling at modest context sizes and deliberately limit benchmark size; they also observe and report the o1-mini inversion (F1=0.97 on 0-event confabulation questions but F1=0.05 on single-event recall), explicitly analyzing this range.",
    135           "source": "haiku"
    136         },
    137         "human_baseline_included": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No human performance baseline is included; the paper explicitly acknowledges 'a comparison with human performance would be interesting for future work' (Appendix E.4), leaving the severity of the LLM gap uncalibrated.",
    141           "source": "haiku"
    142         },
    143         "scoring_rubric_justified": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "The F1-score computation with LLM-as-a-judge semantic matching and lenient prediction count policy is justified in Appendix B.3, with rationale for why strict string matching is insufficient and documentation that partial scores are rare (4-8% of cases).",
    147           "source": "haiku"
    148         }
    149       },
    150       "robustness": {
    151         "contamination_resistance_designed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "The benchmark uses synthetically generated fictional narratives with a controlled universe not derived from public data, explicitly designed to be 'free from contamination'; the five listed limitations of prior benchmarks explicitly include data leakage as a problem being solved.",
    155           "source": "haiku"
    156         },
    157         "temporal_robustness_discussed": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper does not discuss whether future models trained on internet-crawled data might encounter these synthetic benchmarks, nor is any versioning, canary string, or update strategy described.",
    161           "source": "haiku"
    162         },
    163         "failure_modes_discussed": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The paper discusses LLM failure modes extensively but does not address benchmark failure modes—e.g., ways models could exploit chapter-structure regularities, or systematic biases introduced when the same LLMs that are evaluated are used as the LLM-as-judge evaluator.",
    167           "source": "haiku"
    168         },
    169         "baseline_implementations_provided": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "Full code and 11 datasets are released on GitHub; baseline results for in-context, RAG (paragraph and chapter granularity), and fine-tuned strategies across six models are reported with detailed appendices enabling reproduction.",
    173           "source": "haiku"
    174         }
    175       },
    176       "documentation": {
    177         "dataset_documentation_complete": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Appendix B provides comprehensive documentation covering universe construction, event generation with truncated geometric sampling, chapter generation prompts with full templates, LLM-as-judge verification steps, QA pair selection methodology, and evaluation computation.",
    181           "source": "haiku"
    182         },
    183         "licensing_and_access_clear": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The paper states code and datasets are released open-source on GitHub but does not specify the license (MIT, CC, Apache, etc.) under which the benchmark can be used, modified, or incorporated into commercial research.",
    187           "source": "haiku"
    188         },
    189         "intended_use_specified": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Intended use (evaluating LLM episodic memory capabilities across memory strategies) is clearly stated; the limitations section identifies domains and task types where results should not be generalized.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Even the most advanced LLMs struggle with episodic memory tasks, particularly when dealing with multiple related events, even for contexts as short as 10k–100k tokens.",
    201       "evidence": "Table 3 shows F1 ≤ 0.60 for all models at 2+ matching events on the long book; Table 4 shows ≤36% exact latest-state matches and ≤18% exact full-set matches across all configurations.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "RAG generally outperforms in-context memory for most models except GPT-4o.",
    206       "evidence": "Fig. 3 Critical Difference plot shows RAG variants cluster at higher ranks; Sec. 5.2 explicitly states 'except for GPT-4o, models utilizing RAG generally outperform their in-context counterparts.'",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Naive fine-tuning fails to generalize beyond single-event memorization.",
    211       "evidence": "Table 3 shows fine-tuned GPT-4o-mini achieves F1=0.83 for single-event questions but drops to ≤0.37 for 2+ events; Table 4 shows 0% fully correct chronological ordering.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Performance degrades consistently as the number of matching events per cue increases (cue overload effect).",
    216       "evidence": "Table 3 shows monotonic F1 decline across all models and memory strategies from bin=1 to bin=6+; Fig. 4 shows the left-to-right gradient is 'clearly visible' across all cue types and models.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Temporal cues produce worse retrieval performance than content or entity cues across all models.",
    221       "evidence": "Fig. 4 shows a consistent top-to-bottom performance gradient from content to entity to space to time cues; Table 16 confirms lower F1 for (t,*,*,*) cues vs. (*,*,*,c) across event count bins.",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "Generator-evaluatee bias exists: GPT-generated books favor GPT models, and Claude-generated books favor Claude models.",
    226       "evidence": "Table 21 Mann-Whitney U tests show GPT models significantly outperform Claude on GPT books (p<0.01) but no significant difference on Claude books (p=0.11); Table 20 shows reversed relative performance.",
    227       "supported": "moderate"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "benchmark-eval"
    232   ],
    233   "key_findings": "State-of-the-art LLMs including GPT-4o, Claude 3.5 Sonnet, o1-mini, and Llama 3.1 405B show systematic deficits in episodic memory: performance drops sharply to F1≤0.60 when even two events match a retrieval cue, and fewer than 18% of multi-event questions receive exact full answers. Temporal cues elicit consistently worse recall than entity or content cues across all models and contexts. Naive fine-tuning memorizes single facts but collapses on multi-event generalization, while RAG provides modest improvements at the cost of retrieval granularity. The synthetic benchmark design enables contamination-free evaluation with full ground-truth control, and generalization is confirmed across news and science fiction universe variants.",
    234   "red_flags": [
    235     {
    236       "flag": "No human baseline",
    237       "detail": "The paper claims LLMs 'struggle' with episodic memory without establishing how humans perform on identical tasks; without a human upper bound, the severity of the LLM capability gap cannot be calibrated."
    238     },
    239     {
    240       "flag": "LLM-generated benchmark content",
    241       "detail": "Both primary benchmark documents are generated by Claude 3.5 Sonnet, yet Claude is also one of the evaluated models; the paper shows generator-evaluatee bias exists (Tables 20-21) but the default evaluation is on Claude-generated content."
    242     },
    243     {
    244       "flag": "LLM-as-judge circularity",
    245       "detail": "The evaluation uses GPT-4o and GPT-4o-mini as judges for answer correctness while simultaneously evaluating them as test models; systematic self-favoring biases are not controlled for."
    246     },
    247     {
    248       "flag": "License not specified",
    249       "detail": "The paper claims open-source release but does not state a specific license, leaving unclear whether the benchmark can be freely adapted for commercial research or incorporated into model training data."
    250     },
    251     {
    252       "flag": "Temporal robustness unaddressed",
    253       "detail": "The synthetic benchmark may be incorporated into future model training data via web crawl; no versioning, canary strings, or update strategy is described to preserve contamination-free status over time."
    254     }
    255   ],
    256   "cited_papers": [
    257     {
    258       "title": "Towards AI-Complete Question Answering: A Set of Prerequisite Toy Tasks (bAbI)",
    259       "relevance": "Primary prior synthetic reasoning benchmark that the authors differentiate from, noting it lacks coherent storytelling and is vulnerable to shortcut reasoning."
    260     },
    261     {
    262       "title": "Michelangelo: Long Context Evaluations Beyond Haystacks via Latent Structure Queries",
    263       "relevance": "Closest prior work to the proposed benchmark; authors explicitly build on the LSQ framework's state-tracking approach while extending it with temporal and spatial episodic dimensions."
    264     },
    265     {
    266       "title": "RULER: What's the Real Context Size of Your Long-Context Language Models?",
    267       "relevance": "Multiple-needle retrieval benchmark compared to show prior benchmarks lack the temporal/spatial cue differentiation central to episodic memory evaluation."
    268     },
    269     {
    270       "title": "BabiLong: Testing the Limits of LLMs with Long Context Reasoning-in-a-Haystack",
    271       "relevance": "Long-context extension of bAbI compared as a baseline; lacks episodic memory dimensions (temporal tracking, entity state changes)."
    272     },
    273     {
    274       "title": "Episodic and Semantic Memory (Tulving et al., 1972)",
    275       "relevance": "Foundational cognitive science paper defining episodic memory; the core theoretical basis for the benchmark's construct definition and task design."
    276     },
    277     {
    278       "title": "Encoding Specificity and Retrieval Processes in Episodic Memory (Tulving & Thomson, 1973)",
    279       "relevance": "Source of the encoding specificity principle used to design and justify cue specificity variations in the benchmark."
    280     },
    281     {
    282       "title": "Larimar: Large Language Models with Episodic Memory Control",
    283       "relevance": "Proposed episodic memory architecture for LLMs that the authors plan to test on their benchmark as future work."
    284     },
    285     {
    286       "title": "Human-like Episodic Memory for Infinite Context LLMs (Fountas et al., 2024)",
    287       "relevance": "Proposed episodic memory extension for LLMs; authors plan to evaluate it on the proposed benchmark, making it directly relevant to the benchmark's intended use cases."
    288     }
    289   ],
    290   "engagement_factors": {
    291     "practical_relevance": {
    292       "score": 1,
    293       "justification": "Benchmark framework and code are released for researchers, but the benchmark itself is a research tool not directly applicable to practitioner workflows."
    294     },
    295     "surprise_contrarian": {
    296       "score": 1,
    297       "justification": "LLMs struggling with memory tasks is not hugely surprising, though the degree of failure at even 10k-token contexts and the complete failure of fine-tuning to generalize is somewhat unexpected."
    298     },
    299     "fear_safety": {
    300       "score": 0,
    301       "justification": "No safety, security, or AI risk concerns are raised by this work."
    302     },
    303     "drama_conflict": {
    304       "score": 0,
    305       "justification": "No controversy or conflict angle; straightforward benchmark evaluation paper."
    306     },
    307     "demo_ability": {
    308       "score": 2,
    309       "justification": "GitHub repository with code and 11 datasets is released, allowing researchers to run the benchmark on their own models."
    310     },
    311     "brand_recognition": {
    312       "score": 1,
    313       "justification": "From Huawei (known but not top-tier AI lab); evaluates well-known models (GPT-4o, Claude) which adds some recognition."
    314     }
    315   },
    316   "hn_data": {
    317     "threads": [
    318       {
    319         "hn_id": "43067948",
    320         "title": "A Model for French Voters",
    321         "points": 2,
    322         "comments": 0,
    323         "url": "https://news.ycombinator.com/item?id=43067948"
    324       },
    325       {
    326         "hn_id": "42974556",
    327         "title": "IServe: An Intent-Based Serving System for LLMs",
    328         "points": 1,
    329         "comments": 0,
    330         "url": "https://news.ycombinator.com/item?id=42974556"
    331       }
    332     ],
    333     "top_points": 2,
    334     "total_points": 3,
    335     "total_comments": 0
    336   }
    337 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs