scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (22169B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Forgetful but Faithful: A Cognitive Memory Architecture and Benchmark for Privacy-Aware Generative Agents",
      6     "authors": [
      7       "Saad Alqithami"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.12856",
     12     "doi": "10.48550/arXiv.2512.12856"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract claims 'the Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Random Drop leads at 0.635 and Hybrid is at 0.589. No score of 0.911 appears anywhere in the results. This is a direct contradiction between abstract and results.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper makes causal claims about policy effects (e.g., 'purposeful forgetting can improve both user-facing quality'). The experimental design directly manipulates the policy variable while controlling scenario and seed, which is adequate for these claims within the simulation.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "§7.3 explicitly bounds generalizability: simulation vs. real deployment, English-only, specific model family, cultural scope limitations, budget range (2K-32K), retrieval-gated working set. The paper repeatedly notes these are simulation results.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "§7.2 discusses how Random Drop's superiority stems from metric design (SRA ceiling, cost weighting) rather than inherent superiority. §6.5.1 discusses how reweighting the Composite changes rankings. §7.3 considers model dependence and cultural factors.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "§7.2 explicitly discusses how 'composite outcomes are sensitive to metric weights and definitions' and that 'wins are not intrinsic properties of policies but reflections of product priorities.' §6.5.1 discusses SRA ceiling effects and the gap between the metric and actual social competence.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "§7.3 'Limitations and Constraints' provides a substantial multi-paragraph discussion covering external validity, model dependence, language scope, technical scope, formal assumptions, and methodological constraints.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "§7.3 discusses specific threats: FiFA's near-ceiling SRA may result from 'stable, redundantly encoded social facts in the simulator'; leakage opportunities are 'adversarial but fixed in frequency'; results 'are obtained with a particular family of large language models'; cultural scope limited to 'English interactions with Western conversational norms.'",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "§7.3 explicitly states what was not tested: real human interactions, multilingual settings, budgets beyond 32K tokens, multi-modal memory, dynamic policy adaptation, adversarial attacks on memory. §7.4 frames these as future work.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source, grants, or acknowledgments section appears in the paper. Whether the work is funded or unfunded is not stated.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation is clearly stated: 'Computer Science Department, Al-Baha University' with email. Since the paper evaluates a novel framework rather than a commercial product, no product-affiliation conflict arises.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No funding information is disclosed, making it impossible to assess funder independence. The paper provides no acknowledgments or funding statement.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or financial disclosure appears in the paper.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms are formally defined: memory node tuple (ci, ti, τi, si, wi, ρi), budget constraint, forgetting policy as a transformation f: M × R→M, and each FiFA metric (NC, GCR, SRA, PP, CE) with formal equations.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The introduction explicitly lists three contribution types: conceptual (MaRS ontology), algorithmic (forgetting policy family with complexity and privacy analyses), and evaluative (FiFA benchmark with multi-dimensional metrics).",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 has seven subsections situating the work against cognitive architectures (ACT-R, Soar), memory-augmented LLMs (MemGPT, MemoryBank), differential privacy and machine unlearning, and existing agent benchmarks (AgentBench, WebArena), with explicit gap analysis in section 2.8.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "benchmark-creation": {
    116       "construct_design": {
    117         "construct_validity_argued": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "Section 6.1 asserts metrics are 'chosen to align with user-facing desiderata' but does not validate that simulation scores actually predict user experience; the connection between rubricized LLM coherence scores and human perception of memory quality is asserted, not demonstrated.",
    121           "source": "haiku"
    122         },
    123         "difficulty_distribution_characterized": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Five scenario types are described qualitatively but no characterization of item difficulty, distribution, or calibration against model capability is provided; there are no easy/medium/hard tiers or difficulty measurements.",
    127           "source": "haiku"
    128         },
    129         "ceiling_floor_effects_checked": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Section 6.6.3 explicitly identifies and discusses ceiling effects in Social Recall Accuracy (most policies at 1.000 ± 0.000) and proposes an opportunity-normalized variant to address it; this is a genuine methodological acknowledgment.",
    133           "source": "haiku"
    134         },
    135         "human_baseline_included": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No human baseline is included anywhere; all comparisons are between six algorithmic forgetting policies, with no evaluation of how humans would manage memory or what human-level performance on the benchmark would be.",
    139           "source": "haiku"
    140         },
    141         "scoring_rubric_justified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Composite weights (NC 0.25, GCR 0.25, SRA 0.20, PP 0.15, CE 0.15) are stated without justification; section 7.2 explicitly acknowledges that policy rankings flip under alternative weighting, making the primary results sensitive to an unjustified design choice.",
    145           "source": "haiku"
    146         }
    147       },
    148       "robustness": {
    149         "contamination_resistance_designed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The benchmark is simulation-based with LLM-as-judge; no contamination resistance measures (temporal splits, canary strings, dynamic generation) are discussed, and there is no analysis of whether future models trained on arXiv data could game the benchmark.",
    153           "source": "haiku"
    154         },
    155         "temporal_robustness_discussed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Section 7.3 mentions that advances in long-context attention 'could shift the cost-performance frontier' but there is no versioning plan, update strategy, or discussion of benchmark longevity.",
    159           "source": "haiku"
    160         },
    161         "failure_modes_discussed": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Section 7.3 discusses specific benchmark failure modes: ceiling effects in SRA, LLM-as-judge approximation of human judgment, simulation non-stationarity, fixed-frequency privacy opportunities underrepresenting production stress, and the absence of Reflection-Summary from the main results table.",
    165           "source": "haiku"
    166         },
    167         "baseline_implementations_provided": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The paper describes implementations in detail (Sections 5.1–5.4) but provides no code release, no GitHub link, and no dataset or simulation environment accessible to external researchers to reproduce the reported numbers.",
    171           "source": "haiku"
    172         }
    173       },
    174       "documentation": {
    175         "dataset_documentation_complete": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "FiFA is a simulation environment, not a static dataset; the five scenario types are described qualitatively but specific scenarios, agent personality profiles, event sequences, and prompts are not released or documented in a form others can use.",
    179           "source": "haiku"
    180         },
    181         "licensing_and_access_clear": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "No licensing terms, repository link, or access instructions are provided for either the MaRS implementation or the FiFA benchmark.",
    185           "source": "haiku"
    186         },
    187         "intended_use_specified": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Sections 6.9 and 7.2 explicitly specify intended use (comparing forgetting policies, informing deployment decisions) and what should NOT be concluded (rankings are sensitive to metric weights; simulation findings may not transfer directly to production).",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "The Hybrid policy delivers the best composite performance (~0.911) while maintaining tractable cost and high privacy scores.",
    199       "evidence": "Table 2 shows Hybrid composite score of 0.589 ± 0.009, ranking last among five reported policies; Random Drop wins at 0.635 ± 0.024. The ~0.911 figure does not appear anywhere in the results.",
    200       "supported": "unsupported"
    201     },
    202     {
    203       "claim": "Policy choice, not raw memory budget, is the primary lever for improving user-visible agent behavior.",
    204       "evidence": "Section 6.7 reports no significant main effects of budget over 2K–32K tokens (omnibus F values in the low single digits, p > 0.27), while policy effects are significant with large η² on Composite, Coherence, GCR, and CE.",
    205       "supported": "moderate"
    206     },
    207     {
    208       "claim": "Random Drop achieves the highest composite score (0.635) and narrative coherence (0.667) among all evaluated policies.",
    209       "evidence": "Table 2 and statistical analysis (Table 3) confirm this; the paper attributes it to ceiling effects in SRA and cost efficiency weighting rather than substantive superiority.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Cost efficiency shows the largest between-policy separation (ANOVA η² = 0.832), with temporal/random policies substantially outperforming importance-aware approaches.",
    214       "evidence": "Table 3 reports F = 86.43, p < 0.0001, η² = 0.832 for cost efficiency; the FIFO (0.941) vs. Hybrid (0.730) gap is corroborated by pairwise contrasts in Table 4 (d = 7.34 for FIFO vs Priority Decay).",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Provenance-closed feasibility families form antimatroids when dependency graphs are forests, enabling constant-factor greedy approximation.",
    219       "evidence": "Formal proofs provided in Appendix A (Lemma A1, Theorem A2) with sketched exchange arguments; theoretical claims are self-contained but depend on the antimatroid assumption which may not hold in complex real workflows.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "The MaRS privacy engine provides (ε, δ)-differential privacy guarantees for retention decisions via the exponential mechanism.",
    224       "evidence": "Theorem A1 and Lemma A3 give formal proofs; however, the empirical evaluation shows no statistically significant differences in privacy preservation across policies (p = 0.485, η² = 0.047), suggesting DP tie-breaks fire rarely in practice.",
    225       "supported": "weak"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "benchmark-eval",
    230     "theoretical",
    231     "case-study"
    232   ],
    233   "key_findings": "FiFA benchmark results across 300 runs (6 policies × 5 budgets × 10 seeds) show that simple Random Drop achieves the highest composite score (0.635), contradicting the abstract's claim that Hybrid wins at ~0.911. Cost efficiency shows the largest policy separation (η² = 0.832), with temporal/random policies dominating importance-aware approaches at roughly 25% lower computational overhead. Memory budget over the 2K–32K range has no significant effect on metric rankings — policy choice dominates. Social Recall and Privacy Preservation exhibit near-ceiling performance with no statistically significant policy differences, attributed to stable redundant social encoding in the simulation and low frequency of DP tie-breaks.",
    234   "red_flags": [
    235     {
    236       "flag": "Abstract contradicts results",
    237       "detail": "Abstract claims 'Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Hybrid at 0.589 ranking last, with Random Drop winning at 0.635. The 0.911 figure appears nowhere in the paper's empirical results — likely a stale draft claim never updated."
    238     },
    239     {
    240       "flag": "Reflection-Summary excluded from main results",
    241       "detail": "Table 2 explicitly notes 'The Reflection-Summary row will be inserted once its aggregates are finalized' — one of the six policies central to the paper's contribution is missing from the primary results table."
    242     },
    243     {
    244       "flag": "No human baseline",
    245       "detail": "A benchmark paper evaluating memory governance provides no human performance baseline, making it impossible to determine whether any algorithmic policy approaches human-level appropriateness."
    246     },
    247     {
    248       "flag": "No code or simulation release",
    249       "detail": "Neither the MaRS implementation nor the FiFA simulation environment is released or linked; section 6.3.2 references 'version-locked code' but provides no access, making the benchmark non-reproducible externally."
    250     },
    251     {
    252       "flag": "Composite weights unjustified and ranking-sensitive",
    253       "detail": "The Composite weights (NC 0.25, GCR 0.25, SRA 0.20, PP 0.15, CE 0.15) are stated without justification, and the paper explicitly acknowledges that rankings flip under alternative weights — yet the main conclusions are presented without this caveat in the abstract."
    254     },
    255     {
    256       "flag": "Goal completion rates uniformly very low",
    257       "detail": "All policies achieve GCR between 0.058 and 0.078, suggesting either the tasks are too hard for the simulation agents or the GCR metric is miscalibrated — yet no floor effect analysis is provided."
    258     },
    259     {
    260       "flag": "Simulation validity not established",
    261       "detail": "All results derive from a controlled multi-agent simulation with LLM-as-judge scoring; no user study, field deployment, or external validation confirms that FiFA metrics predict real-world user experience with memory-budgeted agents."
    262     }
    263   ],
    264   "cited_papers": [
    265     {
    266       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    267       "relevance": "Foundational work on long-horizon generative agents that motivates the memory management problem; directly cited as the deployment context FiFA targets."
    268     },
    269     {
    270       "title": "MemGPT: Towards LLMs as Operating Systems",
    271       "relevance": "Primary related work on virtual memory paging for LLM agents; MaRS is positioned as adding policy-level governance that MemGPT leaves unspecified."
    272     },
    273     {
    274       "title": "AgentBench: Evaluating LLMs as Agents",
    275       "relevance": "Existing agent evaluation benchmark that FiFA is positioned against; cited as lacking memory-governance and budget-varied evaluation axes."
    276     },
    277     {
    278       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    279       "relevance": "Key prior work on reflection mechanisms in agents; cited as improving self-consistency but not regulating unbounded memory growth."
    280     },
    281     {
    282       "title": "Deep Learning with Differential Privacy",
    283       "relevance": "Foundational DP paper (Abadi et al. 2016); provides the composition theorems and moments accountant that MaRS's privacy guarantees build upon."
    284     },
    285     {
    286       "title": "MemoryBank: Enhancing Large Language Models with Long-Term Memory",
    287       "relevance": "Related memory augmentation system; cited as a comparison point for MaRS's approach of treating retention as a governed policy decision."
    288     },
    289     {
    290       "title": "A Survey on Large Language Model Based Autonomous Agents",
    291       "relevance": "Survey explicitly calling for principled memory budgeting and policy-level evaluation; directly motivates the FiFA benchmark design."
    292     },
    293     {
    294       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    295       "relevance": "Methodological basis for FiFA's rubricized LLM-as-judge coherence scoring; cited as evidence that well-specified rubrics show non-trivial agreement with expert annotators."
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 2,
    301       "justification": "Memory management for long-running AI agents is a real deployment problem, and the policy selection guidelines (Section 6.9) are actionable, but the framework is not released so practitioners cannot directly use it."
    302     },
    303     "surprise_contrarian": {
    304       "score": 2,
    305       "justification": "The finding that Random Drop outperforms sophisticated importance-aware policies on the composite benchmark is genuinely counter-intuitive and the paper engages substantively with why this occurs."
    306     },
    307     "fear_safety": {
    308       "score": 2,
    309       "justification": "Privacy in long-running agents (right-to-be-forgotten, GDPR compliance, sensitivity leakage) touches real-world AI deployment safety concerns, though the treatment is more engineering than existential risk."
    310     },
    311     "drama_conflict": {
    312       "score": 1,
    313       "justification": "No significant controversy or conflict angle; the paper is a technical framework and benchmark contribution from a single academic author."
    314     },
    315     "demo_ability": {
    316       "score": 0,
    317       "justification": "No code, demo, or simulation environment is released; readers cannot try FiFA or MaRS themselves."
    318     },
    319     "brand_recognition": {
    320       "score": 0,
    321       "justification": "Single-author paper from Al-Baha University with no affiliation to a prominent AI lab or industry partner."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [
    326       {
    327         "hn_id": "45963729",
    328         "title": "The Fundamental Limits of LLMs at Scale",
    329         "points": 6,
    330         "comments": 0,
    331         "url": "https://news.ycombinator.com/item?id=45963729",
    332         "created_at": "2025-11-18T11:26:02Z"
    333       },
    334       {
    335         "hn_id": "43193918",
    336         "title": "Ringworlds and Dyson spheres can be stable",
    337         "points": 6,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=43193918",
    340         "created_at": "2025-02-27T12:48:58Z"
    341       },
    342       {
    343         "hn_id": "46341968",
    344         "title": "Distributional AGI Safety (DeepMind)",
    345         "points": 4,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=46341968",
    348         "created_at": "2025-12-21T03:25:58Z"
    349       },
    350       {
    351         "hn_id": "47097399",
    352         "title": "The Fundamental Limits of LLMs at Scale",
    353         "points": 4,
    354         "comments": 0,
    355         "url": "https://news.ycombinator.com/item?id=47097399",
    356         "created_at": "2026-02-21T04:07:37Z"
    357       },
    358       {
    359         "hn_id": "46344905",
    360         "title": "Distributional AGI Safety",
    361         "points": 2,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=46344905",
    364         "created_at": "2025-12-21T14:01:56Z"
    365       },
    366       {
    367         "hn_id": "43148731",
    368         "title": "None of the Others: General Technique to Distinguish Reasoning from Memorization",
    369         "points": 2,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=43148731",
    372         "created_at": "2025-02-23T12:12:21Z"
    373       },
    374       {
    375         "hn_id": "38818811",
    376         "title": "Johnsen-Rahbek Capstan Clutch: A High Torque Electrostatic Clutch",
    377         "points": 2,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=38818811",
    380         "created_at": "2023-12-30T20:32:13Z"
    381       }
    382     ],
    383     "top_points": 6,
    384     "total_points": 26,
    385     "total_comments": 0
    386   }
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs