scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22327B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Forgetful but Faithful: A Cognitive Memory Architecture and Benchmark for Privacy-Aware Generative Agents",
      6     "authors": [
      7       "Saad Alqithami"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.12856",
     12     "doi": "10.48550/arXiv.2512.12856"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract explicitly claims 'the Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Hybrid ranks last at 0.589±0.009, with Random Drop winning at 0.635±0.024. The figure 0.911 does not appear anywhere in the results tables or figures.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper makes causal claims ('Hybrid improves coherence over temporal baselines,' 'principled forgetting can simultaneously support coherence, efficiency, and privacy') but all results derive from a synthetic multi-agent simulation with LLM-as-judge evaluation; no real-world deployment or user study validates causal claims.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 7.3 explicitly acknowledges scope: 'primarily English interactions,' results conditioned on retrieval gating up to 32K tokens, and 'controlled, multi-agent simulation that approximates extended, mixed-purpose interactions but cannot reproduce the full heterogeneity of real deployments.'",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper explicitly analyzes why Random Drop counterintuitively outperforms Hybrid on the Composite, attributing it to ceiling effects in Social Recall Accuracy (measured conditional on attempts) and the heavy cost-efficiency weighting, with ablations showing rankings shift when weights change.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper acknowledges LLM-as-judge coherence scoring 'remains an approximation of human perception' and notes ceiling effects in SRA; limitations section distinguishes simulation outcomes from real user satisfaction, recommending future user studies.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 7.3 'Limitations and Constraints' is a dedicated multi-paragraph section covering external validity, model dependence, language/cultural scope, technical assumptions, and methodological constraints.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific threats include: simulator approximates but cannot reproduce non-stationarity and long idle intervals of real deployment; 'results are obtained with a particular family of large language models'; 'primarily English interactions with Western conversational norms'; Social Recall ceiling effect where policies can evade penalties by attempting fewer references.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Explicit boundaries stated: 'budget independence we report is conditional on retrieval gating and prompt curation,' 'within budgets up to 32,000 tokens,' 'provenance-closure family is treated as an antimatroid induced by dependency forests' with caveats if real workflows create cycles.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding acknowledgment section or grant disclosure appears anywhere in the paper; the author is affiliated with Al-Baha University but no funding source is stated.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation (Computer Science Department, Al-Baha University) is clearly disclosed on the title page.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No funder is disclosed, so independence cannot be assessed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement, no disclosure of patents or equity interests appears anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms are formally defined: memory node tuple (c_i, t_i, τ_i, s_i, w_i, ρ_i), budget constraint, forgetting policy as a transformation f: M × ℝ_{>0} → M, and all five metrics (NC, GCR, SRA, PP, CE) receive formal mathematical definitions in Section 6.2.3.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Contributions are explicitly enumerated: 'conceptual' (MaRS as relational provenance-aware schema), 'algorithmic' (family of forgetting policies with complexity/privacy analysis), and 'evaluative' (FiFA benchmark operationalizing human-centered criteria).",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 spans eight subsections (2.1–2.8) engaging with cognitive architectures (ACT-R, Soar), generative agents (Park et al. 2023), memory-augmented LLMs (MemGPT, MemoryBank), privacy-preserving AI (GDPR, DP, machine unlearning), benchmarks (AgentBench, WebArena), and explicitly positioning MaRS/FiFA relative to each.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "benchmark-creation": {
    116       "construct_design": {
    117         "construct_validity_argued": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "The paper asserts FiFA measures 'memory governance' rather than raw capability, and Section 6.1 lists design principles, but it does not rigorously argue why a synthetic multi-agent simulation with LLM-as-judge scores constitutes valid measurement of real-world memory governance capability; the link between simulation metrics and actual human-facing memory quality is assumed rather than validated.",
    121           "source": "haiku"
    122         },
    123         "difficulty_distribution_characterized": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No characterization of easy/medium/hard benchmark items; the five budget levels (2K–32K tokens) represent experimental conditions rather than a measured difficulty distribution, and scenario difficulty is described qualitatively without empirical difficulty calibration.",
    127           "source": "haiku"
    128         },
    129         "ceiling_floor_effects_checked": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Sections 6.5.1 and 6.6.3 explicitly identify the SRA ceiling effect (FIFO, LRU, Random Drop all score 1.000±0.000) and propose an opportunity-normalized variant; the paper acknowledges this as a limitation affecting policy discrimination.",
    133           "source": "haiku"
    134         },
    135         "human_baseline_included": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No human baseline is included; all evaluation involves only LLM-based agents in a synthetic simulation, with no comparison to human memory management performance.",
    139           "source": "haiku"
    140         },
    141         "scoring_rubric_justified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "The Composite weights (NC 0.25, GCR 0.25, SRA 0.20, PP 0.15, CE 0.15) are stated in Eq. 14 without justification for why these specific values were chosen; the paper acknowledges the ordering shifts when reweighted but does not argue why the primary weights reflect deployment priorities.",
    145           "source": "haiku"
    146         }
    147       },
    148       "robustness": {
    149         "contamination_resistance_designed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The benchmark is a dynamic simulation rather than a static dataset, which provides incidental contamination resistance, but contamination resistance is never discussed as a design goal; there are no temporal splits, canary strings, or explicit anti-gaming measures.",
    153           "source": "haiku"
    154         },
    155         "temporal_robustness_discussed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Future work (Section 7.4) mentions extending FiFA with 'longer horizons, richer privacy stressors, multilingual settings' but there is no plan for benchmark maintenance, versioning, or addressing obsolescence as models improve.",
    159           "source": "haiku"
    160         },
    161         "failure_modes_discussed": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Section 7.2 explicitly analyzes why a naïve policy (Random Drop) outperforms sophisticated ones—identifying metric ceiling effects and composite weighting as the cause—and Section 7.3 discusses what the benchmark does not capture (non-stationarity, cultural variation, multi-modal content, dense privacy opportunities).",
    165           "source": "haiku"
    166         },
    167         "baseline_implementations_provided": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Six forgetting policies are implemented and evaluated, but no code repository or public release is mentioned anywhere in the paper; reproducibility is described architecturally (JSON-LD serialization, fixed seeds) but without a public implementation.",
    171           "source": "haiku"
    172         }
    173       },
    174       "documentation": {
    175         "dataset_documentation_complete": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "FiFA is a simulation benchmark with high-level scenario descriptions (5 types, 15–30 agents, 10 seeds per configuration) but no data card, no specification of which LLM backbone was used for the simulation, and no release of prompts or world-generation code; the paper also notes that Reflection-Summary results are not yet finalized.",
    179           "source": "haiku"
    180         },
    181         "licensing_and_access_clear": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "No licensing information, no repository URL, and no indication of how other researchers can access, run, or build on the FiFA benchmark or MaRS implementation.",
    185           "source": "haiku"
    186         },
    187         "intended_use_specified": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Section 6.9 provides deployment recommendations and Section 2.6 positions FiFA against capability benchmarks, but the paper does not specify what should NOT be concluded from FiFA results (e.g., that high FiFA scores do not imply real-world privacy compliance or user satisfaction).",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "The Hybrid policy delivers the best composite performance (≈0.911) across 300 simulation runs.",
    199       "evidence": "Table 2 shows Hybrid ranks last at Composite 0.589±0.009; Random Drop leads at 0.635±0.024. The figure 0.911 appears nowhere in the results.",
    200       "supported": "unsupported"
    201     },
    202     {
    203       "claim": "Policy choice, not memory budget size, is the primary lever for improving user-visible behavior.",
    204       "evidence": "ANOVA shows no significant main effects of budget across metrics (p>0.27), while policy effects are significant for NC, GCR, CE, and Composite; stable policy rankings confirmed across 2K–32K token range in Fig. 4.",
    205       "supported": "moderate"
    206     },
    207     {
    208       "claim": "Cost efficiency shows the largest between-policy separations (η²=0.832) of any metric.",
    209       "evidence": "Table 3 reports Cost Efficiency F=86.43, p<0.0001, η²=0.832; FIFO (0.941) and Random Drop (0.935) substantially outperform Hybrid (0.730).",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Privacy preservation does not significantly differ across forgetting policies.",
    214       "evidence": "Table 3 shows Privacy Preservation F=0.87, p=0.485, η²=0.047 (non-significant); opportunity-normalized variant recommended for future work due to sparse adversarial prompts.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "Goal completion rates are low across all policies (best: 0.078), reflecting difficulty of maintaining task prerequisites under tight budgets.",
    219       "evidence": "Table 2 shows GCR ranging from 0.058 (LRU) to 0.078 (Random Drop); explained by losing prerequisite edges invalidating plans even when conversational coherence remains intact.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Reflection-summary consolidation preserves narrative coherence while reducing leakage risk.",
    224       "evidence": "Section 6.8.2 discusses Reflection-Summary's contribution inside Hybrid qualitatively, but Table 2 explicitly notes 'The Reflection-Summary row will be inserted once its aggregates are finalized'—no standalone quantitative results are reported.",
    225       "supported": "weak"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "benchmark-eval",
    230     "theoretical",
    231     "case-study"
    232   ],
    233   "key_findings": "FiFA is a multi-dimensional simulation benchmark for memory-budgeted generative agents; across 300 runs (6 policies × 5 budgets × 10 seeds), Random Drop achieves the highest Composite score (0.635), contradicting the abstract's claim that Hybrid wins at ≈0.911. Policy choice dominates outcomes while memory budget (2K–32K tokens) has minimal impact on rankings. Cost efficiency shows the largest separations between policies (η²=0.832), with simple temporal/random policies dominating; privacy preservation shows no statistically significant differences across policies under the sparse adversarial stressors used.",
    234   "red_flags": [
    235     {
    236       "flag": "Abstract contradicts results table",
    237       "detail": "Abstract claims 'Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Hybrid ranks last at 0.589±0.009 and Random Drop wins at 0.635±0.024; the value 0.911 does not appear anywhere in the quantitative results."
    238     },
    239     {
    240       "flag": "Incomplete results table",
    241       "detail": "The paper explicitly states 'The table currently reports five of six policies. The Reflection-Summary row will be inserted once its aggregates are finalized'—a benchmark paper that omits one of its own six evaluated policies from the main results table."
    242     },
    243     {
    244       "flag": "No public implementation",
    245       "detail": "No code repository or dataset release is mentioned; the benchmark cannot be reproduced by others without the complete simulation codebase, LLM prompts, and world-generation parameters, none of which are provided."
    246     },
    247     {
    248       "flag": "No human baseline",
    249       "detail": "The benchmark claims to measure human-centered criteria but includes no human performance baseline, making it impossible to assess whether benchmark tasks are appropriately calibrated or whether LLM performance is meaningful relative to human ability."
    250     },
    251     {
    252       "flag": "Floor effect in goal completion",
    253       "detail": "All five policies reported in Table 2 achieve goal completion rates below 8% (best: 0.078; the sixth policy, Reflection-Summary, has no reported row), suggesting the benchmark tasks may be too difficult or that the metric is miscalibrated; such uniformly low absolute performance limits discriminative validity."
    254     },
    255     {
    256       "flag": "Simulation-only validation",
    257       "detail": "All results derive from a synthetic multi-agent simulation with LLM-as-judge evaluation; no real users, no real deployment, and no validation that simulation metrics predict actual user experience or privacy outcomes."
    258     },
    259     {
    260       "flag": "Composite weights unjustified",
    261       "detail": "The key Composite metric weights (NC 0.25, GCR 0.25, SRA 0.20, PP 0.15, CE 0.15) determining policy rankings are stated without justification; the authors acknowledge that reweighting changes rankings, undermining the benchmark's ability to produce stable policy recommendations."
    262     }
    263   ],
    264   "cited_papers": [
    265     {
    266       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    267       "relevance": "Foundational work on long-horizon generative agents whose memory management capabilities the MaRS/FiFA framework is designed to evaluate and improve."
    268     },
    269     {
    270       "title": "MemGPT: Towards LLMs as Operating Systems",
    271       "relevance": "Prior work on virtual memory abstractions for LLM agents; directly compared to MaRS's approach of elevating retention to a first-class policy decision."
    272     },
    273     {
    274       "title": "AgentBench: Evaluating LLMs as Agents",
    275       "relevance": "Representative capability-centric agent benchmark that FiFA explicitly positions against, arguing capability benchmarks lack memory governance evaluation axes."
    276     },
    277     {
    278       "title": "Deep Learning with Differential Privacy",
    279       "relevance": "Foundation for the exponential mechanism and DP guarantees incorporated into MaRS's privacy-aware retention decisions."
    280     },
    281     {
    282       "title": "A Survey on Large Language Model Based Autonomous Agents",
    283       "relevance": "Survey establishing that memory management is a critical bottleneck for long-horizon LLM agent deployment, motivating the paper's research agenda."
    284     },
    285     {
    286       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    287       "relevance": "Methodological basis for the LLM-as-judge protocol used in FiFA's narrative coherence evaluation."
    288     },
    289     {
    290       "title": "MemoryBank: Enhancing Large Language Models with Long-Term Memory",
    291       "relevance": "Prior memory augmentation system with human-like decay and importance cues; positioned as complementary to MaRS's governance-focused approach."
    292     },
    293     {
    294       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    295       "relevance": "Reflection-based consolidation mechanism that MaRS's Reflection-Summary policy builds upon and extends with budget and privacy constraints."
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 2,
    301       "justification": "Addresses a real deployment problem (memory management for long-running agents) with concrete policy selection guidelines, but the simulation-only evaluation and lack of public implementation limit immediate practitioner use."
    302     },
    303     "surprise_contrarian": {
    304       "score": 2,
    305       "justification": "The finding that Random Drop outperforms sophisticated importance-aware policies on the Composite metric is counterintuitive and challenges the assumption that more complex retention strategies are always better."
    306     },
    307     "fear_safety": {
    308       "score": 2,
    309       "justification": "Directly addresses privacy risks from LLM agents retaining sensitive information indefinitely, the 'right to be forgotten,' and GDPR compliance—concrete safety concerns in AI deployment."
    310     },
    311     "drama_conflict": {
    312       "score": 1,
    313       "justification": "The abstract-vs-results contradiction (Hybrid claimed best at 0.911 vs. actually last at 0.589) is notable but unlikely to generate public controversy; the field debate over memory architectures is technical."
    314     },
    315     "demo_ability": {
    316       "score": 0,
    317       "justification": "No code, no demo, no public implementation mentioned; the benchmark cannot be tried by others."
    318     },
    319     "brand_recognition": {
    320       "score": 0,
    321       "justification": "Single author from Al-Baha University with no famous lab affiliation, no well-known collaborators."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [
    326       {
    327         "hn_id": "45963729",
    328         "title": "The Fundamental Limits of LLMs at Scale",
    329         "points": 6,
    330         "comments": 0,
    331         "url": "https://news.ycombinator.com/item?id=45963729",
    332         "created_at": "2025-11-18T11:26:02Z"
    333       },
    334       {
    335         "hn_id": "43193918",
    336         "title": "Ringworlds and Dyson spheres can be stable",
    337         "points": 6,
    338         "comments": 0,
    339         "url": "https://news.ycombinator.com/item?id=43193918",
    340         "created_at": "2025-02-27T12:48:58Z"
    341       },
    342       {
    343         "hn_id": "46341968",
    344         "title": "Distributional AGI Safety (DeepMind)",
    345         "points": 4,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=46341968",
    348         "created_at": "2025-12-21T03:25:58Z"
    349       },
    350       {
    351         "hn_id": "47097399",
    352         "title": "The Fundamental Limits of LLMs at Scale",
    353         "points": 4,
    354         "comments": 0,
    355         "url": "https://news.ycombinator.com/item?id=47097399",
    356         "created_at": "2026-02-21T04:07:37Z"
    357       },
    358       {
    359         "hn_id": "46344905",
    360         "title": "Distributional AGI Safety",
    361         "points": 2,
    362         "comments": 0,
    363         "url": "https://news.ycombinator.com/item?id=46344905",
    364         "created_at": "2025-12-21T14:01:56Z"
    365       },
    366       {
    367         "hn_id": "43148731",
    368         "title": "None of the Others: General Technique to Distinguish Reasoning from Memorization",
    369         "points": 2,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=43148731",
    372         "created_at": "2025-02-23T12:12:21Z"
    373       },
    374       {
    375         "hn_id": "38818811",
    376         "title": "Johnsen-Rahbek Capstan Clutch: A High Torque Electrostatic Clutch",
    377         "points": 2,
    378         "comments": 0,
    379         "url": "https://news.ycombinator.com/item?id=38818811",
    380         "created_at": "2023-12-30T20:32:13Z"
    381       }
    382     ],
    383     "top_points": 6,
    384     "total_points": 26,
    385     "total_comments": 0
    386   }
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs