ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31827B)


      1 {
      2   "paper": {
      3     "title": "Forgetful but Faithful: A Cognitive Memory Architecture and Benchmark for Privacy-Aware Generative Agents",
      4     "authors": ["Saad Alqithami"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.12856",
      8     "doi": "10.48550/arXiv.2512.12856"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "The paper proposes MaRS, a typed memory architecture for generative agents with provenance-tracked nodes and forgetting policies, plus FiFA, a benchmark measuring coherence, goal completion, social recall, privacy, and cost. Across 300 simulation runs (6 policies × 5 budgets × 10 seeds), the naive Random Drop policy surprisingly achieves the highest composite score (0.635), while the sophisticated Hybrid policy scores lower (0.589) due to compute overhead. The abstract claims Hybrid achieves ≈0.911 composite, directly contradicting the reported results. Policy choice dominates budget size as the primary performance lever across the 2K–32K token range.",
     14   "claims": [
     15     {
     16       "claim": "The Hybrid policy delivers the best composite performance (≈0.911) while maintaining tractable cost and high privacy scores.",
     17       "evidence": "Abstract claim. Table 2 in §6.5.1 actually shows Random Drop at 0.635 as highest composite and Hybrid at 0.589 — a direct contradiction. No 0.911 figure appears anywhere in the results.",
     18       "supported": "unsupported"
     19     },
     20     {
     21       "claim": "Random Drop achieves the highest Composite score (0.635 ± 0.024) among the evaluated policies.",
     22       "evidence": "Table 2 (§6.5.1) with 95% confidence intervals. Statistical significance confirmed in Table 3 (F=5.93, p<0.001) and pairwise comparison in Table 4 (Random Drop vs. Hybrid: d=1.41, p<0.001).",
     23       "supported": "moderate"
     24     },
     25     {
     26       "claim": "Cost efficiency shows the largest policy effect (η² = 0.832), with simple policies dominating.",
     27       "evidence": "Table 3 (§6.5.2): F=86.43, p<0.0001, η²=0.832. FIFO (0.941), Random Drop (0.935) vs. Hybrid (0.730). Consistent with the theoretical complexity analysis.",
     28       "supported": "strong"
     29     },
     30     {
     31       "claim": "Budget independence holds over the 2K–32K token range: absolute scores improve modestly but policy rankings remain stable.",
     32       "evidence": "§6.7.1: omnibus F values in low single digits, p>0.27 across metrics. Figures 4 and 6 visualize the stable rankings. Consistent with the budget–utility Lipschitz bound.",
     33       "supported": "moderate"
     34     },
     35     {
     36       "claim": "Narrative coherence differences among policies are highly significant with large effect size (η² = 0.351).",
     37       "evidence": "Table 3 (§6.5.2): F=9.45, p<0.0001. Random Drop leads at 0.667 ± 0.074 vs. LRU at 0.501 ± 0.023 (Table 2).",
     38       "supported": "moderate"
     39     },
     40     {
     41       "claim": "MaRS supports (ε, δ)-differential privacy guarantees at the retention decision boundary.",
     42       "evidence": "Theorem A1 in Appendix C provides the proof via the exponential mechanism. However, no empirical validation of the DP guarantees is presented; the DP tie-break 'fires infrequently' in practice (§6.6.4).",
     43       "supported": "weak"
     44     },
     45     {
     46       "claim": "All goal completion rates are extremely low (<8%) across all policies and budgets.",
     47       "evidence": "Table 2: best is Random Drop at 0.078 ± 0.010. This is an empirical observation from the benchmark runs, but it raises questions about benchmark calibration.",
     48       "supported": "moderate"
     49     }
     50   ],
     51   "checklist": {
     52     "artifacts": {
     53       "code_released": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No repository URL, code archive, or any link to released code appears anywhere in the paper. The paper describes a complex framework (MaRS + FiFA) but provides no implementation artifacts."
     57       },
     58       "data_released": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No simulation data, scenario definitions, or experimental outputs are released. The benchmark results and simulation logs are not made available."
     62       },
     63       "environment_specified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No environment specifications, dependency lists, hardware descriptions, or software requirements are provided anywhere in the paper."
     67       },
     68       "reproduction_instructions": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No reproduction instructions, README, or runnable scripts are provided. The paper describes the architecture conceptually but provides no steps to replicate experiments."
     72       }
     73     },
     74     "statistical_methodology": {
     75       "confidence_intervals_or_error_bars": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 reports 95% confidence intervals for all metrics (e.g., '0.635 ± 0.024'). §6.3.2 describes bootstrap confidence intervals, and §6.4.2 specifies cluster-robust sandwich estimators."
     79       },
     80       "significance_tests": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 3 reports ANOVA F-statistics and p-values for each metric. Table 4 reports Holm–Bonferroni adjusted p-values for pairwise comparisons. §6.4.2 describes Wilcoxon signed-rank tests and McNemar's test."
     84       },
     85       "effect_sizes_reported": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 3 reports η² effect sizes for each metric. Table 4 reports Cohen's d with Hedges' correction (e.g., d=1.41 for Random Drop vs. Hybrid). §6.4.2 also mentions Cliff's δ."
     89       },
     90       "sample_size_justified": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The design uses 10 seeds, 5 budgets, 6 policies, 15 agents, and 5 scenario types, but the paper provides no power analysis or explicit justification for why 10 seeds or 15 agents are sufficient for the claims being made."
     94       },
     95       "variance_reported": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 reports 95% confidence intervals across seeds. §6.4.2 describes bootstrap intervals preserving dependency structure. Figure 3 shows distributions per policy."
     99       }
    100     },
    101     "evaluation_design": {
    102       "baselines_included": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Six forgetting policies are compared: FIFO, LRU, Priority Decay, Reflection-Summary, Random Drop, and Hybrid (§6.4.1). These represent a range from naive to sophisticated baselines."
    106       },
    107       "baselines_contemporary": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The baselines include classical memory policies (FIFO, LRU) which are appropriate controls for this domain, plus more sophisticated approaches (Priority Decay, Reflection-Summary). Since this is a new problem formulation, there are no prior competing approaches to compare against."
    111       },
    112       "ablation_study": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The policy comparison serves as an implicit ablation: Hybrid composes temporal, reflection, importance, and privacy components, while individual policies test each component in isolation. §6.8 provides policy-specific analysis. Budget sweeps ablate capacity."
    116       },
    117       "multiple_metrics": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Five distinct metrics are reported: Narrative Coherence, Goal Completion Rate, Social Recall Accuracy, Privacy Preservation, and Cost Efficiency (§6.2.3, Eqs. 8-13), plus a weighted Composite score."
    121       },
    122       "human_evaluation": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper uses LLM-as-judge for narrative coherence evaluation (§6.3.1) but includes no human evaluation. §7.3 explicitly acknowledges this: 'we replace large-scale human annotation with rubricized LLM-as-judge scoring.' The paper claims human-centered evaluation but relies entirely on automated metrics."
    126       },
    127       "held_out_test_set": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "§5.3 mentions 'The importance proxy Ûn is learned per type from held-out FiFA runs or specified via calibrated priors' but does not clearly describe separation of tuning and evaluation data. No explicit dev/test split is documented."
    131       },
    132       "per_category_breakdown": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Table 2 breaks down results by policy and metric. §6.6 provides per-metric detailed analysis. §6.7 analyzes budget effects. §6.8 provides policy-specific analysis. Figure 4 shows budget×policy heatmap."
    136       },
    137       "failure_cases_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "§6.6.1 discusses how temporal policies remove 'bridge' episodes causing broken discourse. §6.6.2 notes extremely low goal completion rates. §6.8.3 discusses what simple policies miss. §7.2 analyzes the unexpected Random Drop result."
    141       },
    142       "negative_results_reported": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper's most prominent finding is negative: the sophisticated Hybrid policy (0.589) loses to naive Random Drop (0.635) on the Composite. Reflection-Summary results are incomplete (footnote in Table 2). The ceiling effect in SRA is reported as a metric design failure."
    146       }
    147     },
    148     "claims_and_evidence": {
    149       "abstract_claims_supported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The abstract claims 'the Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Random Drop leads at 0.635 and Hybrid is at 0.589. No score of 0.911 appears anywhere in the results. This is a direct contradiction between abstract and results."
    153       },
    154       "causal_claims_justified": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The paper makes causal claims about policy effects (e.g., 'purposeful forgetting can improve both user-facing quality'). The experimental design directly manipulates the policy variable while controlling scenario and seed, which is adequate for these claims within the simulation."
    158       },
    159       "generalization_bounded": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "§7.3 explicitly bounds generalizability: simulation vs. real deployment, English-only, specific model family, cultural scope limitations, budget range (2K-32K), retrieval-gated working set. The paper repeatedly notes these are simulation results."
    163       },
    164       "alternative_explanations_discussed": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "§7.2 discusses how Random Drop's superiority stems from metric design (SRA ceiling, cost weighting) rather than inherent superiority. §6.5.1 discusses how reweighting the Composite changes rankings. §7.3 considers model dependence and cultural factors."
    168       },
    169       "proxy_outcome_distinction": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "§7.2 explicitly discusses how 'composite outcomes are sensitive to metric weights and definitions' and that 'wins are not intrinsic properties of policies but reflections of product priorities.' §6.5.1 discusses SRA ceiling effects and the gap between the metric and actual social competence."
    173       }
    174     },
    175     "setup_transparency": {
    176       "model_versions_specified": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper describes a system using LLMs for agent simulation and LLM-as-judge evaluation but never specifies which model or version is used. §6.3.1 mentions 'LLM-as-judge protocol' without naming the model. No model version appears anywhere."
    180       },
    181       "prompts_provided": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper describes 'templated rubrics with auto-calibration' (§6.1.3) and 'rubricized, explanation-seeking prompts' (§6.1.4) but never provides the actual prompt text. No prompts appear in the paper or appendices."
    185       },
    186       "hyperparameters_reported": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "Budget levels are specified (2K-32K tokens) and the experimental grid is described. However, critical hyperparameters are missing: α, β, γ weights for Priority Decay, λ_priv values, distortion thresholds for reflection, LLM temperature/sampling settings, and the 'calibrated priors' mentioned in §5.3."
    190       },
    191       "scaffolding_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The MaRS architecture is described in substantial detail across §§3-5 with typed memory nodes, graph structure, indices, policy execution runtime, privacy engine, and data flow (Figure 1). Policy decision flow is shown in Figure 2. The multi-layer architecture is thoroughly documented."
    195       },
    196       "data_preprocessing_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Scenario types are described at a high level (social-gathering, project-collaboration, etc. in §6.2.2), but the concrete scenario content, event generation parameters, agent population details, and the data pipeline from simulation to metric computation are not documented at a reproducible level."
    200       }
    201     },
    202     "limitations_and_scope": {
    203       "limitations_section_present": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "§7.3 'Limitations and Constraints' provides a substantial multi-paragraph discussion covering external validity, model dependence, language scope, technical scope, formal assumptions, and methodological constraints."
    207       },
    208       "threats_to_validity_specific": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "§7.3 discusses specific threats: FiFA's near-ceiling SRA may result from 'stable, redundantly encoded social facts in the simulator'; leakage opportunities are 'adversarial but fixed in frequency'; results 'are obtained with a particular family of large language models'; cultural scope limited to 'English interactions with Western conversational norms.'"
    212       },
    213       "scope_boundaries_stated": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "§7.3 explicitly states what was not tested: real human interactions, multilingual settings, budgets beyond 32K tokens, multi-modal memory, dynamic policy adaptation, adversarial attacks on memory. §7.4 frames these as future work."
    217       }
    218     },
    219     "data_integrity": {
    220       "raw_data_available": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No raw data from the 300 simulation runs is released. No MaRS audit logs, scenario snapshots, or per-turn metrics are made available for independent verification."
    224       },
    225       "data_collection_described": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The simulation environment is described conceptually (§6.2.1-6.2.2) but lacks reproducible detail. Agent population '15-30 agents', scenario types, and stochastic scheduling are described at a high level without concrete specifications that would allow replication."
    229       },
    230       "recruitment_methods_described": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No human participants. The study is entirely simulation-based with algorithmic agent populations."
    234       },
    235       "data_pipeline_documented": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper describes the conceptual flow from simulation to metrics but does not document the full pipeline at a reproducible level. For example, it does not specify how the 5 scenario types are instantiated, how the 15 agents are initialized, what the exact event sequences are, or how metrics are computed from raw logs."
    239       }
    240     },
    241     "conflicts_of_interest": {
    242       "funding_disclosed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No funding source, grants, or acknowledgments section appears in the paper. Whether the work is funded or unfunded is not stated."
    246       },
    247       "affiliations_disclosed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Author affiliation is clearly stated: 'Computer Science Department, Al-Baha University' with email. Since the paper evaluates a novel framework rather than a commercial product, no product-affiliation conflict arises."
    251       },
    252       "funder_independent_of_outcome": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No funding information is disclosed, making it impossible to assess funder independence. The paper provides no acknowledgments or funding statement."
    256       },
    257       "financial_interests_declared": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No competing interests statement or financial disclosure appears in the paper."
    261       }
    262     },
    263     "contamination": {
    264       "training_cutoff_stated": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "The paper evaluates algorithmic forgetting policies (FIFO, LRU, Priority Decay, etc.) on a custom simulation benchmark. The policies are not pre-trained models whose knowledge could be contaminated by test data."
    268       },
    269       "train_test_overlap_discussed": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "The evaluated policies are algorithmic heuristics, not pre-trained models. Train/test overlap is not a meaningful concern for FIFO, LRU, or Random Drop eviction policies."
    273       },
    274       "benchmark_contamination_addressed": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "FiFA is a newly created benchmark and the evaluated policies are algorithmic heuristics, not pre-trained models that could have been trained on benchmark data."
    278       }
    279     },
    280     "human_studies": {
    281       "pre_registered": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants. The study is entirely simulation-based."
    285       },
    286       "irb_or_ethics_approval": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants. The study uses simulated multi-agent interactions."
    290       },
    291       "demographics_reported": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants. Agent 'archetypes' (Social, Analytical, Creative, Practical, Empathetic) are synthetic, not human demographics."
    295       },
    296       "inclusion_exclusion_criteria": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants in the study."
    300       },
    301       "randomization_described": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants. Randomization of simulation seeds is described but this is not a human-subjects study."
    305       },
    306       "blinding_described": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No human participants."
    310       },
    311       "attrition_reported": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No human participants."
    315       }
    316     },
    317     "cost_and_practicality": {
    318       "inference_cost_reported": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper reports a normalized Cost Efficiency metric (§6.2.3, Eq. 13) and discusses token budgets, but never reports actual API costs, token counts consumed, wall-clock time per run, or cost per simulation step. Only relative efficiency ratios are provided."
    322       },
    323       "compute_budget_stated": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No total computational budget is stated: no GPU hours, API spend, hardware used, or total time for the 300 simulation runs. The paper describes O-notation complexity but reports no actual compute numbers."
    327       }
    328     },
    329     "experimental_rigor": {
    330       "seed_sensitivity_reported": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Results are reported across 10 independent seeds per configuration. §6.4.1: 'replication strategy that executes each configuration across 10 independent seeds.' Bootstrap confidence intervals in Table 2 reflect seed variation."
    334       },
    335       "number_of_runs_stated": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "§6.5.1 explicitly states '6 × 5 × 10 = 300 runs spanning six forgetting policies and five memory budgets.' §6.4.1 describes 10 seeds per configuration."
    339       },
    340       "hyperparameter_search_budget": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No hyperparameter search budget is reported. §5.3 mentions 'The proxy Ûn is learned per type from held-out FiFA runs or specified via calibrated priors' but does not describe how many configurations were tried or what search method was used."
    344       },
    345       "best_config_selection_justified": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The paper does not explain how policy hyperparameters (α, β, γ for Priority Decay, λ_priv, distortion thresholds, reflection rate limits) were selected. §5.3 says 'the defaults match those used in the experiments' without justifying why these defaults were chosen."
    349       },
    350       "multiple_comparison_correction": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "§6.4.2: 'Multiple comparisons across policies and budgets are controlled by Holm–Bonferroni correction applied per metric family.' Table 4 reports Holm–Bonferroni adjusted p-values."
    354       },
    355       "self_comparison_bias_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The author designed both MaRS (the architecture) and FiFA (the benchmark), and implemented all six policies. The paper does not acknowledge self-comparison bias. §6.3.1 validates the judges but does not address the potential bias of an author evaluating their own framework."
    359       },
    360       "compute_budget_vs_performance": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "Cost Efficiency is an explicit metric (§6.2.3, Eq. 13). §6.6.5 analyzes cost vs. performance trade-offs. §6.7 analyzes performance across 5 budget levels. The cost–performance frontier is a central finding (§7.2)."
    364       },
    365       "benchmark_construct_validity": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "§7.2 discusses how composite outcomes depend on metric weights and definitions. §6.5.1 discusses ceiling effects in SRA and how reweighting changes policy rankings. The paper acknowledges FiFA's limitations as a construct for measuring agent quality and proposes opportunity-normalized variants."
    369       },
    370       "scaffold_confound_addressed": {
    371         "applies": false,
    372         "answer": false,
    373         "justification": "The paper compares forgetting policies, not different LLM models. All policies operate within the same MaRS infrastructure. The scaffold is constant across conditions, so there is no scaffold confound to address."
    374       }
    375     },
    376     "data_leakage": {
    377       "temporal_leakage_addressed": {
    378         "applies": false,
    379         "answer": false,
    380         "justification": "The evaluated forgetting policies (FIFO, LRU, etc.) are algorithmic heuristics, not pre-trained models. Temporal leakage between training and test data is not applicable to such algorithms."
    381       },
    382       "feature_leakage_addressed": {
    383         "applies": false,
    384         "answer": false,
    385         "justification": "The policies being evaluated are not learned from features. They are predefined algorithms operating on memory node metadata. Feature leakage in the ML sense does not apply."
    386       },
    387       "non_independence_addressed": {
    388         "applies": false,
    389         "answer": false,
    390         "justification": "The study compares algorithmic policies on a custom simulation. Train/test independence is not a meaningful concern since no model training occurs."
    391       },
    392       "leakage_detection_method": {
    393         "applies": false,
    394         "answer": false,
    395         "justification": "The study evaluates algorithmic memory management policies, not pre-trained models. Leakage detection methods are not applicable."
    396       }
    397     }
    398   },
    399   "red_flags": [
    400     {
    401       "flag": "Abstract contradicts results",
    402       "detail": "The abstract claims 'the Hybrid policy delivers the best composite performance (≈0.911)' but Table 2 shows Random Drop leads at 0.635 and Hybrid is at 0.589. No score near 0.911 appears anywhere in the results. This is a major inconsistency between the abstract and the experimental findings."
    403     },
    404     {
    405       "flag": "Incomplete results",
    406       "detail": "Footnote 1 in §6.5.1 states 'The Reflection-Summary row will be inserted once its aggregates are finalized.' One of the six policies is missing from the main results table, suggesting the paper was submitted with incomplete experiments."
    407     },
    408     {
    409       "flag": "No implementation released",
    410       "detail": "A 45-page paper describing a complex multi-layer architecture (MaRS) and benchmark (FiFA) provides no code, no data, no scenarios, and no reproduction instructions. The claims are entirely unverifiable."
    411     },
    412     {
    413       "flag": "Extremely low goal completion rates",
    414       "detail": "All policies achieve <8% goal completion (best: Random Drop 0.078). Either the benchmark is poorly calibrated, the tasks are unreasonably difficult for the system, or the metric captures something other than intended. This is not discussed as a problem."
    415     },
    416     {
    417       "flag": "Naive policy outperforms sophisticated approach",
    418       "detail": "Random Drop (random eviction) achieves the highest composite score and narrative coherence, outperforming the carefully designed Hybrid policy. While the author discusses this, the result raises questions about whether FiFA actually measures what is claimed, or whether the metric weights were miscalibrated relative to the paper's own motivating thesis."
    419     },
    420     {
    421       "flag": "No human validation of LLM-as-judge",
    422       "detail": "The paper claims human-centered evaluation but relies entirely on LLM-as-judge for subjective metrics like narrative coherence, with no human validation study. §7.3 acknowledges this as a limitation but proceeds to draw strong conclusions from the automated scores."
    423     },
    424     {
    425       "flag": "Entirely simulated evaluation",
    426       "detail": "All 300 experimental runs are in a controlled simulation with synthetic agents and scenarios. No real user interactions, no deployment study, no field validation. The paper's claims about trust, user experience, and human-centered AI rest entirely on simulation."
    427     },
    428     {
    429       "flag": "Missing model specification",
    430       "detail": "The paper never states which LLM is used for agent simulation or LLM-as-judge evaluation. This is a critical omission since all simulation results depend on the underlying model's behavior."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Generative agents: Interactive simulacra of human behavior",
    436       "authors": ["J.S. Park", "J.C. O'Brien", "C.J. Cai", "M.R. Morris", "P. Liang", "M.S. Bernstein"],
    437       "year": 2023,
    438       "relevance": "Foundational work on LLM-based generative agents with memory and reflection, directly motivates MaRS's memory architecture design."
    439     },
    440     {
    441       "title": "Reflexion: language agents with verbal reinforcement learning",
    442       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    443       "year": 2023,
    444       "relevance": "Introduced reflection mechanisms for LLM agents that improve self-consistency, directly relevant to MaRS's reflection-summary policy."
    445     },
    446     {
    447       "title": "MemGPT: Towards LLMs as operating systems",
    448       "authors": ["C. Packer", "S. Wooders", "K. Lin", "V. Fang", "S.G. Patil", "I. Stoica", "J.E. Gonzalez"],
    449       "year": 2024,
    450       "arxiv_id": "2310.08560",
    451       "relevance": "OS-inspired memory paging for LLMs with working set and archival store, key comparison point for MaRS's memory management approach."
    452     },
    453     {
    454       "title": "MemoryBank: Enhancing large language models with long-term memory",
    455       "authors": ["W. Zhong", "L. Guo", "Q. Gao", "H. Ye", "Y. Wang"],
    456       "year": 2024,
    457       "doi": "10.1609/aaai.v38i17.29946",
    458       "relevance": "Attaches human-like decay and importance cues to LLM memory to stabilize its growth."
    459     },
    460     {
    461       "title": "Augmenting language models with long-term memory",
    462       "authors": ["W. Wang", "L. Dong", "H. Cheng", "X. Liu", "X. Yan", "J. Gao", "F. Wei"],
    463       "year": 2023,
    464       "relevance": "Separates memory encoding from response generation for improved longevity and recall in LLMs."
    465     },
    466     {
    467       "title": "AgentBench: Evaluating LLMs as agents",
    468       "authors": ["X. Liu", "H. Yu", "H. Zhang", "Y. Xu"],
    469       "year": 2024,
    470       "relevance": "Major agent evaluation benchmark against which FiFA positions itself as addressing memory governance gaps."
    471     },
    472     {
    473       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    474       "authors": ["L. Zheng", "W.L. Chiang", "Y. Sheng"],
    475       "year": 2023,
    476       "relevance": "Establishes LLM-as-judge evaluation methodology that FiFA's rubricized scoring protocol builds upon."
    477     },
    478     {
    479       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    480       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    481       "year": 2024,
    482       "relevance": "Cost-aware LLM evaluation framework relevant to FiFA's cost efficiency metric."
    483     },
    484     {
    485       "title": "Language models are few-shot learners",
    486       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    487       "year": 2020,
    488       "relevance": "GPT-3 paper establishing the large language model paradigm; discusses context growth and memory challenges."
    489     },
    490     {
    491       "title": "GPT-4 technical report",
    492       "authors": ["OpenAI"],
    493       "year": 2023,
    494       "arxiv_id": "2303.08774",
    495       "relevance": "Foundational LLM technical report relevant to the generative agents that MaRS aims to manage."
    496     },
    497     {
    498       "title": "FlashAttention-2: Faster attention with better parallelism and work partitioning",
    499       "authors": ["T. Dao"],
    500       "year": 2024,
    501       "relevance": "Efficient attention implementation that increases affordable context horizon, complementary to MaRS's memory governance approach."
    502     },
    503     {
    504       "title": "A survey on large language model based autonomous agents",
    505       "authors": ["L. Wang", "C. Ma", "X. Feng"],
    506       "year": 2024,
    507       "relevance": "Comprehensive survey of LLM-based agents that calls for principled memory budgeting and policy-level evaluation."
    508     },
    509     {
    510       "title": "The rise and potential of large language model based agents: A survey",
    511       "authors": ["Z. Xi", "W. Chen", "X. Guo"],
    512       "year": 2025,
    513       "doi": "10.1007/s11432-024-4222-0",
    514       "relevance": "Survey on LLM agents documenting the gap in memory governance that MaRS addresses."
    515     },
    516     {
    517       "title": "G-Eval: NLG evaluation using GPT-4 with better human alignment",
    518       "authors": ["Y. Liu", "D. Iter", "Y. Xu", "S. Wang", "R. Xu", "C. Zhu"],
    519       "year": 2023,
    520       "doi": "10.18653/v1/2023.emnlp-main.153",
    521       "relevance": "LLM-based evaluation methodology with human alignment, informing FiFA's rubricized judging approach."
    522     }
    523   ]
    524 }

Impressum · Datenschutz