ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25417B)


      1 {
      2   "paper": {
      3     "title": "Generative Agents: Interactive Simulacra of Human Behavior",
      4     "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"],
      5     "year": 2023,
      6     "venue": "ACM Symposium on User Interface Software and Technology (UIST '23)",
      7     "arxiv_id": "2304.03442",
      8     "doi": "10.1145/3586183.3606763"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval", "qualitative"],
     13   "key_findings": "Generative agents using an architecture combining a memory stream, reflection, and planning produce believable behavior that outperforms ablated versions and human crowdworkers (TrueSkill μ=29.89 vs 21.21 for no-memory baseline, d=8.16). The full architecture enables emergent social behaviors including information diffusion (awareness of Isabella's party rose from 4% to 52% of agents), relationship formation (network density 0.167→0.74), and coordination (5 of 12 invited agents attended a party). Each architectural component (observation, reflection, planning) contributes critically to believability as shown by ablation.",
     14   "claims": [
     15     {
     16       "claim": "The full generative agent architecture produces the most believable behavior compared to ablations and human crowdworkers (TrueSkill μ=29.89, σ=0.72)",
     17       "evidence": "Section 6.5.1: TrueSkill ratings from 100 human evaluators ranking 5 conditions. Kruskal-Wallis test H(4)=150.29, p<0.001. All pairwise Dunn post-hoc tests significant at p<0.001 except crowdworker vs fully ablated baseline.",
     18       "supported": "strong"
     19     },
     20     {
     21       "claim": "Each component (observation, reflection, planning) contributes critically to believability with an effect size of d=8.16 between full architecture and no-memory baseline",
     22       "evidence": "Section 6.5.1: Monotonic degradation across ablation conditions. Full (29.89) > no reflection (26.88) > no reflection/planning (25.64) > crowdworker (22.95) > no memory/reflection/planning (21.21).",
     23       "supported": "strong"
     24     },
     25     {
     26       "claim": "Information diffuses through the agent community: Sam's candidacy spread from 1 to 8 agents (32%), Isabella's party from 1 to 13 agents (52%)",
     27       "evidence": "Section 7.1.2: End-of-simulation interviews with all 25 agents. Verified that affirmative responses were not hallucinated by checking memory streams.",
     28       "supported": "moderate"
     29     },
     30     {
     31       "claim": "Agents form new relationships, with network density increasing from 0.167 to 0.74 over two game days",
     32       "evidence": "Section 7.1.2: Interview-based measurement at start and end of simulation. 1.3% hallucination rate (6/453 responses).",
     33       "supported": "moderate"
     34     },
     35     {
     36       "claim": "Agents coordinate to attend a Valentine's Day party: 5 of 12 invited agents showed up at the correct time and place",
     37       "evidence": "Section 7.1.2: Direct observation of agent behavior in the simulation.",
     38       "supported": "moderate"
     39     }
     40   ],
     41   "checklist": {
     42     "artifacts": {
     43       "code_released": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper provides a public repository URL: https://github.com/joonspk-research/generative_agents (footnote 2)."
     47       },
     48       "data_released": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dataset or simulation logs are released. The demo link (reverie.herokuapp.com) shows a replay but does not provide downloadable data."
     52       },
     53       "environment_specified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. The implementation uses Phaser web framework and ChatGPT API but no dependency specifications are provided."
     57       },
     58       "reproduction_instructions": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No step-by-step reproduction instructions are provided in the paper. The architecture is described in detail but there are no specific instructions for replicating the experiments."
     62       }
     63     },
     64     "statistical_methodology": {
     65       "confidence_intervals_or_error_bars": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "TrueSkill ratings include standard deviations (e.g., μ=29.89, σ=0.72) for each condition in Section 6.5.1."
     69       },
     70       "significance_tests": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Kruskal-Wallis test (H(4)=150.29, p<0.001) and Dunn post-hoc tests with Holm-Bonferroni correction are reported in Sections 6.4–6.5."
     74       },
     75       "effect_sizes_reported": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Cohen's d=8.16 reported for full architecture vs no-memory baseline in Section 6.5.1."
     79       },
     80       "sample_size_justified": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "100 evaluators were recruited but no power analysis or justification for this sample size is provided."
     84       },
     85       "variance_reported": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "TrueSkill σ values reported for each condition (e.g., σ=0.72, 0.69, 0.68, 0.69, 0.70)."
     89       }
     90     },
     91     "evaluation_design": {
     92       "baselines_included": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Five conditions compared: full architecture, three ablations (no reflection, no reflection/planning, no memory/reflection/planning), and human crowdworker baseline (Section 6.2)."
     96       },
     97       "baselines_contemporary": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The no-memory/reflection/planning condition represents prior work on LLM-based agents [12, 46, 80]. Human crowdworker condition provides a human baseline."
    101       },
    102       "ablation_study": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three ablation conditions systematically remove architectural components: reflection, planning, and observation memory (Section 6.2)."
    106       },
    107       "multiple_metrics": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Controlled evaluation uses TrueSkill believability ratings across 5 question categories (self-knowledge, memory, plans, reactions, reflections). End-to-end evaluation measures information diffusion, relationship formation, and coordination."
    111       },
    112       "human_evaluation": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "100 human evaluators ranked believability of agent responses across conditions (Section 6.3). This is the primary evaluation method."
    116       },
    117       "held_out_test_set": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "Not applicable — this is not a train/test split evaluation. The evaluation is based on human judgment of agent behavior."
    121       },
    122       "per_category_breakdown": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "Although five interview question categories are described (self-knowledge, memory, plans, reactions, reflections), results are reported only as aggregate TrueSkill scores, not broken down by category."
    126       },
    127       "failure_cases_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Sections 6.5.2, 6.5.3, and 7.2 discuss failure modes: memory retrieval failures, hallucinated embellishments, instruction tuning artifacts, location choice errors, and behavioral norms violations."
    131       },
    132       "negative_results_reported": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 7.2 reports that agents chose inappropriate locations (bar for lunch), violated physical norms (multiple agents in single-person bathroom), and exhibited overly cooperative behavior due to instruction tuning."
    136       }
    137     },
    138     "claims_and_evidence": {
    139       "abstract_claims_supported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Abstract claims about believable behavior, emergent social dynamics (party coordination, information diffusion), and ablation contributions are all supported by results in Sections 6 and 7."
    143       },
    144       "causal_claims_justified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Causal claims about component contributions are supported by ablation study with controlled single-variable manipulation. The authors acknowledge the conservative estimate limitation of using shared memory across conditions (Section 6.2)."
    148       },
    149       "generalization_bounded": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The title 'Interactive Simulacra of Human Behavior' and abstract claims about 'believable human behavior' are broad, but the evaluation is limited to 25 agents in a single sandbox environment over 2 game days using only ChatGPT (gpt-3.5-turbo). The paper doesn't bound claims to this specific setting."
    153       },
    154       "alternative_explanations_discussed": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "No discussion of alternative explanations for the results. For example, the believability advantage could partly stem from the full architecture producing longer/more detailed responses rather than inherently more believable behavior."
    158       },
    159       "proxy_outcome_distinction": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper measures 'believability' via human rankings but does not discuss the gap between ranked preference in a forced-choice comparison and actual believability of behavior in open-ended interaction. The crowdworker baseline is described as not representing 'maximal human expert performance' but the proxy/outcome gap is not explicitly discussed."
    163       }
    164     },
    165     "setup_transparency": {
    166       "model_versions_specified": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper states 'gpt3.5-turbo version of ChatGPT' (Section 4) but does not provide a specific snapshot date or API version (e.g., gpt-3.5-turbo-0301)."
    170       },
    171       "prompts_provided": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Multiple full prompt texts are provided throughout Sections 4.1-4.3, including importance scoring, reflection generation, planning, reaction, and dialogue prompts."
    175       },
    176       "hyperparameters_reported": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "Temperature, top-p, and other API parameters are not reported. The decay factor (0.995) and reflection threshold (150) are stated, but LLM sampling parameters are missing."
    180       },
    181       "scaffolding_described": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The agent architecture is described in extensive detail: memory stream, retrieval function (recency/importance/relevance with α weights), reflection mechanism, recursive planning decomposition, and reaction/re-planning loop (Sections 4.1-4.3)."
    185       },
    186       "data_preprocessing_documented": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The initialization procedure is documented: agent descriptions are split into semicolon-delimited memories, the environment tree structure is described, and the sandbox server's JSON data pipeline is explained (Section 5)."
    190       }
    191     },
    192     "limitations_and_scope": {
    193       "limitations_section_present": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 8.2 'Future Work and Limitations' provides substantive discussion of limitations including cost, evaluation timescale, robustness concerns, and biases."
    197       },
    198       "threats_to_validity_specific": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Specific threats discussed: the study cost thousands of dollars (Section 8.2), evaluation was limited to a short timescale, the crowdworker baseline didn't represent maximal human performance, agents may be vulnerable to prompt/memory hacking, and LLM biases are inherited."
    202       },
    203       "scope_boundaries_stated": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 8.2 states the evaluation was 'limited to a relatively short timescale and a baseline human crowdworker condition' and that 'the robustness of generative agents is still largely unknown.' The paper also notes the crowdworker condition 'did not represent the maximal human performance.'"
    207       }
    208     },
    209     "data_integrity": {
    210       "raw_data_available": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "Raw evaluation data (individual rankings from 100 evaluators) is not released. Agent memory streams and simulation logs are not provided."
    214       },
    215       "data_collection_described": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Sections 6.1–6.3 describe the evaluation procedure: interview method, 5 question categories with 5 questions each, within-subjects design, and ranking task. Prolific platform used for recruitment."
    219       },
    220       "recruitment_methods_described": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Section 6.3: 100 evaluators recruited from Prolific, US-based, fluent in English, >18 years old, paid $15/hour. Demographics reported (median age 25-34, 73 male, 25 female, 2 non-binary, education and ethnicity breakdown)."
    224       },
    225       "data_pipeline_documented": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "The evaluation pipeline is documented: agents run for 2 game days → interviews conducted → responses generated from 5 conditions → human evaluators watch replays and rank responses → TrueSkill ratings computed. Crowdworker quality check described (4 re-generated)."
    229       }
    230     },
    231     "conflicts_of_interest": {
    232       "funding_disclosed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Acknowledgments section lists Microsoft Research PhD Fellowship, Stanford HAI, Google Research, Hasso Plattner Design Thinking Research Program, Siegel Family Endowment, and OpenAI."
    236       },
    237       "affiliations_disclosed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Author affiliations clearly listed: Stanford University (Park, O'Brien, Liang, Bernstein), Google Research (Cai), Google DeepMind (Morris)."
    241       },
    242       "funder_independent_of_outcome": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "OpenAI funded this research and the paper evaluates ChatGPT (OpenAI's product). Google Research/DeepMind employs two co-authors. These funders have commercial interest in demonstrating LLM capabilities."
    246       },
    247       "financial_interests_declared": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No competing interests or financial interests statement is included in the paper."
    251       }
    252     },
    253     "contamination": {
    254       "training_cutoff_stated": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The paper does not evaluate a pre-trained model's capability on a standard benchmark. It uses ChatGPT as a component in a novel agent architecture evaluated via human judgment."
    258       },
    259       "train_test_overlap_discussed": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not applicable — no benchmark evaluation of model knowledge. The evaluation is based on human ratings of generated agent behavior."
    263       },
    264       "benchmark_contamination_addressed": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "Not applicable — the paper creates a novel sandbox environment, not an existing benchmark. Contamination of benchmark data is not a concern."
    268       }
    269     },
    270     "human_studies": {
    271       "pre_registered": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No mention of pre-registration for the human evaluation study."
    275       },
    276       "irb_or_ethics_approval": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 6.3 states participants 'provided consent by agreeing to a consent form approved by our institution's IRB.'"
    280       },
    281       "demographics_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 6.3 reports median age, gender distribution (25F/73M/2NB), education levels, and ethnicity breakdown for the 100 evaluators."
    285       },
    286       "inclusion_exclusion_criteria": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 6.3: 'We required that our evaluators be in the U.S., fluent in English, and older than 18 years old.'"
    290       },
    291       "randomization_described": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section 6.1: Within-subjects design where each participant saw 'one randomly chosen question from each of the five question categories' for a 'randomly chosen agent.' Conditions were compared within each participant."
    295       },
    296       "blinding_described": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of whether evaluators knew which responses came from which condition (full architecture, ablations, or human crowdworker). The paper does not describe blinding."
    300       },
    301       "attrition_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of dropout or attrition. The paper reports 100 evaluators but does not state whether any were excluded or dropped out."
    305       }
    306     },
    307     "cost_and_practicality": {
    308       "inference_cost_reported": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 8.2 states 'The present study required substantial time and resources to simulate 25 agents for two days, costing thousands of dollars in token credits and taking multiple days to complete.'"
    312       },
    313       "compute_budget_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Only a vague mention of 'thousands of dollars' and 'multiple days.' No specific GPU hours, total API spend, or token counts are provided."
    317       }
    318     },
    319     "experimental_rigor": {
    320       "seed_sensitivity_reported": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The simulation was run once. No multiple seeds or repeated simulations are reported. Given LLM stochasticity, results could vary across runs."
    324       },
    325       "number_of_runs_stated": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper does not explicitly state how many simulation runs were conducted. Appears to be a single run for the end-to-end evaluation."
    329       },
    330       "hyperparameter_search_budget": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Architecture hyperparameters (decay factor 0.995, reflection threshold 150, all α=1) appear chosen but no search budget or justification for these values is provided."
    334       },
    335       "best_config_selection_justified": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of how the final hyperparameter values were selected or whether other configurations were tried."
    339       },
    340       "multiple_comparison_correction": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Holm-Bonferroni correction applied to the Dunn post-hoc tests (Section 6.4)."
    344       },
    345       "self_comparison_bias_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The authors compare their architecture against their own ablated versions. The bias of authors evaluating their own system is not discussed."
    349       },
    350       "compute_budget_vs_performance": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "Not applicable — the ablation conditions use the same model with different memory access, so compute differences are negligible between conditions."
    354       },
    355       "benchmark_construct_validity": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The paper uses 'believability' as the evaluation construct but does not discuss whether human rankings in a forced-choice comparison actually measure believability of agent behavior in open-ended settings."
    359       },
    360       "scaffold_confound_addressed": {
    361         "applies": false,
    362         "answer": false,
    363         "justification": "Not applicable — the paper evaluates its own architecture as a bundled system, not comparing different models within different scaffolds."
    364       }
    365     }
    366   },
    367   "red_flags": [
    368     {
    369       "flag": "Single simulation run",
    370       "detail": "The end-to-end evaluation appears to be a single 2-day simulation run. Given the stochasticity of LLM outputs, different runs could produce substantially different emergent behaviors, information diffusion patterns, and coordination outcomes."
    371     },
    372     {
    373       "flag": "Non-independent funders",
    374       "detail": "OpenAI funded the research and ChatGPT is the underlying model. Google Research/DeepMind employs two co-authors. Both companies have commercial interest in demonstrating LLM capabilities for agent applications."
    375     },
    376     {
    377       "flag": "Shared memory across ablation conditions",
    378       "detail": "All ablation conditions used memories accumulated by the full architecture agent, meaning ablated agents had access to memories they would not have generated themselves. The authors acknowledge this likely produces 'a conservative estimate' but it means the ablation is not fully clean."
    379     },
    380     {
    381       "flag": "Crowdworker baseline quality",
    382       "detail": "The human crowdworker baseline (which performed poorly) was explicitly noted as not representing 'maximal human expert performance.' Using a weak human baseline risks creating a misleading comparison where the AI system appears to outperform humans."
    383     }
    384   ],
    385   "cited_papers": [
    386     {
    387       "title": "Social Simulacra: Creating Populated Prototypes for Social Computing Systems",
    388       "authors": ["Joon Sung Park", "Lindsay Popowski", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"],
    389       "year": 2022,
    390       "doi": "10.1145/3526113.3545616",
    391       "relevance": "Prior work on using LLMs to generate social computing prototypes — direct precursor to generative agents."
    392     },
    393     {
    394       "title": "Language Models are Few-Shot Learners",
    395       "authors": ["Tom B. Brown"],
    396       "year": 2020,
    397       "arxiv_id": "2005.14165",
    398       "relevance": "GPT-3 paper establishing few-shot prompting capabilities foundational to generative agents."
    399     },
    400     {
    401       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    402       "authors": ["Jason Wei"],
    403       "year": 2023,
    404       "arxiv_id": "2201.11903",
    405       "relevance": "Chain-of-thought prompting technique used as a building block in agent architectures."
    406     },
    407     {
    408       "title": "Training language models to follow instructions with human feedback",
    409       "authors": ["Long Ouyang"],
    410       "year": 2022,
    411       "arxiv_id": "2203.02155",
    412       "relevance": "RLHF/instruction tuning paper — the paper identifies instruction tuning as a source of agent behavioral artifacts."
    413     },
    414     {
    415       "title": "Large Language Models as Simulated Economic Agents: What Can We Learn from Homo Silicus?",
    416       "authors": ["John J. Horton"],
    417       "year": 2023,
    418       "arxiv_id": "2301.07543",
    419       "relevance": "Using LLMs to simulate human behavior in economic experiments — related approach to social simulation."
    420     },
    421     {
    422       "title": "Demonstrate-Search-Predict: Composing retrieval and language models for knowledge-intensive NLP",
    423       "authors": ["Omar Khattab"],
    424       "year": 2023,
    425       "arxiv_id": "2212.14024",
    426       "relevance": "Retrieval-augmented generation approach related to the memory retrieval architecture in generative agents."
    427     },
    428     {
    429       "title": "Using cognitive psychology to understand GPT-3",
    430       "authors": ["Marcel Binz", "Eric Schulz"],
    431       "year": 2023,
    432       "relevance": "Evaluating LLMs as simulations of human cognitive behavior — directly related to believability evaluation."
    433     },
    434     {
    435       "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models",
    436       "authors": ["Wenlong Huang"],
    437       "year": 2022,
    438       "arxiv_id": "2207.05608",
    439       "relevance": "LLM-based planning for robotics tasks — related agent architecture using language models for action decomposition."
    440     },
    441     {
    442       "title": "On the Opportunities and Risks of Foundation Models",
    443       "authors": ["Rishi Bommasani"],
    444       "year": 2022,
    445       "arxiv_id": "2108.07258",
    446       "relevance": "Foundation models survey covering the broad capabilities and risks of LLMs that underpin agent systems."
    447     },
    448     {
    449       "title": "Evaluating Large Language Models in Generating Synthetic HCI Research Data: a Case Study",
    450       "authors": ["Perttu Hämäläinen", "Mikke Tavast", "Anton Kunnari"],
    451       "year": 2023,
    452       "relevance": "Using LLMs to generate synthetic research data — related application of LLMs as human behavior proxies."
    453     }
    454   ]
    455 }

Impressum · Datenschutz