scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33485B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Generative Agents: Interactive Simulacra of Human Behavior",
      6     "authors": [
      7       "J. Park",
      8       "Joseph C. O'Brien",
      9       "Carrie J. Cai",
     10       "M. Morris",
     11       "Percy Liang"
     12     ],
     13     "year": 2023,
     14     "venue": "ACM Symposium on User Interface Software and Technology",
     15     "arxiv_id": "2304.03442",
     16     "doi": "10.1145/3586183.3606763"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims about believable behavior, emergent social dynamics (party coordination, information diffusion), and ablation contributions are all supported by results in Sections 6 and 7.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about component contributions are supported by ablation study with controlled single-variable manipulation. The authors acknowledge the conservative estimate limitation of using shared memory across conditions (Section 6.2).",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Interactive Simulacra of Human Behavior' and abstract claims about 'believable human behavior' are broad, but the evaluation is limited to 25 agents in a single sandbox environment over 2 game days using only ChatGPT (gpt-3.5-turbo). The paper doesn't bound claims to this specific setting.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations for the results. For example, the believability advantage could partly stem from the full architecture producing longer/more detailed responses rather than inherently more believable behavior.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures 'believability' via human rankings but does not discuss the gap between ranked preference in a forced-choice comparison and actual believability of behavior in open-ended interaction. The crowdworker baseline is described as not representing 'maximal human expert performance' but the proxy/outcome gap is not explicitly discussed.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8.2 'Future Work and Limitations' provides substantive discussion of limitations including cost, evaluation timescale, robustness concerns, and biases.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats discussed: the study cost thousands of dollars (Section 8.2), evaluation was limited to short timescale, crowdworker baseline didn't represent maximal human performance, agents may be vulnerable to prompt/memory hacking, and LLM biases are inherited.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 8.2 states the evaluation was 'limited to a relatively short timescale and a baseline human crowdworker condition' and that 'the robustness of generative agents is still largely unknown.' The paper also notes the crowdworker condition 'did not represent the maximal human performance.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section lists Microsoft Research PhD Fellowship, Stanford HAI, Google Research, Hasso Plattner Design Thinking Research Program, Siegel Family Endowment, and OpenAI.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations clearly listed: Stanford University (Park, O'Brien, Liang, Bernstein), Google Research (Cai), Google DeepMind (Morris).",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OpenAI funded this research and the paper evaluates ChatGPT (OpenAI's product). Google Research/DeepMind employs two co-authors. These funders have commercial interest in demonstrating LLM capabilities.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Generative agents' are explicitly defined as 'computational software agents that simulate believable human behavior'; 'believability' is grounded in prior literature (Bates 1994) as providing 'an illusion of life'; memory stream, reflection, and planning are each defined precisely.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction enumerates four explicit contributions: the generative agent concept, the novel three-component architecture, two evaluations, and an ethics/societal risk discussion.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages substantively across three streams (HCI, believable agents, LLMs for behavior), situating the work relative to cognitive architectures (SOAR, ACT-R), rule-based approaches, and prior LLM prompting work, showing how each falls short.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper provides a public repository URL: https://github.com/joonspk-research/generative_agents (footnote 2).",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No dataset or simulation logs are released. The demo link (reverie.herokuapp.com) shows a replay but does not provide downloadable data.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. The implementation uses Phaser web framework and ChatGPT API but no dependency specifications are provided.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper. The architecture is described in detail but there are no specific instructions for replicating the experiments.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "TrueSkill ratings include standard deviations (e.g., μ=29.89, σ=0.72) for each condition in Section 6.5.1.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Kruskal-Wallis test (H(4)=150.29, p<0.001) and Dunn post-hoc tests with Holm-Bonferroni correction are reported in Sections 6.4-6.5.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Cohen's d=8.16 reported for full architecture vs no-memory baseline in Section 6.5.1.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "100 evaluators were recruited but no power analysis or justification for this sample size is provided.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "TrueSkill σ values reported for each condition (e.g., σ=0.72, 0.69, 0.68, 0.69, 0.70).",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Five conditions compared: full architecture, three ablations (no reflection, no reflection/planning, no memory/reflection/planning), and human crowdworker baseline (Section 6.2).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The no-memory/reflection/planning condition represents prior work on LLM-based agents [12, 46, 80]. Human crowdworker condition provides a human baseline.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Three ablation conditions systematically remove architectural components: reflection, planning, and observation memory (Section 6.2).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Controlled evaluation uses TrueSkill believability ratings across 5 question categories (self-knowledge, memory, plans, reactions, reflections). End-to-end evaluation measures information diffusion, relationship formation, and coordination.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "100 human evaluators ranked believability of agent responses across conditions (Section 6.3). This is the primary evaluation method.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Not applicable — this is not a train/test split evaluation. The evaluation is based on human judgment of agent behavior.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Although five interview question categories are described (self-knowledge, memory, plans, reactions, reflections), results are reported only as aggregate TrueSkill scores, not broken down by category.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Sections 6.5.2, 6.5.3, and 7.2 discuss failure modes: memory retrieval failures, hallucinated embellishments, instruction tuning artifacts, location choice errors, and behavioral norms violations.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Section 7.2 reports that agents chose inappropriate locations (bar for lunch), violated physical norms (multiple agents in single-person bathroom), and exhibited overly cooperative behavior due to instruction tuning.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper states 'gpt3.5-turbo version of ChatGPT' (Section 4) but does not provide a specific snapshot date or API version (e.g., gpt-3.5-turbo-0301).",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Multiple full prompt texts are provided throughout Sections 4.1-4.3, including importance scoring, reflection generation, planning, reaction, and dialogue prompts.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Temperature, top-p, and other API parameters are not reported. The decay factor (0.995) and reflection threshold (150) are stated, but LLM sampling parameters are missing.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The agent architecture is described in extensive detail: memory stream, retrieval function (recency/importance/relevance with α weights), reflection mechanism, recursive planning decomposition, and reaction/re-planning loop (Sections 4.1-4.3).",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The initialization procedure is documented: agent descriptions are split into semicolon-delimited memories, the environment tree structure is described, and the sandbox server's JSON data pipeline is explained (Section 5).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw evaluation data (individual rankings from 100 evaluators) is not released. Agent memory streams and simulation logs are not provided.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Sections 6.1-6.3 describe the evaluation procedure: interview method, 5 question categories with 5 questions each, within-subjects design, and ranking task. The Prolific platform was used for recruitment.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Section 6.3: 100 evaluators recruited from Prolific, US-based, fluent in English, >18 years old, paid $15/hour. Demographics reported (median age 25-34, 73 male, 25 female, 2 non-binary, education and ethnicity breakdown).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The evaluation pipeline is documented: agents run for 2 game days → interviews conducted → responses generated from 5 conditions → human evaluators watch replays and rank responses → TrueSkill ratings computed. Crowdworker quality check described (4 re-generated).",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper does not evaluate a pre-trained model's capability on a standard benchmark. It uses ChatGPT as a component in a novel agent architecture evaluated via human judgment.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not applicable — no benchmark evaluation of model knowledge. The evaluation is based on human ratings of generated agent behavior.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not applicable — the paper creates a novel sandbox environment, not an existing benchmark. Contamination of benchmark data is not a concern.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No mention of pre-registration for the human evaluation study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": true,
    322           "justification": "Section 6.3 states participants 'provided consent by agreeing to a consent form approved by our institution's IRB.'",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Section 6.3 reports median age, gender distribution (25F/73M/2NB), education levels, and ethnicity breakdown for the 100 evaluators.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Section 6.3: 'We required that our evaluators be in the U.S., fluent in English, and older than 18 years old.'",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "Section 6.1: Within-subjects design where each participant saw 'one randomly chosen question from each of the five question categories' for a 'randomly chosen agent.' Conditions were compared within each participant.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No mention of whether evaluators knew which responses came from which condition (full architecture, ablations, or human crowdworker). The paper does not describe blinding.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No mention of dropout or attrition. The paper reports 100 evaluators but does not state whether any were excluded or dropped out.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 8.2 states 'The present study required substantial time and resources to simulate 25 agents for two days, costing thousands of dollars in token credits and taking multiple days to complete.'",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only a vague mention of 'thousands of dollars' and 'multiple days.' No specific GPU hours, total API spend, or token counts are provided.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "The simulation appears to have been run once; no multiple seeds or repeated simulations are reported. Given LLM stochasticity, results could vary across runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The paper does not explicitly state how many simulation runs were conducted. Appears to be a single run for the end-to-end evaluation.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Architecture hyperparameters (decay factor 0.995, reflection threshold 150, all α=1) appear chosen but no search budget or justification for these values is provided.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "No discussion of how the final hyperparameter values were selected or whether other configurations were tried.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": true,
    398           "justification": "Holm-Bonferroni correction applied to the Dunn post-hoc tests (Section 6.4).",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors compare their architecture against their own ablated versions. The bias of authors evaluating their own system is not discussed.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": false,
    409           "answer": false,
    410           "justification": "Not applicable — the ablation conditions use the same model with different memory access, so compute differences are negligible between conditions.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses 'believability' as the evaluation construct but does not discuss whether human rankings in a forced-choice comparison actually measure believability of agent behavior in open-ended settings.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "Not applicable — the paper evaluates its own architecture as a bundled system, not comparing different models within different scaffolds.",
    423           "source": "opus"
    424         }
    425       }
    426     }
    427   },
    428   "claims": [
    429     {
    430       "claim": "The full generative agent architecture (memory+reflection+planning) produces significantly more believable behavior than ablated versions and a human crowdworker baseline",
    431       "evidence": "TrueSkill ratings: full μ=29.89 vs. no-memory baseline μ=21.21; Cohen's d=8.16; Kruskal-Wallis H(4)=150.29, p<0.001; all pairwise differences significant at p<0.001 except crowdworker vs. fully-ablated",
    432       "supported": "strong"
    433     },
    434     {
    435       "claim": "Each architecture component (observation, reflection, planning) independently and critically contributes to believability",
    436       "evidence": "Monotonic degradation: full (29.89) > no-reflection (26.88) > no-reflection/planning (25.64) > no-memory (21.21); all adjacent pairwise differences statistically significant",
    437       "supported": "strong"
    438     },
    439     {
    440       "claim": "Generative agents demonstrate emergent information diffusion without user intervention",
    441       "evidence": "Over 2 simulated days, knowledge of the Valentine's Day party spread from 1 to 13/25 agents (52%) and candidacy knowledge from 1 to 8/25 (32%); verified against memory streams to exclude hallucinations",
    442       "supported": "moderate"
    443     },
    444     {
    445       "claim": "Agents form new social relationships spontaneously during simulation",
    446       "evidence": "Network density increased from 0.167 to 0.74 over two game days; 1.3% (6/453) of claimed relationships were hallucinated and excluded",
    447       "supported": "moderate"
    448     },
    449     {
    450       "claim": "Agents can coordinate multi-agent activities from a single user-specified seed intent",
    451       "evidence": "From one agent's party intent, 5 of 12 invited agents (42%) showed up; 4 of 7 non-attendees expressed interest but failed to plan accordingly, suggesting coordination is partial and fragile",
    452       "supported": "weak"
    453     },
    454     {
    455       "claim": "Reflection enables synthesis of experiences for deeper social reasoning beyond raw observation",
    456       "evidence": "Qualitative example: without reflection, agent denied knowing a frequent interaction partner's preferences; with reflection, provided specific gift recommendation based on synthesized shared interests",
    457       "supported": "moderate"
    458     }
    459   ],
    460   "methodology_tags": [
    461     "case-study",
    462     "qualitative"
    463   ],
    464   "key_findings": "Generative agents with a memory stream, reflection, and planning architecture produce significantly more believable simulated human behavior than ablated versions or a human crowdworker baseline (Cohen's d=8.16), as rated by 100 human evaluators in a within-subjects controlled study. An end-to-end evaluation with 25 agents over two simulated days demonstrates emergent social phenomena: information diffused without intervention (52% learned of a party, 32% of a mayoral candidacy), social network density increased from 0.167 to 0.74, and multi-agent coordination produced a functioning party event from a single seed intent. Key failure modes — memory retrieval failures, hallucinated embellishments, instruction-tuning-induced over-formality and over-cooperation, and poor spatial norm inference — are identified through inductive qualitative analysis and limit the system's reliability in open-ended contexts.",
    465   "red_flags": [
    466     {
    467       "flag": "Generalization severely overclaimed",
    468       "detail": "Claims about 'believable proxies of human behavior' and application to VR metaverses, social robots, and ubiquitous computing are drawn from evidence of 25 fictional English-speaking agents in a 2-day custom sandbox, with believability assessed only on structured interview questions."
    469     },
    470     {
    471       "flag": "Funder conflict undisclosed",
    472       "detail": "OpenAI is listed as a funder while the system is built on OpenAI's ChatGPT (gpt-3.5-turbo); Google Research is both a funder and a co-author affiliation, and Google DeepMind employs another co-author, with no competing interests statement."
    473     },
    474     {
    475       "flag": "Weak human baseline",
    476       "detail": "The crowdworker 'human baseline' performed no better than the fully ablated architecture (not statistically different at p<0.001 threshold), suggesting it measured crowdworker compliance with an unfamiliar roleplay task rather than human behavioral quality."
    477     },
    478     {
    479       "flag": "No blinding in human evaluation",
    480       "detail": "Evaluators viewed all five conditions simultaneously and ranked them; the paper does not describe whether condition identity was concealed, which leaves room for demand characteristics and contrast effects."
    481     },
    482     {
    483       "flag": "LLM generation hyperparameters unreported",
    484       "detail": "Temperature, top-p, and other ChatGPT API generation parameters are not reported anywhere in the paper, making exact replication of results impossible."
    485     },
    486     {
    487       "flag": "Coordination result misframed as success",
    488       "detail": "Only 5/12 invited agents (42%) attended the Valentine's Day party; 4 of the 7 non-attendees expressed intent to attend but failed to act on it — a substantial coordination failure rate presented primarily as a positive finding."
    489     },
    490     {
    491       "flag": "Evaluation data withheld",
    492       "detail": "The crowdworker ranking data from 100 evaluators is not released, preventing independent verification of the TrueSkill analysis."
    493     }
    494   ],
    495   "cited_papers": [
    496     {
    497       "title": "Social Simulacra: Creating Populated Prototypes for Social Computing Systems",
    498       "relevance": "Direct predecessor by the same group; generative agents extend Social Simulacra's stateless persona approach to stateful, memory-equipped agents with long-term coherence"
    499     },
    500     {
    501       "title": "Large Language Models as Simulated Economic Agents: What Can We Learn from Homo Silicus?",
    502       "relevance": "Establishes LLMs as simulators of human behavior in structured contexts; used as a prior-work baseline represented by the no-memory ablation condition"
    503     },
    504     {
    505       "title": "Using Cognitive Psychology to Understand GPT-3",
    506       "relevance": "Investigates LLM behavioral simulation capabilities; represents the prior state of the art the architecture is designed to surpass"
    507     },
    508     {
    509       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    510       "relevance": "Foundational prompting technique whose limitations (conditioning only on current environment, not past experience) motivate the generative agent architecture"
    511     },
    512     {
    513       "title": "On the Opportunities and Risks of Foundation Models",
    514       "relevance": "Contextualizes the LLM capabilities that enable generative agents and frames the broader research agenda"
    515     },
    516     {
    517       "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models",
    518       "relevance": "Related work on LLM-based action-sequence planning for robotics; a comparable approach that the authors build upon for hierarchical plan decomposition"
    519     },
    520     {
    521       "title": "TrueSkill: A Bayesian Skill Rating System",
    522       "relevance": "Statistical method used to convert rank data to interval scores for interpretable comparison of the five evaluation conditions"
    523     },
    524     {
    525       "title": "Training Language Models to Follow Instructions with Human Feedback",
    526       "relevance": "Instruction tuning paper whose effects — over-formal dialogue, over-cooperative behavior — are identified as failure modes in the deployed agents"
    527     }
    528   ],
    529   "engagement_factors": {
    530     "practical_relevance": {
    531       "score": 3,
    532       "justification": "Directly applicable to game NPCs, social prototyping tools, UX testing, and virtual environments; a live interactive demo was publicly available."
    533     },
    534     "surprise_contrarian": {
    535       "score": 2,
    536       "justification": "The emergence of social behaviors (information diffusion, relationship formation, party coordination) from LLM prompting alone challenged assumptions about what was required to produce believable multi-agent social dynamics."
    537     },
    538     "fear_safety": {
    539       "score": 2,
    540       "justification": "The paper explicitly raises concerns about parasocial relationships, deepfakes, tailored persuasion, and memory hacking as concrete risks of generative agent deployment."
    541     },
    542     "drama_conflict": {
    543       "score": 1,
    544       "justification": "Standard academic paper with no notable public controversy; the OpenAI funder conflict is unacknowledged rather than openly contested."
    545     },
    546     "demo_ability": {
    547       "score": 3,
    548       "justification": "A live interactive demo is linked (reverie.herokuapp.com) and the full simulation code is on public GitHub, making the system immediately explorable."
    549     },
    550     "brand_recognition": {
    551       "score": 3,
    552       "justification": "Stanford, Google Research, Google DeepMind, and Percy Liang (Stanford CRFM) are highly recognizable; published at UIST, a top-tier HCI venue, with 2,300+ citations by 2024."
    553     }
    554   },
    555   "hn_data": {
    556     "threads": [
    557       {
    558         "hn_id": "37128293",
    559         "title": "Show HN: AI-town, run your own custom AI world SIM with JavaScript",
    560         "points": 429,
    561         "comments": 115,
    562         "url": "https://news.ycombinator.com/item?id=37128293",
    563         "created_at": "2023-08-14T23:46:02Z"
    564       },
    565       {
    566         "hn_id": "35517649",
    567         "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    568         "points": 391,
    569         "comments": 252,
    570         "url": "https://news.ycombinator.com/item?id=35517649",
    571         "created_at": "2023-04-10T21:32:13Z"
    572       },
    573       {
    574         "hn_id": "36230750",
    575         "title": "I'm afraid I can't do that: Prompt refusal in generative language models",
    576         "points": 179,
    577         "comments": 166,
    578         "url": "https://news.ycombinator.com/item?id=36230750",
    579         "created_at": "2023-06-07T18:03:25Z"
    580       },
    581       {
    582         "hn_id": "34702988",
    583         "title": "Discovery of an Exceptionally Rare Nearby and Energetic Gamma-Ray Burst",
    584         "points": 90,
    585         "comments": 32,
    586         "url": "https://news.ycombinator.com/item?id=34702988",
    587         "created_at": "2023-02-08T01:44:54Z"
    588       },
    589       {
    590         "hn_id": "40212925",
    591         "title": "Show HN: LLM-powered NPCs running on your hardware",
    592         "points": 24,
    593         "comments": 4,
    594         "url": "https://news.ycombinator.com/item?id=40212925",
    595         "created_at": "2024-04-30T16:34:46Z"
    596       },
    597       {
    598         "hn_id": "35511843",
    599         "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    600         "points": 13,
    601         "comments": 2,
    602         "url": "https://news.ycombinator.com/item?id=35511843",
    603         "created_at": "2023-04-10T13:11:19Z"
    604       },
    605       {
    606         "hn_id": "36232330",
    607         "title": "Show HN: GalenAI – An AI Powered Search Engine for Clinicians",
    608         "points": 2,
    609         "comments": 2,
    610         "url": "https://news.ycombinator.com/item?id=36232330",
    611         "created_at": "2023-06-07T19:37:30Z"
    612       },
    613       {
    614         "hn_id": "40481871",
    615         "title": "Exploring Autonomous Agents Through the Lens of Large Language Models",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=40481871",
    619         "created_at": "2024-05-26T13:08:05Z"
    620       },
    621       {
    622         "hn_id": "39214022",
    623         "title": "Exploring Encrypted Keyboards to Defeat Client-Side Scanning in E2EE Systems",
    624         "points": 2,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=39214022",
    627         "created_at": "2024-02-01T09:04:19Z"
    628       },
    629       {
    630         "hn_id": "35512936",
    631         "title": "CrossCode: Multi-Level Visualization of Program Execution",
    632         "points": 1,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=35512936",
    635         "created_at": "2023-04-10T14:44:26Z"
    636       }
    637     ],
    638     "top_points": 429,
    639     "total_points": 1133,
    640     "total_comments": 573
    641   }
    642 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs