scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30047B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Generative Agents: Interactive Simulacra of Human Behavior",
      6     "authors": [
      7       "Joon Sung Park",
      8       "Joseph C. O'Brien",
      9       "Carrie J. Cai",
     10       "Meredith Ringel Morris",
     11       "Percy Liang",
     12       "Michael S. Bernstein"
     13     ],
     14     "year": 2023,
     15     "venue": "ACM Symposium on User Interface Software and Technology",
     16     "arxiv_id": "2304.03442",
     17     "doi": "10.1145/3586183.3606763"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims agents produce believable individual and emergent social behaviors; these are directly tested via a controlled ablation study (TrueSkill d=8.16) and end-to-end simulation (information diffusion, relationship formation, coordination).",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The ablation study systematically removes memory, reflection, and planning components and measures believability with statistical tests; the paper acknowledges that giving ablated conditions the same memories yields conservative estimates.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims 'simulacra of human behavior' broadly, but evaluation is limited to 25 agents in a single Sims-like sandbox over 2 game days; the discussion extrapolates to social computing, VR, and ubiquitous computing without supporting evidence.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not consider whether longer prompt context (rather than the specific architectural components) drives the believability improvement; failure modes are discussed but competing interpretations of main results are not.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper consistently frames 'believability' as the dependent variable from prior work and makes claims at that level; the limitations section explicitly acknowledges the crowdworker baseline does not represent maximal human performance.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 8.2 is titled 'Future Work and Limitations' and contains a substantive multi-paragraph discussion.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats are named: evaluation limited to short timescale, crowdworker baseline not maximal human performance, ablation confound (ablated conditions share same memory path), and robustness to prompt/memory hacking is unknown.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states assessment 'was limited to a relatively short timescale,' that robustness testing is future work, and that the crowdworker condition 'did not represent the maximal human performance that could serve as the gold standard.'",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments disclose Microsoft Research PhD Fellowship, Stanford HAI, Google Research, HPDTRP, Siegel Family Endowment, and OpenAI as funders.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are disclosed on the title page: Stanford University (Park, O'Brien, Liang, Bernstein), Google Research (Cai), and Google DeepMind (Morris).",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "OpenAI provided funding support while the paper evaluates OpenAI's ChatGPT (gpt-3.5-turbo) as the underlying model; the funder has a direct commercial interest in the evaluated system.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears beyond the acknowledgment of funding sources.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "'Generative agents' are defined in the introduction; 'believability/believable agents' is defined with reference to the Disney-character tradition from prior literature; the three architectural components (memory stream, reflection, planning) are each formally defined.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The introduction explicitly lists four contributions: the generative agent concept, the novel architecture, two evaluations, and a discussion of ethical/societal risks.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 covers four decades of believable agent research (rule-based, learning-based, cognitive architectures) and explicitly situates the contribution relative to each tradition, noting where prior approaches fall short and how this work extends them.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "Footnote 2 provides a public GitHub link to the simulation code: https://github.com/joonspk-research/generative_agents.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "Human evaluation rankings from 100 participants and simulation memory stream logs are not released as a public dataset.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions gpt-3.5-turbo and the Phaser web framework but provides no requirements.txt, Dockerfile, or equivalent dependency specification.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper provides architectural descriptions and sample prompts but no step-by-step reproduction instructions sufficient to recreate the simulation or evaluation without consulting the external GitHub repository.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "TrueSkill ratings report both μ and σ for each condition (e.g., full architecture: μ=29.89, σ=0.72), providing uncertainty estimates for the main results.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "A Kruskal-Wallis test (H(4)=150.29, p<0.001) and Dunn post-hoc tests with Holm-Bonferroni correction are applied to the rank data.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Cohen's d=8.16 is reported for the comparison between the full architecture and the no-memory baseline, derived from TrueSkill N(μ,σ²) model.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "100 evaluators were recruited from Prolific but no power analysis or justification for this sample size is provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "TrueSkill σ is reported for all five conditions in the controlled evaluation; though the end-to-end emergent behavior results lack variance, the primary quantitative claims include spread.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Three ablation baselines and a human crowdworker baseline are included; the fully ablated condition represents prior LLM-only state of the art.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
     189           "justification": "The no-memory condition represents the then-current LLM-as-agent state of the art, citing contemporaneous work (Binz & Schulz, Horton, Park et al. 2022).",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Three ablations systematically remove reflection, planning, and observation components; monotonically decreasing believability across ablations establishes each component's contribution.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Controlled evaluation uses believability rankings (TrueSkill); end-to-end evaluation measures information diffusion (%), network density, and coordination (party attendance), covering multiple dimensions.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "100 participants recruited from Prolific evaluated agent interview responses by ranking believability of 5 conditions in a within-subjects design.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Not applicable; this is not a prediction task with a train/test split.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": false,
    219           "justification": "The five interview categories (self-knowledge, memory, plans, reactions, reflections) are analyzed qualitatively but quantitative TrueSkill scores are reported only in aggregate, not broken down by category.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
     225           "justification": "Section 6.5.2 details memory retrieval failures and hallucination types; Section 7.2 describes three modes of erratic behavior (location selection errors, norm misclassification, instruction-tuning over-cooperativeness).",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The crowdworker condition performed statistically indistinguishably from the fully ablated baseline—a negative finding; 7 of 12 invited agents did not attend the party, reported without spin.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "The paper specifies 'gpt3.5-turbo version of ChatGPT' as the underlying model.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Multiple complete prompts are provided in the paper body: importance scoring, reflection generation, daily planning, reaction/replan, dialogue generation, and environment traversal.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "Decay factor (0.995), reflection threshold (150), and α weights (all 1) are reported, but LLM generation parameters (temperature, top-p) are not reported.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The agent architecture is described in detail across Section 4 (memory stream with retrieval scoring formula, reflection mechanism, planning/reacting modules) and Section 5 (sandbox server implementation).",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Preprocessing of initial agent descriptions (semicolon-delimited phrases entered as seed memories) and environment representation (tree structure converted to natural language) are documented.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw human evaluation rankings from 100 participants and simulation memory stream logs are not released for independent verification.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Human evaluation procedure is described in detail: Prolific platform, 100 US participants, within-subjects design, ~30-minute sessions watching agent replays, ranking believability of 5 conditions.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Participants recruited from Prolific, required to be US resident, English-fluent, 18+, paid at $15/hour, and provided IRB-approved consent.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline from simulation → agent interview responses → evaluator ranking → TrueSkill conversion → Kruskal-Wallis testing is fully described, including qualitative coding procedures (two-phase open coding).",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "Not applicable; the paper evaluates agent architecture believability via human raters, not model performance on a held-out benchmark where training contamination is the primary concern.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable; evaluation task is human judgment of agent behavior, not benchmark completion.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable; no standard benchmark is used where pre-training contamination would be a concern.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No mention of pre-registration on OSF, AsPredicted, or any registry.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": true,
    323           "justification": "Participants 'provided consent by agreeing to a consent form approved by our institution's IRB.'",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Age (median category 25-34), gender distribution (25 female, 73 male, 2 non-binary), education level breakdown, and race/ethnicity (73% Caucasian, etc.) are all reported.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Inclusion criteria stated: US resident, English-fluent, 18+; quality control excluded 4 crowdworker-authored responses not meeting coherence/voice criteria.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "Evaluators were assigned a randomly chosen agent's life to watch, and one randomly chosen question from each of the five categories was displayed.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "No description of blinding is provided; the paper does not state whether condition labels were masked from evaluators or whether presentation order was counterbalanced.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No mention of participant attrition or dropout from the 100 evaluator sample.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Section 8.2 states the 25-agent 2-day simulation required 'thousands of dollars in token credits and taking multiple days to complete.'",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Both monetary cost ('thousands of dollars in token credits') and wall-clock time ('multiple days') are reported for the simulation.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "The full generative agent architecture (memory + reflection + planning) produces the most believable behavior, with a large effect size over the no-memory baseline (Cohen's d=8.16).",
    376       "evidence": "TrueSkill ratings: full μ=29.89 vs. no-memory μ=21.21; Kruskal-Wallis H(4)=150.29, p<0.001; all pairwise differences significant at p<0.001 except crowdworker vs. fully ablated.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Each architectural component (observation, reflection, planning) independently contributes to believability, with monotonically degrading performance as components are removed.",
    381       "evidence": "Full (μ=29.89) > no-reflection (μ=26.88) > no-reflection/planning (μ=25.64) > no-memory/reflection/planning (μ=21.21); all pairwise differences significant at p<0.001.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Generative agents spontaneously diffuse information through the community without user intervention.",
    386       "evidence": "Sam's candidacy spread from 1 to 8 agents (32%), Isabella's party from 1 to 13 (52%) over 2 game days; all knowledge claims verified against memory streams with 0 hallucinations.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Agents form new social relationships over time, with network density increasing from 0.167 to 0.74.",
    391       "evidence": "Undirected graph analysis of mutual agent knowledge before and after 2-day simulation; 1.3% (n=6/453) of relationship claims were hallucinated.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Agents coordinate group activities across multiple autonomous steps from a single seed instruction.",
    396       "evidence": "5 of 12 invited agents attended Isabella's Valentine's Day party; the full chain (intent → invitation → acceptance → planning → attendance) ran without user scripting.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "The simulation architecture is resource-intensive, requiring thousands of dollars in API credits and multiple days for 25 agents over 2 game days.",
    401       "evidence": "Explicitly stated in Section 8.2: 'costing thousands of dollars in token credits and taking multiple days to complete.'",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "case-study",
    407     "qualitative",
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "A generative agent architecture combining a memory stream, reflection, and planning significantly outperforms LLM-only baselines and all ablations in producing believable behavior (Cohen's d=8.16), with each component contributing independently and monotonically. In a 25-agent sandbox simulation, agents exhibit emergent social behaviors—information diffusion, relationship formation, and event coordination—without direct scripting. The architecture inherits LLM failure modes including hallucination, over-formal dialogue, and instruction-tuning biases toward excessive agreeableness, and requires prohibitive compute (thousands of dollars, multiple days for 2 game days) at even small scale.",
    411   "red_flags": [
    412     {
    413       "flag": "OpenAI conflict of interest",
    414       "detail": "OpenAI provided funding support while the paper evaluates OpenAI's ChatGPT (gpt-3.5-turbo) as the core model; no competing interests statement is included."
    415     },
    416     {
    417       "flag": "Single simulation run, no variance",
    418       "detail": "All end-to-end emergent behavior claims (information diffusion %, network density, party attendance) come from a single 2-day simulation run with 25 agents; no replication or variance across runs is reported."
    419     },
    420     {
    421       "flag": "No blinding in human evaluation",
    422       "detail": "The paper does not describe whether condition labels were masked from evaluators or whether presentation order was counterbalanced, raising order-effect concerns in a within-subjects design."
    423     },
    424     {
    425       "flag": "Ablation confound acknowledged but not resolved",
    426       "detail": "Giving ablated architectures the same memories as the full architecture is acknowledged to yield conservative estimates; in practice ablated agents would have accumulated different memories, making the comparison difficult to interpret cleanly."
    427     },
    428     {
    429       "flag": "LLM generation hyperparameters unreported",
    430       "detail": "Temperature, top-p, and other sampling parameters for gpt-3.5-turbo are not reported, preventing reproduction and obscuring the role of output stochasticity."
    431     },
    432     {
    433       "flag": "No power analysis",
    434       "detail": "100 human evaluators chosen without power analysis or justification for sample size adequacy in a within-subjects 5-condition ranking study."
    435     },
    436     {
    437       "flag": "No statistical tests for emergent behavior",
    438       "detail": "End-to-end results are reported as descriptive percentages with no statistical tests, confidence intervals, or variance estimates."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "Social Simulacra: Creating Populated Prototypes for Social Computing Systems",
    444       "relevance": "Direct precursor work using LLMs to generate stateless user personas for social system prototyping; this paper explicitly extends it to persistent, memory-equipped agents."
    445     },
    446     {
    447       "title": "Using cognitive psychology to understand GPT-3",
    448       "relevance": "Establishes LLM baselines for human behavioral simulation; cited as representing the prior state of the art that the no-memory ablation condition emulates."
    449     },
    450     {
    451       "title": "Large Language Models as Simulated Economic Agents: What Can We Learn from Homo Silicus?",
    452       "relevance": "Closely related work using LLMs to replicate social science studies; cited as prior art for LLM-based human behavior simulation without persistent memory."
    453     },
    454     {
    455       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    456       "relevance": "Foundational prompting technique underlying the reflection and planning components of the architecture."
    457     },
    458     {
    459       "title": "Inner Monologue: Embodied Reasoning through Planning with Language Models",
    460       "relevance": "Related work on LLM-based action planning for robotics, cited as prior art for LLM-driven hierarchical planning."
    461     },
    462     {
    463       "title": "On the Opportunities and Risks of Foundation Models",
    464       "relevance": "Provides foundational context for why LLMs encode human behavior from training data, motivating the core architectural assumption."
    465     },
    466     {
    467       "title": "Evaluating Large Language Models in Generating Synthetic HCI Research Data: a Case Study",
    468       "relevance": "Related work on LLM-generated synthetic human behavioral data for HCI research, directly adjacent to this paper's evaluation methodology."
    469     },
    470     {
    471       "title": "Training language models to follow instructions with human feedback (InstructGPT)",
    472       "relevance": "Explains instruction tuning, which the paper identifies as a source of erratic agent behavior (over-formality, excessive cooperativeness) in the deployed system."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "Directly spawned AI-town and numerous downstream implementations; the architecture is immediately applicable to games, social prototyping tools, and training simulations with released code."
    479     },
    480     "surprise_contrarian": {
    481       "score": 2,
    482       "justification": "Emergent social coordination from a single seed instruction (Valentine's party) challenges assumptions about how much scaffolding LLM agents need; the crowdworker baseline performing no better than the fully ablated LLM was a notable negative result."
    483     },
    484     "fear_safety": {
    485       "score": 1,
    486       "justification": "Discusses parasocial relationship risks, deepfake concerns, and tailored persuasion, but these are framed as future concerns with proposed mitigations rather than demonstrated harms."
    487     },
    488     "drama_conflict": {
    489       "score": 1,
    490       "justification": "No significant controversy; paper is well-received and results are largely positive; minor ethical concerns acknowledged without generating substantial community debate."
    491     },
    492     "demo_ability": {
    493       "score": 3,
    494       "justification": "A live demo is explicitly linked in the paper and GitHub code is publicly released, enabling direct experimentation and replication."
    495     },
    496     "brand_recognition": {
    497       "score": 3,
    498       "justification": "Stanford (Percy Liang, Michael Bernstein), Google Research, and Google DeepMind affiliations; became one of the most-cited LLM agent papers of 2023."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "37128293",
    505         "title": "Show HN: AI-town, run your own custom AI world SIM with JavaScript",
    506         "points": 429,
    507         "comments": 115,
    508         "url": "https://news.ycombinator.com/item?id=37128293",
    509         "created_at": "2023-08-14T23:46:02Z"
    510       },
    511       {
    512         "hn_id": "35517649",
    513         "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    514         "points": 391,
    515         "comments": 252,
    516         "url": "https://news.ycombinator.com/item?id=35517649",
    517         "created_at": "2023-04-10T21:32:13Z"
    518       },
    519       {
    520         "hn_id": "36230750",
    521         "title": "I'm afraid I can't do that: Prompt refusal in generative language models",
    522         "points": 179,
    523         "comments": 166,
    524         "url": "https://news.ycombinator.com/item?id=36230750",
    525         "created_at": "2023-06-07T18:03:25Z"
    526       },
    527       {
    528         "hn_id": "34702988",
    529         "title": "Discovery of an Exceptionally Rare Nearby and Energetic Gamma-Ray Burst",
    530         "points": 90,
    531         "comments": 32,
    532         "url": "https://news.ycombinator.com/item?id=34702988",
    533         "created_at": "2023-02-08T01:44:54Z"
    534       },
    535       {
    536         "hn_id": "40212925",
    537         "title": "Show HN: LLM-powered NPCs running on your hardware",
    538         "points": 24,
    539         "comments": 4,
    540         "url": "https://news.ycombinator.com/item?id=40212925",
    541         "created_at": "2024-04-30T16:34:46Z"
    542       },
    543       {
    544         "hn_id": "35511843",
    545         "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    546         "points": 13,
    547         "comments": 2,
    548         "url": "https://news.ycombinator.com/item?id=35511843",
    549         "created_at": "2023-04-10T13:11:19Z"
    550       },
    551       {
    552         "hn_id": "36232330",
    553         "title": "Show HN: GalenAI – An AI Powered Search Engine for Clinicians",
    554         "points": 2,
    555         "comments": 2,
    556         "url": "https://news.ycombinator.com/item?id=36232330",
    557         "created_at": "2023-06-07T19:37:30Z"
    558       },
    559       {
    560         "hn_id": "40481871",
    561         "title": "Exploring Autonomous Agents Through the Lens of Large Language Models",
    562         "points": 2,
    563         "comments": 0,
    564         "url": "https://news.ycombinator.com/item?id=40481871",
    565         "created_at": "2024-05-26T13:08:05Z"
    566       },
    567       {
    568         "hn_id": "39214022",
    569         "title": "Exploring Encrypted Keyboards to Defeat Client-Side Scanning in E2EE Systems",
    570         "points": 2,
    571         "comments": 0,
    572         "url": "https://news.ycombinator.com/item?id=39214022",
    573         "created_at": "2024-02-01T09:04:19Z"
    574       },
    575       {
    576         "hn_id": "35512936",
    577         "title": "CrossCode: Multi-Level Visualization of Program Execution",
    578         "points": 1,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=35512936",
    581         "created_at": "2023-04-10T14:44:26Z"
    582       }
    583     ],
    584     "top_points": 429,
    585     "total_points": 1133,
    586     "total_comments": 573
    587   }
    588 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs