scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32879B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FlockVote: LLM-Empowered Agent-Based Modeling for Simulating U.S. Presidential Elections",
      6     "authors": [
      7       "Lingfeng Zhou",
      8       "Yi Xu",
      9       "Zhenyu Wang",
     10       "Dequan Wang"
     11     ],
     12     "year": 2025,
     13     "venue": "ICAIS 2025 / arXiv.org",
     14     "arxiv_id": "2512.05982",
     15     "doi": "10.48550/arXiv.2512.05982"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about replicating the real-world outcome are supported by Figure 2 (6/7 swing states correct). Claims about interpretability are demonstrated in Section 4.3. Claims about sensitivity analysis are in Section 4.5.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper implies causal mechanisms — that demographic profiles and policy context 'enable nuanced generative reasoning' to simulate voting. Ablation studies (Section 4.4) support some component contributions, but the core causal claim that LLM agents replicate voter reasoning (vs. pattern matching on training data) is not justified.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Title claims 'Simulating U.S. Presidential Elections' but tests only 7 swing states in one election. Conclusion calls for application to 'economics, law, and medicine.' The framework's success is demonstrated for one specific election with one primary model, yet generalization claims are broad.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 4.5 extensively discusses alternative explanations: model political bias (Table 3), prompt sensitivity (Figure 7), and positional instability (swing agents). It also questions whether 'these agents are valid tools for social science.'",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper measures LLM probability outputs and frames this as 'simulating voter decisions.' It does not adequately distinguish between LLM text generation based on training data patterns and actual voter decision-making processes. The possibility that correct predictions reflect training data correlations rather than reasoning is not addressed.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations section. The conclusion (Section 5) briefly mentions 'key challenges regarding agent bias and instability' in one sentence. Section 4.5 functions as limitation analysis but is framed as 'Sensitivity Analysis.'",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 4.5 identifies specific threats: pro-Democratic model bias (Table 3), context variant sensitivity causing 22+ percentage point swings (Figure 7), positional instability in JSON response format (swing agents), and model-to-model variation (Figure 6).",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show. The conclusion calls for application to 'economics, law, and medicine' without bounding the current findings to the specific election, models, and demographic framework tested.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure or acknowledgments section found in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations clearly listed: Shanghai Jiao Tong University, Shanghai Innovation Institute, Shanghai Academy of Social Sciences, Nanjing University.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding information disclosed at all, so independence cannot be assessed.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial disclosure statement found in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "'High fidelity' is used repeatedly without defining a quantitative threshold. 'Computational laboratory' is used as a rhetorical frame but not operationalized. What would constitute failure of fidelity is never stated.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper explicitly states three contributions: (1) macro-level fidelity validation, (2) interpretable micro-level analysis via agent interviews, and (3) a reliability audit of LLM agents as social simulation instruments.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 explicitly contrasts FlockVote with traditional ABM, statistical models, and concurrent LLM election simulation work (Yu et al., Jiang et al., Bradshaw et al.), articulating how this work differs in focus on interpretability and reliability auditing.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "GitHub repository provided: https://github.com/maple-zhou/FlockVote (footnote 1, also Appendix J).",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Demographic data sourced from publicly available 2023 ACS and 2020 ASARB datasets. Code release includes the framework.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or detailed environment/dependency specifications mentioned in the paper.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions in the paper. Appendix J mentions code release but no explicit instructions for replicating experiments.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Figure 4 shows error bars across 10 trials with different random seeds for the agent population stability analysis.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests used. Comparisons between models (Table 3, Figure 6) and context variants (Figure 7) are based on point estimates with no statistical tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Support rate percentages with baselines are reported throughout (e.g., 'Democrats win Nevada with a margin of only 0.17%', full tables with Republican/Democrat percentages in Table 5).",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Section 4.4.1 explicitly validates agent population size through stability analysis (10 to 2000 agents, 10 trials each), finding stabilization at 300 agents.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Figure 4 shows variance across 10 trials with distinct random seeds for different agent population sizes.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Compares against actual 2024 election results (Figure 2) and tests 7 different LLMs (Table 5, Figure 6).",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Uses contemporary models: GPT-4o-2024-08-06, Claude-3-5-sonnet-2024-10-22, Gemini-1.5-Pro-002, DeepSeek-V2.5 (Appendix A).",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 4.4 ablates agent population size (4.4.1) and profile dimensions — education (Table 1) and religion (Table 2).",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "Only aggregated support rate percentages are used. No additional metrics such as calibration, Brier score, or demographic-level accuracy metrics.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation of the simulation outputs. The 'interviews' in Section 4.3 are with LLM agents, not human judges evaluating output quality.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "Ablation studies use 2020 election data (Table 1, 2) and main results use 2024, but model selection (Qwen-Max-04-28 identified as best via Table 3) was not separated from 2024 evaluation. No explicit dev/test separation.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by state (7 swing states), by model (Table 5), by context condition (Table 3), and by demographic group (Figure 5).",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 4.5 extensively discusses failures: political bias in models, context sensitivity causing wild fluctuations (36.2% to 58.6%), positional instability ('swing agents'), and model disagreements.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Section 4.5 reports that agents are 'flawed, non-deterministic instruments', that minor prompt rephrasing causes wild fluctuations, and that candidate ordering alone flips votes. Appendix I reports that their mitigation strategy shows only 'minor improvements'.",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Appendix A lists specific versions: Qwen-Max-2024-04-28, GPT-4o-2024-08-06, Claude-3-5-sonnet-2024-10-22, Gemini-1.5-Pro-002, etc.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Full prompts provided in Appendix C (voting prompt), Appendix E (bias experiment prompts), Appendix G (context variants), and Appendix I (mitigation system prompt).",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Section 4.1: 'All experiments use a temperature of 0 for stability, except the main result (0.7 for diversity and realism).'",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. Each agent is a single prompt-response call to an LLM API.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.1 describes how demographic profiles are generated from ACS and ASARB data using joint and independent distributions, with 1000 agents per state via random sampling.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Code released on GitHub. Demographic data from publicly available ACS and ASARB datasets. Agent profiles are generated programmatically from these sources.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.1 describes data sources (2023 ACS, 2020 ASARB), the eight demographic attributes used, and how joint/independent distributions are applied. Section 3.2 describes contextual information sources (Pew, Gallup, NBC).",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Agents are synthetically generated from demographic distributions.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Section 3 documents the full pipeline: demographic modeling (3.1) → contextual information (3.2) → probabilistic voting behavior (3.3), with demographic categories detailed in Appendix B.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The paper argues the 2024 election is contemporary enough to prevent data leakage (Section 1, 4.1) but never states the actual training data cutoff dates for Qwen-Max-2024-04-28 or any other model used.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section 1 and 4.1 explicitly discuss data leakage risk: 'historical events like the 2020 election present a significant risk of data leakage, where LLMs might simply recall known outcomes rather than reasoning dynamically.'",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The 2024 election was explicitly chosen as the testbed because it is 'a contemporary event that prevents data leakage from LLM training data' (Section 4.1). The 2020 election contamination risk is acknowledged.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants. All agents are LLM-based synthetic simulations.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants. Synthetic agent demographics are reported in Appendix B.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Appendix J: 'reducing token consumption to approximately 160k tokens per state' and 'accurate predictions can be produced with Llama3.2-3B-Instruct in only one hour' on a consumer-grade device.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Appendix J states 'even on a consumer-grade device (M3 MacBook Pro), accurate predictions can be produced... in only one hour.' Token consumption of ~160k tokens per state is also stated.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": true,
    373           "justification": "Figure 4 shows results across 10 trials with distinct random seeds for agent population sizes from 10 to 2000.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": true,
    379           "justification": "Section 4.4.1: 'each agent number repeated over 10 trials using distinct random seeds to generate unique agent profiles.'",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "Temperature values (0 and 0.7) are stated but no hyperparameter search budget is reported. No discussion of how temperature values were selected.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "The primary model Qwen-Max-04-28 is described as a 'fortuitous choice' (Section 4.5) showing 'more neutrality.' This is post-hoc justification, not a systematic selection process.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors designed the framework, chose the demographic dimensions, selected the model, and crafted the prompts, yet do not discuss author-evaluation bias.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": false,
    408           "answer": false,
    409           "justification": "All models are used via API calls with similar compute costs; compute differences are negligible.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": true,
    415           "justification": "Section 4.5 and the Related Work (Section 2.2) extensively question whether LLM agents are valid instruments for social simulation, asking 'are these agents valid tools for social science?'",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No scaffolding involved. Each agent is a single prompt-response interaction.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": true,
    429           "justification": "Section 4.1: 'a contemporary event that prevents data leakage from LLM training data, thereby testing the agents' generative reasoning rather than recall.' The 2024 election was explicitly chosen to avoid temporal leakage.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether the contextual information provided (candidate stances) or the demographic framing could leak outcome information through the model's training data associations.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of whether agent responses are independent. All agents use the same model, potentially sharing systematic biases. The positional instability finding (Section 4.5) hints at this but does not formally address non-independence.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No concrete leakage detection method applied. The paper only uses temporal selection (choosing the 2024 election) as a prevention strategy, not a detection method.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "FlockVote correctly replicates the macro-level outcome of the 2024 U.S. Presidential Election, predicting 6 of 7 swing states correctly.",
    456       "evidence": "Figure 2 compares predicted outcomes against actual results; the only error is Nevada, predicted Democratic by 0.17% while actually Republican by a narrow margin.",
    457       "supported": "moderate"
    458     },
    459     {
    460       "claim": "Adding education and religion as demographic dimensions is necessary for accurate simulation of the 2024 election.",
    461       "evidence": "Tables 1-2 show 6-dimension profiles fail to predict Wisconsin winner; adding education corrects this. Religion reduces pro-Democratic bias and improves polling alignment.",
    462       "supported": "weak"
    463     },
    464     {
    465       "claim": "Most LLMs exhibit a strong default pro-Democratic political bias without contextual information.",
    466       "evidence": "Table 3 shows Qwen-Max-09-19 predicts Democratic victory in Georgia even under prompts asymmetrically framed to favor Trump. Figure 6 shows that, by contrast, Llama3.2 and Llama3.1 consistently predict strong Republican wins across all states.",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "LLM agent outputs are highly sensitive to semantically irrelevant prompt variations, causing Democratic support to swing from 36.2% to 58.6%.",
    471       "evidence": "Figure 7 shows eight context variants, all semantically neutral, producing a 22.4pp range in Democratic support using Qwen-Max-04-28 in Pennsylvania.",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "Positional bias causes 'swing agents' to completely invert their voting preference based solely on candidate ordering in the JSON response format.",
    476       "evidence": "Appendix H documents three agents where swapping Trump/Harris order in the JSON schema inverts probability assignments; Swing Agent 1 flips from 0.5 Harris to 0.5 Trump.",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "The primary model choice (Qwen-Max-2024-04-28) was critical to the framework's success due to its political neutrality compared to other Qwen versions.",
    481       "evidence": "Table 3 shows Qwen-Max-09-19 predicts Democratic victory even under asymmetric pro-Trump framing; the paper admits the choice of 04-28 was 'fortuitous' — a post-hoc justification.",
    482       "supported": "weak"
    483     }
    484   ],
    485   "methodology_tags": [
    486     "case-study",
    487     "observational",
    488     "benchmark-eval"
    489   ],
    490   "key_findings": "FlockVote demonstrates that LLM agents with demographic profiles can replicate the macro-level outcome of the 2024 U.S. Presidential Election (6/7 swing states correct) while also revealing severe methodological limitations: most models exhibit strong pro-Democratic bias without context, outputs swing 22+ percentage points from minor prompt rephrasing, and positional bias alone causes individual agents to fully invert preferences. The paper's most durable contribution is not the prediction but the auditing framework that exposes LLM agents as unreliable social simulation instruments — yet the authors still advocate broad deployment in economics, law, and medicine based on one election test case.",
    491   "red_flags": [
    492     {
    493       "flag": "No baseline comparison",
    494       "detail": "The framework's 6/7 accuracy is never compared against polling averages, prediction markets, or simple demographic regression baselines, making it impossible to know if LLM simulation adds predictive value over cheaper methods."
    495     },
    496     {
    497       "flag": "N=1 validation",
    498       "detail": "The primary evaluation consists of a single election with 7 swing states, each producing a binary outcome. The statistical significance of the 6/7 binary predictions is never tested, and the result is consistent with near-chance performance."
    499     },
    500     {
    501       "flag": "Post-hoc model selection",
    502       "detail": "The paper explicitly admits that Qwen-Max-2024-04-28 was a 'fortuitous choice' discovered after Qwen-Max-09-19 failed. The primary results are thus from a model selected after observing its alignment with desired outcomes."
    503     },
    504     {
    505       "flag": "No variance reported in ablation",
    506       "detail": "Figure 4 runs 10 trials per condition but reports only mean values, never showing error bars or standard deviations, obscuring whether the stabilization at 300 agents is meaningful or noisy."
    507     },
    508     {
    509       "flag": "2020 election ablation with acknowledged leakage",
    510       "detail": "Tables 1-2 use the 2020 election as ground truth for ablation despite the paper explicitly acknowledging this introduces data leakage risk from LLM training data."
    511     },
    512     {
    513       "flag": "Broad generalization from single test",
    514       "detail": "The conclusion calls for applying FlockVote to 'economics, law, and medicine' based solely on one political simulation, a leap far beyond what the evidence supports."
    515     },
    516     {
    517       "flag": "No funding disclosure",
    518       "detail": "No acknowledgment section or funding statement appears in the paper, which is unusual for academic work involving significant LLM API costs across 8 models and 7 states."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "Out of One, Many: Using Language Models to Simulate Human Samples",
    524       "relevance": "Foundational work by Argyle et al. establishing LLMs as simulated human respondents ('silicon sampling'), directly cited as justification for demographic profiling approach."
    525     },
    526     {
    527       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    528       "relevance": "Park et al. 2023 Stanford generative agents paper — the primary precedent for LLM-based social simulation that FlockVote extends to electoral contexts."
    529     },
    530     {
    531       "title": "Hidden Persuaders: LLMs' Political Leaning and Their Influence on Voters",
    532       "relevance": "Potter et al. 2024 — demonstrates LLM political biases and their real-world influence on voters, cited as motivation for bias auditing in FlockVote."
    533     },
    534     {
    535       "title": "A Large-Scale Empirical Study on Large Language Models for Election Prediction",
    536       "relevance": "Yu et al. 2024 — concurrent work using LLMs for election prediction, contrasted with FlockVote's interpretability focus."
    537     },
    538     {
    539       "title": "LLM Generated Distribution-Based Prediction of US Electoral Results",
    540       "relevance": "Bradshaw et al. 2024 — alternative distribution-based macro-simulation approach to the same 2024 election testbed."
    541     },
    542     {
    543       "title": "LLM Stability: A Detailed Analysis with Some Surprises",
    544       "relevance": "Atil et al. 2024 — documents non-deterministic instability in LLMs at zero temperature, cited as motivation for stability analysis in Section 4.5."
    545     },
    546     {
    547       "title": "ProSA: Assessing and Understanding the Prompt Sensitivity of LLMs",
    548       "relevance": "Zhuo et al. 2024 — systematic study of prompt sensitivity in LLMs, cited as background for the context-variant instability findings."
    549     },
    550     {
    551       "title": "Forecasting Elections with Agent-Based Modeling: Two Live Experiments",
    552       "relevance": "Gao et al. 2022 — prior ABM election forecasting work that FlockVote's demographic dimension selection is partially based on."
    553     }
    554   ],
    555   "engagement_factors": {
    556     "practical_relevance": {
    557       "score": 2,
    558       "justification": "The released code and consumer-hardware feasibility make this usable, but severe reliability issues documented in the paper undercut practical deployment for any consequential application."
    559     },
    560     "surprise_contrarian": {
    561       "score": 2,
    562       "justification": "The finding that LLMs can replicate an election outcome while being shown to be unreliable, biased, and easily manipulated creates a genuinely surprising tension — the paper succeeds and refutes itself simultaneously."
    563     },
    564     "fear_safety": {
    565       "score": 2,
    566       "justification": "Potter et al. (cited) showed LLM agents can change real voters' opinions; this paper demonstrates those same agents have severe, exploitable biases — a real concern for political influence operations."
    567     },
    568     "drama_conflict": {
    569       "score": 2,
    570       "justification": "Using LLMs to simulate a contentious U.S. election with a politically sensitive outcome, combined with the revealed model biases, creates inherent controversy around LLMs in democratic processes."
    571     },
    572     "demo_ability": {
    573       "score": 3,
    574       "justification": "Code is released, runs on consumer hardware (M3 MacBook in one hour with Llama3.2), and the task is tangible and immediately understandable to any reader."
    575     },
    576     "brand_recognition": {
    577       "score": 1,
    578       "justification": "SJTU has moderate recognition; the paper is not from a major AI lab, though it uses and compares GPT-4o, Claude, and Gemini which adds indirect brand association."
    579     }
    580   },
    581   "hn_data": {
    582     "threads": [
    583       {
    584         "hn_id": "10762409",
    585         "title": "Scientific publications should be anonymous",
    586         "points": 128,
    587         "comments": 76,
    588         "url": "https://news.ycombinator.com/item?id=10762409",
    589         "created_at": "2015-12-19T02:50:25Z"
    590       },
    591       {
    592         "hn_id": "31318574",
    593         "title": "Flares from black hole binaries: black hole shadows via light-curve tomography",
    594         "points": 43,
    595         "comments": 1,
    596         "url": "https://news.ycombinator.com/item?id=31318574",
    597         "created_at": "2022-05-09T19:24:38Z"
    598       },
    599       {
    600         "hn_id": "29549353",
    601         "title": "Self-attention Does Not Need O(n^2) Memory",
    602         "points": 3,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=29549353",
    605         "created_at": "2021-12-14T08:01:16Z"
    606       },
    607       {
    608         "hn_id": "25405164",
    609         "title": "Emergent Quantumness in Neural Networks",
    610         "points": 3,
    611         "comments": 0,
    612         "url": "https://news.ycombinator.com/item?id=25405164",
    613         "created_at": "2020-12-13T08:33:43Z"
    614       },
    615       {
    616         "hn_id": "46720522",
    617         "title": "Accurate and efficient thermal modeling for 2.5D/3D heterogeneous chiplets",
    618         "points": 1,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=46720522",
    621         "created_at": "2026-01-22T15:29:20Z"
    622       },
    623       {
    624         "hn_id": "47240426",
    625         "title": "Learning-Based Multi-Stage Strategy for Aircraft to Evade Missile",
    626         "points": 1,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=47240426",
    629         "created_at": "2026-03-03T23:09:24Z"
    630       },
    631       {
    632         "hn_id": "29576916",
    633         "title": "Self-Attention does not need O(n^2) Memory",
    634         "points": 1,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=29576916",
    637         "created_at": "2021-12-16T10:35:59Z"
    638       }
    639     ],
    640     "top_points": 128,
    641     "total_points": 180,
    642     "total_comments": 77
    643   }
    644 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs