scan.json (25949B)
1 { 2 "scan_version": 2, 3 "active_modules": ["experimental_rigor", "data_leakage"], 4 "paper": { 5 "title": "FlockVote: LLM-Empowered Agent-Based Modeling for Simulating U.S. Presidential Elections", 6 "authors": ["Lingfeng Zhou", "Yi Xu", "Zhenyu Wang", "Dequan Wang"], 7 "year": 2025, 8 "venue": "ICAIS 2025", 9 "arxiv_id": "2512.05982", 10 "doi": "10.48550/arXiv.2512.05982" 11 }, 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "GitHub repository provided: https://github.com/maple-zhou/FlockVote (footnote 1, also Appendix J)." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "Demographic data sourced from publicly available 2023 ACS and 2020 ASARB datasets. Code release includes the framework." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No requirements.txt, Dockerfile, or detailed environment/dependency specifications mentioned in the paper." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions in the paper. Appendix J mentions code release but no explicit instructions for replicating experiments." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Figure 4 shows error bars across 10 trials with different random seeds for the agent population stability analysis." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "No statistical significance tests used. Comparisons between models (Table 3, Figure 6) and context variants (Figure 7) are based on point estimates with no statistical tests." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Support rate percentages with baselines are reported throughout (e.g., 'Democrats win Nevada with a margin of only 0.17%', full tables with Republican/Democrat percentages in Table 5)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 4.4.1 explicitly validates agent population size through stability analysis (10 to 2000 agents, 10 trials each), finding stabilization at 300 agents." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Figure 4 shows variance across 10 trials with distinct random seeds for different agent population sizes." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Compares against actual 2024 election results (Figure 2) and tests 7 different LLMs (Table 5, Figure 6)." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Uses contemporary models: GPT-4o-2024-08-06, Claude-3-5-sonnet-2024-10-22, Gemini-1.5-Pro-002, DeepSeek-V2.5 (Appendix A)." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "Section 4.4 ablates agent population size (4.4.1) and profile dimensions — education (Table 1) and religion (Table 2)." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": false, 81 "justification": "Only aggregated support rate percentages are used. No additional metrics such as calibration, Brier score, or demographic-level accuracy metrics." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": false, 86 "justification": "No human evaluation of the simulation outputs. The 'interviews' in Section 4.3 are with LLM agents, not human judges evaluating output quality." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": false, 91 "justification": "Ablation studies use 2020 election data (Table 1, 2) and main results use 2024, but model selection (Qwen-Max-04-28 identified as best via Table 3) was not separated from 2024 evaluation. No explicit dev/test separation." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results broken down by state (7 swing states), by model (Table 5), by context condition (Table 3), and by demographic group (Figure 5)." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 4.5 extensively discusses failures: political bias in models, context sensitivity causing wild fluctuations (36.2% to 58.6%), positional instability ('swing agents'), and model disagreements." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 4.5 reports that agents are 'flawed, non-deterministic instruments', that minor prompt rephrasing causes wild fluctuations, and that candidate ordering alone flips votes. Appendix I reports that their mitigation strategy shows only 'minor improvements'." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims about replicating the real-world outcome are supported by Figure 2 (6/7 swing states correct). Claims about interpretability are demonstrated in Section 4.3. Claims about sensitivity analysis are in Section 4.5." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper implies causal mechanisms — that demographic profiles and policy context 'enable nuanced generative reasoning' to simulate voting. Ablation studies (Section 4.4) support some component contributions, but the core causal claim that LLM agents replicate voter reasoning (vs. pattern matching on training data) is not justified." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "Title claims 'Simulating U.S. Presidential Elections' but tests only 7 swing states in one election. Conclusion calls for application to 'economics, law, and medicine.' The framework's success is demonstrated for one specific election with one primary model, yet generalization claims are broad." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "Section 4.5 extensively discusses alternative explanations: model political bias (Table 3), prompt sensitivity (Figure 7), positional instability (swing agents), and questions whether 'these agents are valid tools for social science.'" 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper measures LLM probability outputs and frames this as 'simulating voter decisions.' It does not adequately distinguish between LLM text generation based on training data patterns and actual voter decision-making processes. The possibility that correct predictions reflect training data correlations rather than reasoning is not addressed." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Appendix A lists specific versions: Qwen-Max-2024-04-28, GPT-4o-2024-08-06, Claude-3-5-sonnet-2024-10-22, Gemini-1.5-Pro-002, etc." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Full prompts provided in Appendix C (voting prompt), Appendix E (bias experiment prompts), Appendix G (context variants), and Appendix I (mitigation system prompt)." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Section 4.1: 'All experiments use a temperature of 0 for stability, except the main result (0.7 for diversity and realism).'" 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used. Each agent is a single prompt-response call to an LLM API." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 3.1 describes how demographic profiles are generated from ACS and ASARB data using joint and independent distributions, with 1000 agents per state via random sampling." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "No dedicated limitations section. The conclusion (Section 5) briefly mentions 'key challenges regarding agent bias and instability' in one sentence. Section 4.5 functions as limitation analysis but is framed as 'Sensitivity Analysis.'" 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 4.5 identifies specific threats: pro-Democratic model bias (Table 3), context variant sensitivity causing 22+ percentage point swings (Figure 7), positional instability in JSON response format (swing agents), and model-to-model variation (Figure 6)." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper does not explicitly state what the results do NOT show. The conclusion calls for application to 'economics, law, and medicine' without bounding the current findings to the specific election, models, and demographic framework tested." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "Code released on GitHub. Demographic data from publicly available ACS and ASARB datasets. Agent profiles are generated programmatically from these sources." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 3.1 describes data sources (2023 ACS, 2020 ASARB), the eight demographic attributes used, and how joint/independent distributions are applied. Section 3.2 describes contextual information sources (Pew, Gallup, NBC)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Agents are synthetically generated from demographic distributions." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 3 documents the full pipeline: demographic modeling (3.1) → contextual information (3.2) → probabilistic voting behavior (3.3), with demographic categories detailed in Appendix B." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding disclosure or acknowledgments section found in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations clearly listed: Shanghai Jiao Tong University, Shanghai Innovation Institute, Shanghai Academy of Social Sciences, Nanjing University." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding information disclosed at all, so independence cannot be assessed." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial disclosure statement found in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The paper argues the 2024 election is contemporary enough to prevent data leakage (Section 1, 4.1) but never states the actual training data cutoff dates for Qwen-Max-2024-04-28 or any other model used." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": true, 233 "justification": "Section 1 and 4.1 explicitly discuss data leakage risk: 'historical events like the 2020 election present a significant risk of data leakage, where LLMs might simply recall known outcomes rather than reasoning dynamically.'" 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": true, 238 "justification": "The 2024 election was explicitly chosen as the testbed because it is 'a contemporary event that prevents data leakage from LLM training data' (Section 4.1). The 2020 election contamination risk is acknowledged." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants. All agents are LLM-based synthetic simulations." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants. Synthetic agent demographics are reported in Appendix B." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": true, 282 "justification": "Appendix J: 'reducing token consumption to approximately 160k tokens per state' and 'accurate predictions can be produced with Llama3.2-3B-Instruct in only one hour' on a consumer-grade device." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": true, 287 "justification": "Appendix J states 'even on a consumer-grade device (M3 MacBook Pro), accurate predictions can be produced... in only one hour.' Token consumption of ~160k tokens per state is also stated." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Figure 4 shows results across 10 trials with distinct random seeds for agent population sizes from 10 to 2000." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": true, 299 "justification": "Section 4.4.1: 'each agent number repeated over 10 trials using distinct random seeds to generate unique agent profiles.'" 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "Temperature values (0 and 0.7) are stated but no hyperparameter search budget is reported. No discussion of how temperature values were selected." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": false, 309 "justification": "The primary model Qwen-Max-04-28 is described as a 'fortuitous choice' (Section 4.5) showing 'more neutrality.' This is post-hoc justification, not a systematic selection process." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": false, 319 "justification": "The authors designed the framework, chose the demographic dimensions, selected the model, and crafted the prompts, yet do not discuss author-evaluation bias." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": false, 323 "answer": false, 324 "justification": "All models are used via API calls with similar compute costs; compute differences are negligible." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": true, 329 "justification": "Section 4.5 and the Related Work (Section 2.2) extensively question whether LLM agents are valid instruments for social simulation, asking 'are these agents valid tools for social science?'" 330 }, 331 "scaffold_confound_addressed": { 332 "applies": false, 333 "answer": false, 334 "justification": "No scaffolding involved. Each agent is a single prompt-response interaction." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 4.1: 'a contemporary event that prevents data leakage from LLM training data, thereby testing the agents' generative reasoning rather than recall.' The 2024 election was explicitly chosen to avoid temporal leakage." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the contextual information provided (candidate stances) or the demographic framing could leak outcome information through the model's training data associations." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether agent responses are independent. All agents use the same model, potentially sharing systematic biases. The positional instability finding (Section 4.5) hints at this but does not formally address non-independence." 352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No concrete leakage detection method applied. The paper only uses temporal selection (choosing the 2024 election) as a prevention strategy, not a detection method." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "FlockVote correctly replicates the macro-level result that Donald Trump would win six of the seven pivotal swing states in the 2024 election.", 363 "evidence": "Figure 2 compares simulation results with actual election outcomes. The only discrepancy is Nevada, predicted for Harris by 0.17% margin (Section 4.2).", 364 "supported": "moderate" 365 }, 366 { 367 "claim": "Prediction variance stabilizes when agent population reaches 300, after which further increases yield minimal fluctuations.", 368 "evidence": "Figure 4 shows stability analysis from 10 to 2000 agents across 10 trials each in Pennsylvania (Section 4.4.1).", 369 "supported": "strong" 370 }, 371 { 372 "claim": "Adding education and religion dimensions to agent profiles improves simulation fidelity.", 373 "evidence": "Table 1 shows education fixes the Wisconsin prediction; Table 2 shows religion improves alignment with polling data (Section 4.4.2).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "LLM agents exhibit severe political bias, with most models showing a strong default pro-Democratic bias.", 378 "evidence": "Table 3 shows Qwen-Max-09-19 predicts Democratic victory even under asymmetric pro-Trump framing in Georgia (Section 4.5).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Agent decisions are highly sensitive to semantically irrelevant prompt changes, with Democratic support swinging from 36.2% to 58.6% across context variants.", 383 "evidence": "Figure 7 shows results for 8 semantically neutral but syntactically different context variants in Pennsylvania (Section 4.5).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Agents exhibit positional instability where swapping candidate order in the JSON response format causes complete vote flips.", 388 "evidence": "Appendix H provides examples of 'Swing Agents' who invert their preference solely due to JSON key ordering (Section 4.5).", 389 "supported": "strong" 390 } 391 ], 392 "methodology_tags": ["benchmark-eval", "case-study"], 393 "key_findings": "FlockVote uses LLM agents with demographic profiles to simulate the 2024 U.S. presidential election across 7 swing states, correctly predicting 6 of 7 outcomes. However, the paper's most significant findings are negative: LLM agents exhibit severe political bias (pro-Democratic default), extreme sensitivity to minor prompt rephrasing (22+ percentage point swings), and positional instability where changing JSON key order flips votes. These findings challenge the reliability of LLM-based social simulation despite the macro-level accuracy.", 394 "red_flags": [ 395 { 396 "flag": "Post-hoc model selection", 397 "detail": "The primary model Qwen-Max-04-28 was identified as the best choice through the same 2024 election data used for main results. The paper acknowledges this was a 'fortuitous choice' rather than a principled selection, raising concerns about overfitting to the specific outcome." 398 }, 399 { 400 "flag": "Extreme prompt sensitivity undermines main result", 401 "detail": "The paper's own sensitivity analysis (Figure 7) shows that trivial prompt variations cause 22+ percentage point swings. This calls into question whether the main result's accuracy is robust or coincidental with the chosen prompt formulation." 402 }, 403 { 404 "flag": "No statistical tests for any comparisons", 405 "detail": "All comparisons between models, contexts, and conditions are based on point estimates. With the demonstrated high variance across prompt variants, the absence of statistical testing makes it impossible to determine if observed differences are meaningful." 406 }, 407 { 408 "flag": "Training data contamination not ruled out", 409 "detail": "While the 2024 election was chosen to avoid contamination, the paper never verifies the training cutoff dates of the models used. Qwen-Max-2024-04-28 could have been trained on polling data that predicted similar outcomes, which would explain the 'correct' predictions without genuine reasoning." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Generative agents: Interactive simulacra of human behavior", 415 "authors": ["Joon Sung Park", "Joseph O'Brien", "Carrie Jun Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S Bernstein"], 416 "year": 2023, 417 "relevance": "Foundational work on LLM agents simulating human behavior, directly relevant to agentic AI capabilities." 418 }, 419 { 420 "title": "Out of one, many: Using language models to simulate human samples", 421 "authors": ["Lisa P Argyle", "Ethan C Busby", "Nancy Fulda", "Joshua R Gubler", "Christopher Rytting", "David Wingate"], 422 "year": 2023, 423 "relevance": "Validated using LLM agents for social science research by replicating human survey responses." 424 }, 425 { 426 "title": "Large language models as simulated economic agents: What can we learn from homo silicus?", 427 "authors": ["John J Horton"], 428 "year": 2023, 429 "relevance": "Proposed LLMs as simulated economic agents, foundational concept for LLM-based social simulation." 430 }, 431 { 432 "title": "Hidden persuaders: LLMs' political leaning and their influence on voters", 433 "authors": ["Yujin Potter", "Shiyang Lai", "Junsol Kim", "James Evans", "Dawn Song"], 434 "year": 2024, 435 "arxiv_id": "2410.24190", 436 "relevance": "Demonstrates that biased LLM agents can actually change real voters' opinions, relevant to AI safety and influence." 437 }, 438 { 439 "title": "Why do multi-agent LLM systems fail?", 440 "authors": ["Mert Cemri", "Melissa Z Pan", "Shuyi Yang"], 441 "year": 2025, 442 "arxiv_id": "2503.13657", 443 "relevance": "Analyzes failure modes of multi-agent LLM systems including reasoning-action mismatch." 444 }, 445 { 446 "title": "ProSA: Assessing and understanding the prompt sensitivity of LLMs", 447 "authors": ["Jingming Zhuo", "Songyang Zhang", "Xinyu Fang"], 448 "year": 2024, 449 "arxiv_id": "2410.12405", 450 "relevance": "Directly relevant to understanding LLM sensitivity to prompt variations, a key finding in FlockVote." 451 }, 452 { 453 "title": "LLM stability: A detailed analysis with some surprises", 454 "authors": ["Berk Atil", "Alexa Chittams", "Liseng Fu", "Ferhan Ture"], 455 "year": 2024, 456 "relevance": "Documents non-deterministic instability in LLMs even at zero temperature, relevant to reliability of LLM-based simulation." 457 }, 458 { 459 "title": "Measuring gender and racial biases in large language models: Intersectional evidence from automated resume evaluation", 460 "authors": ["Jiafu An", "Difang Huang", "Chen Lin", "Mingzhu Tai"], 461 "year": 2025, 462 "relevance": "Demonstrates pervasive demographic biases in LLMs, directly relevant to bias in LLM-based voter simulation." 463 }, 464 { 465 "title": "ElectionSim: Massive population election simulation powered by large language model driven agents", 466 "authors": ["Xinnong Zhang", "Jiayu Lin", "Libo Sun"], 467 "year": 2024, 468 "arxiv_id": "2410.20746", 469 "relevance": "Concurrent work on LLM-powered election simulation, directly comparable methodology." 470 }, 471 { 472 "title": "Benchmarking distributional alignment of large language models", 473 "authors": ["Nicole Meister", "Carlos Guestrin", "Tatsunori B Hashimoto"], 474 "year": 2025, 475 "relevance": "Validates probabilistic output format for LLM simulation tasks, methodology adopted by FlockVote." 476 } 477 ] 478 }