scan-v5.json (27008B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", 6 "authors": [ 7 "Yilun Du", 8 "Shuang Li", 9 "Antonio Torralba", 10 "Joshua B. Tenenbaum", 11 "Igor Mordatch" 12 ], 13 "year": 2023, 14 "venue": "International Conference on Machine Learning", 15 "arxiv_id": "2305.14325", 16 "doi": "10.48550/arXiv.2305.14325" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims of improved reasoning and factuality are backed by Tables 1 and 2 showing consistent accuracy gains (e.g., arithmetic 67.0% → 81.8%, MMLU 63.9% → 71.1%, biography 66.0% → 73.8%).", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Ablations varying number of agents (Figure 10a) and debate rounds (Figure 10b) with all else held constant provide evidence for causal attribution of performance gains to the debate mechanism.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The conclusion claims debate will 'pave the way for further breakthroughs in language generation and understanding' based on 6 tasks with a single model family (GPT-3.5), well beyond what the evidence supports.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "No discussion of whether gains could be explained by additional inference compute alone (e.g., repeated sampling without debate), or whether effect is specific to instruction-tuned/RLHF models.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly acknowledges that biography evaluation using chatGPT as judge is a proxy metric with limitations, while most other 
tasks use direct accuracy against ground truth.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 5 'Limitations and Discussion' explicitly addresses computational cost, context length constraints causing models to focus only on recent turns, and convergence to incorrect answers.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "Limitations describe practical challenges rather than methodological threats to validity; benchmark contamination, LLM-judge circularity in biography evaluation, and sample representativeness are not addressed.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not explicitly state what the results do not show or restrict conclusions to GPT-3.5 and the tested task types; claims are presented as broadly applicable to LLMs.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment, grant numbers, or financial support disclosure appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations (MIT CSAIL and Google Brain) are clearly listed on the title page.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, making this criterion not applicable.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or financial disclosure appears in the paper despite a Google Brain co-author evaluating general-purpose LLM improvements.", 94 "source": "haiku" 
95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Multiagent debate is operationally defined in Section 2: agents are LLM instances, debate rounds are described step-by-step, and the consensus prompt mechanism is specified.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The introduction explicitly lists three contributions: the debate approach, a new biography benchmark, and evaluation across six tasks.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 4 situates the work relative to chain-of-thought, self-consistency, reflection, majority voting, and Irving et al.'s AI safety debate, explaining how this work differs from each.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "A project website URL is mentioned but no explicit code release is stated in the paper; website links without confirmed code availability = NO.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The majority of evaluation uses standard public benchmarks (GSM8K, MMLU, BIG-Bench) available unmodified; the novel biography dataset's release is not stated but is a minor fraction.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Only the model snapshot (gpt-3.5-turbo-0301) is specified; no requirements file, SDK version, or runtime environment specifications are provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Prompts are provided in Table 15 and procedures described, but no step-by-step 
reproduction guide exists and API sampling parameters are absent.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": true, 150 "justification": "Tables 1 and 2 report all main results with ± standard deviation (e.g., '81.8 ± 2.3' for debate on arithmetic).", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No formal statistical significance tests (t-tests, bootstrap, ANOVA) are reported despite making comparative claims across conditions with n=100-300 samples.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Absolute accuracy differences from a reported baseline are provided (e.g., debate improves arithmetic by ~14.8pp from 67.0% to 81.8%), giving effect size context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Sample sizes (100 arithmetic, 100 GSM8K, 300 chess, 100 MMLU, 100 chess validity) are stated but never justified; no power analysis is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Standard deviation is reported as ± values for all quantitative results in Tables 1 and 2.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Three baselines are compared: single agent (direct), single agent with reflection, and multi-agent majority voting.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Self-Refine (Madaan et al., 2023), Reflexion (Shinn et al., 2023), and self-consistency (Wang et al., 2022) are contemporary methods at the time of submission.", 189 "source": "haiku" 190 }, 191 
"ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Multiple ablations are presented: number of agents (Figure 10a), debate rounds (Figure 10b), prompt type/stubbornness (Figure 12), summarization strategy (Figure 13), and initialization prompt diversity.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Six tasks with different metrics are used: classification accuracy (arithmetic, GSM8K, MMLU, chess validity, biographies) and pawn score (chess move quality).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "No human evaluation of system outputs is conducted; biography quality is judged by chatGPT itself, not human raters.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "MMLU and GSM8K have established held-out test sets; the paper evaluates on sampled subsets of these standard test splits.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": false, 218 "justification": "MMLU is evaluated with questions 'randomly distributed across subject areas' but only aggregate accuracy is reported; no per-subject breakdown is provided.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Figures 21-23 show three incorrect GSM8K debates; Section 5 explicitly discusses cases where models converge confidently on wrong answers.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Self-reflection regresses MMLU accuracy from 63.9% to 57.7% — a negative result that is reported in Table 2 without minimization.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 
"justification": "The appendix explicitly states all experiments use 'the gpt-3.5-turbo-0301 model', providing a specific dated snapshot.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Table 15 in the appendix provides both starting prompts and debate prompts for all six tasks.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "Temperature, top-p, max tokens, and other API sampling parameters are not reported, making exact reproduction impossible.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Section 2 describes the debate scaffolding in detail: how agents receive concatenated peer responses, the consensus prompt format, number of agents (3) and rounds (2) used by default.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Biography ground truth extraction from Wikipedia and the chatGPT-based fact-checking evaluation pipeline are documented in the appendix; standard benchmark preprocessing is minimal.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "The novel 524-person biography dataset is not stated to be released; individual model outputs used in evaluation are not made available for independent verification.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The appendix describes Wikipedia fact extraction for biographies, chess game sourcing from pgnmentor.com, and sampling procedures for standard benchmarks.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants were 
recruited; standard benchmarks and web-scraped data were used.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full biography evaluation pipeline is documented: Wikipedia extraction → generation → chatGPT judge with yes/no/uncertain → filtering uncertain responses → accuracy computation.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The training data cutoff for gpt-3.5-turbo-0301 is never stated in the paper.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of whether GPT-3.5 may have been trained on GSM8K or MMLU test examples, both of which were publicly released before GPT-3.5's training.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "GSM8K and MMLU were publicly available well before GPT-3.5's training cutoff; the resulting contamination risk is never acknowledged or analyzed.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human 
participants in this study.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "The paper acknowledges the approach is 'more computationally expensive' but does not quantify cost in dollars, API calls, or tokens.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Total computational budget, number of API calls across all experiments, or runtime costs are not reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Multiagent debate significantly improves mathematical reasoning over all baselines", 375 "evidence": "Table 1: arithmetic 67.0% → 81.8%, GSM8K 77.0% → 85.0%, chess pawn score 91.4 → 122.9, all with reported ± values", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Multiagent debate improves factual accuracy across biography, MMLU, and chess validity tasks", 380 "evidence": "Table 2: biographies 66.0% → 73.8%, MMLU 63.9% → 71.1%, chess validity 29.3% → 45.2%", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Debate can correct errors even when all agents initially answer incorrectly", 385 "evidence": "Qualitative figures 4, 5 show multiple cases where all agents start wrong but converge to correct answers; also noted in GSM8K examples in appendix", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Performance monotonically increases with number of agents and debate rounds", 390 "evidence": "Figure 10a shows accuracy rising from 1 to 5 agents; Figure 10b shows accuracy rising from 1 to 4 rounds then 
plateauing", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Self-reflection without other agents hurts factuality performance relative to direct single-agent baseline", 395 "evidence": "Table 2: reflection drops MMLU from 63.9% to 57.7%; biography improves marginally from 66.0% to 68.3%", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Debate is additive with chain-of-thought prompting", 400 "evidence": "Figure 6: debate improves GSM8K both with (CoT) and without chain-of-thought, and the combination outperforms both individually", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval" 406 ], 407 "key_findings": "Multiagent debate — where multiple LLM instances iteratively propose, critique, and update each other's answers over several rounds — consistently outperforms single-agent inference, self-reflection, and majority voting across six reasoning and factuality tasks. Gains increase with more agents and more debate rounds, and debate can correct errors even when all agents initially agree on a wrong answer, suggesting emergent error-correction rather than simple amplification of correct answers. Self-reflection without other agents actually degrades factuality on some tasks (MMLU: -6.2pp), highlighting that multi-agent dynamics are key, not iterative refinement per se. The method requires only black-box API access and is orthogonal to existing prompting techniques like chain-of-thought.", 408 "red_flags": [ 409 { 410 "flag": "LLM-as-judge circularity", 411 "detail": "Biography factuality is evaluated by prompting chatGPT to judge whether generated biographies match Wikipedia ground truth, creating potential evaluator bias since chatGPT is the same model used for generation." 412 }, 413 { 414 "flag": "No significance tests", 415 "detail": "Comparative claims are made on n=100-300 samples per task with no formal statistical tests; overlapping ± ranges in some conditions (e.g., reflection vs. 
single agent on biographies) are not assessed for significance." 416 }, 417 { 418 "flag": "Benchmark contamination unaddressed", 419 "detail": "GSM8K and MMLU were publicly available before GPT-3.5's training cutoff; potential memorization of test examples inflating baseline accuracy is never acknowledged." 420 }, 421 { 422 "flag": "Novel biography dataset unreleased", 423 "detail": "The paper introduces a 524-person biography benchmark as a contribution but does not state it is publicly available, preventing replication of this specific evaluation." 424 }, 425 { 426 "flag": "API sampling parameters absent", 427 "detail": "Temperature, top-p, and max tokens for gpt-3.5-turbo-0301 are not reported, making exact reproduction of probabilistic results impossible." 428 }, 429 { 430 "flag": "Broad generalization from narrow evidence", 431 "detail": "Conclusions claim debate will 'advance the capabilities of LLMs' broadly based on 6 tasks with one model family in 2023; no discussion of conditions where debate might not help." 
432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 437 "relevance": "Key baseline and complementary technique; debate is shown to combine additively with chain-of-thought on GSM8K" 438 }, 439 { 440 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 441 "relevance": "Directly motivates the majority-voting baseline; debate is contrasted as using LLM synthesis rather than majority vote" 442 }, 443 { 444 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 445 "relevance": "Single-agent reflection baseline; debate outperforms self-refine, especially on factuality tasks where reflection hurts" 446 }, 447 { 448 "title": "Training Verifiers to Solve Math Word Problems (GSM8K)", 449 "relevance": "Primary mathematical reasoning benchmark providing the GSM8K evaluation set" 450 }, 451 { 452 "title": "Measuring Massive Multitask Language Understanding (MMLU)", 453 "relevance": "Primary factual knowledge benchmark used to evaluate factuality improvements" 454 }, 455 { 456 "title": "AI Safety via Debate", 457 "relevance": "Closest prior work proposing debate for AI; contrasted as requiring human judges whereas this paper achieves automated consensus" 458 }, 459 { 460 "title": "Large Language Models Are Zero-Shot Reasoners", 461 "relevance": "Zero-shot chain-of-thought baseline combined with debate; demonstrates orthogonality of the two approaches" 462 }, 463 { 464 "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models (BIG-Bench)", 465 "relevance": "Source of chess-state tracking benchmark used for chess move validity evaluation" 466 }, 467 { 468 "title": "Reflexion: An Autonomous Agent with Dynamic Memory and Self-Reflection", 469 "relevance": "Second reflection baseline compared against multiagent debate" 470 } 471 ], 472 "engagement_factors": { 473 "practical_relevance": { 474 "score": 3, 475 "justification": 
"Directly applicable to any practitioner with black-box API access; prompts are fully provided and method requires no fine-tuning or model internals." 476 }, 477 "surprise_contrarian": { 478 "score": 2, 479 "justification": "The finding that all agents can start wrong yet converge correctly, and that self-reflection hurts factuality, both challenge intuitions about iterative LLM improvement." 480 }, 481 "fear_safety": { 482 "score": 1, 483 "justification": "Connects to AI safety debate framing (Irving et al.) but focuses on capability improvement rather than safety risks; no safety concerns are raised." 484 }, 485 "drama_conflict": { 486 "score": 1, 487 "justification": "Challenges single-agent reflection approaches, but the framing is constructive and the paper proposes debate as complementary rather than adversarial to prior work." 488 }, 489 "demo_ability": { 490 "score": 2, 491 "justification": "Project website exists and Table 15 provides all prompts; practitioners can implement the approach immediately using any chat API." 492 }, 493 "brand_recognition": { 494 "score": 2, 495 "justification": "MIT CSAIL authors including Tenenbaum and Torralba, plus Google Brain co-author; ICML publication adds significant credibility." 
496 } 497 }, 498 "hn_data": { 499 "threads": [ 500 { 501 "hn_id": "34541836", 502 "title": "MusicLM: Generating music from text", 503 "points": 291, 504 "comments": 107, 505 "url": "https://news.ycombinator.com/item?id=34541836", 506 "created_at": "2023-01-27T02:44:37Z" 507 }, 508 { 509 "hn_id": "39454961", 510 "title": "How does one detect hallucinations?", 511 "points": 5, 512 "comments": 2, 513 "url": "https://news.ycombinator.com/item?id=39454961", 514 "created_at": "2024-02-21T15:27:36Z" 515 }, 516 { 517 "hn_id": "36102668", 518 "title": "Improving Factuality and Reasoning in Language Models Through Multiagent Debate", 519 "points": 4, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=36102668", 522 "created_at": "2023-05-28T10:03:03Z" 523 }, 524 { 525 "hn_id": "36097897", 526 "title": "Improving Factuality and Reasoning in Language Models Through Multiagent Debate", 527 "points": 2, 528 "comments": 1, 529 "url": "https://news.ycombinator.com/item?id=36097897", 530 "created_at": "2023-05-27T20:19:32Z" 531 }, 532 { 533 "hn_id": "36124743", 534 "title": "“According To..“ Prompting LLMs Improves Quoting from Pre-Training Data", 535 "points": 2, 536 "comments": 1, 537 "url": "https://news.ycombinator.com/item?id=36124743", 538 "created_at": "2023-05-30T13:37:40Z" 539 }, 540 { 541 "hn_id": "36963354", 542 "title": "“According to” Prompting Language Models Improves Quoting from Pre-Training Data", 543 "points": 2, 544 "comments": 0, 545 "url": "https://news.ycombinator.com/item?id=36963354", 546 "created_at": "2023-08-01T21:33:48Z" 547 }, 548 { 549 "hn_id": "45576976", 550 "title": "From Automation to Autonomy", 551 "points": 2, 552 "comments": 1, 553 "url": "https://news.ycombinator.com/item?id=45576976", 554 "created_at": "2025-10-14T06:48:24Z" 555 }, 556 { 557 "hn_id": "45962516", 558 "title": "Nearest Neighbor Speculative Decoding for LLM Generation and Attribution", 559 "points": 2, 560 "comments": 0, 561 "url": 
"https://news.ycombinator.com/item?id=45962516", 562 "created_at": "2025-11-18T08:00:42Z" 563 }, 564 { 565 "hn_id": "36326434", 566 "title": "Zero-shot lip-to-speech synthesis with face image based voice control", 567 "points": 1, 568 "comments": 0, 569 "url": "https://news.ycombinator.com/item?id=36326434", 570 "created_at": "2023-06-14T14:21:56Z" 571 }, 572 { 573 "hn_id": "36116514", 574 "title": "Learning to Generate Novel Scientific Directions with Contextualized Discovery", 575 "points": 1, 576 "comments": 0, 577 "url": "https://news.ycombinator.com/item?id=36116514", 578 "created_at": "2023-05-29T18:23:59Z" 579 } 580 ], 581 "top_points": 291, 582 "total_points": 312, 583 "total_comments": 112 584 } 585 }