scan.json (30636B)
1 { 2 "paper": { 3 "title": "Multiple LLM Agents Debate for Equitable Cultural Alignment", 4 "authors": ["Dayeon Ki", "Rachel Rudinger", "Tianyi Zhou", "Marine Carpuat"], 5 "year": 2025, 6 "venue": "Annual Meeting of the Association for Computational Linguistics", 7 "arxiv_id": "2505.24671", 8 "doi": "10.48550/arXiv.2505.24671" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Multi-agent debate between two LLMs improves cultural alignment accuracy over single-LLM baselines in 19/21 model combinations on the NORMAD-ETI benchmark, with an average 7.05% improvement. Debate-Only achieves the best cultural group parity (0.972 average), outperforming even the larger GEMMA-2-27B judge model (0.964). Small 7-9B models debating can match the accuracy of the 27B model (79.2%). The debate process is most valuable when agents initially disagree, with most revisions leading to correct outcomes.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states in the abstract footnote: 'We release our code and dataset at https://github.com/dayeonki/cultural_debate.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "They use the publicly available NORMAD-ETI benchmark (Rao et al., 2024) and state they release their dataset alongside code." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions GPU type (NVIDIA RTX A5000) and HuggingFace model names (Table 4) but provides no requirements.txt, Dockerfile, or detailed dependency specifications." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included in the paper. 
Code is released but the paper itself contains no 'Reproducing Results' section or specific commands to run." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All main results (Tables 1, 2, 3) report only point estimates of accuracy and parity scores. No confidence intervals, error bars, or ± notation is provided." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Table 2 caption states: 'All improvements are statistically significant (p < 0.05).' Statistical significance testing was performed on the debate results." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements are reported with baseline context throughout: e.g., Table 1 shows '+28.7%' improvement with green text, and absolute accuracies are given for both baseline and experimental conditions (e.g., Si w/o 49.5% → Si w/ 63.7%)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No power analysis or sample size justification is provided. The limitations section (§8) acknowledges 'NORMAD-ETI has 30-40 stories per country, which limits the generalization of the results' but this is post-hoc acknowledgment, not justification." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Main results use temperature 0.0 (deterministic, single run). For Self-Reflect+Debate experiments with temperature 0.8, results are averaged over only 2 runs (Appendix C.5) but no standard deviation, IQR, or other spread measure is reported." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines are compared: Single Model (with/without rule-of-thumb), Self-Reflection, and Oracle model selection. 
The prior best on NORMAD-ETI (MISTRAL-7B-INSTRUCT at 40.7%) is also referenced (§5.1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "All 7 LLMs tested are from 2024 (LLAMA-3, GEMMA-2, EXAONE-3, YI-1.5, INTERNLM-2.5, AYA-23, SEALLM-3), representing contemporary open-weight models." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple ablation-style experiments are conducted: with/without rule-of-thumb (Table 1), Debate-Only vs Self-Reflect+Debate (Table 2), varying number of debate rounds (Appendix C.3, Table 7), varying self-reflection iterations (Appendix C.4, Table 8)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two distinct metrics are used: accuracy (§4.3, primary) and cultural group parity (§4.3, Eq. 4, Table 3). Additionally, decision dynamics analysis captures initial/final/judge correctness." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation is entirely automated by comparing LLM outputs to ground truth labels. No human evaluation of system outputs is performed." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The judge LLM (GEMMA-2-27B) was selected by evaluating candidates on the same NORMAD-ETI data used for all reported results (Table 5). This means the test set was used for model selection, compromising the held-out nature of the evaluation." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive breakdowns are provided: per-country accuracies (Tables 16-18), per-cultural-group results (Table 3, Figure 9), per-label-group decision dynamics (Figures 6-8, Appendix C.7), and per-LLM-combination results (Table 2)." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Appendix F presents detailed case studies including failure cases: Figure 16 shows correct→incorrect transitions, Figure 17 shows persistent incorrect decisions, Figure 18 shows mixed→incorrect outcomes during debate." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multiple negative results are reported: Self-Reflect+Debate does not outperform Debate-Only on average (75.6% < 76.3%, §5.4), INTERNLM-2.5 shows significant drops in S+D (§5.4), increasing debate rounds doesn't improve performance (Table 7, Appendix C.3), and adjudication exceeds individual accuracies in only half the settings (§5.3)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims are supported: 'debate improves both overall accuracy and cultural group parity' (Tables 2 and 3), 'multi-agent debate enables relatively small LLMs (7-9B) to achieve accuracies comparable to that of a much larger model (27B)' (GEMMA-2 achieves 79.6% vs 79.2% for 27B, §5.5)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims like 'debate improves accuracy' are tested via controlled single-variable comparisons: same models and data, different interaction strategies (single model → self-reflection → debate). The ablation design with controlled manipulation supports these causal claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract specifies '7 open-weight LLMs (and 21 LLM combinations) using the NORMAD-ETI benchmark for social etiquette norms in 75 countries.' 
The limitations section (§8) explicitly bounds generalization: 'a single dataset,' '30-40 stories per country,' 'ternary classification task.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not substantively discuss alternative explanations for why debate works. The additional-compute explanation is partially addressed by comparing debate against multi-iteration self-reflection (Appendix C.4), but this comparison is not framed as a test of an alternative explanation. There is no discussion of whether improvements come from the judge LLM's strength rather than the debate process itself." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper acknowledges the gap between the ternary classification proxy and true cultural alignment in §8: 'based on a ternary classification task which does not account for decisions more nuanced than \"Yes\", \"No\", or \"Neither\".'" 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Table 4 (Appendix) provides exact HuggingFace model identifiers for all 7 LLMs (e.g., 'meta-llama/Meta-Llama-3-8B-Instruct', 'google/gemma-2-9b-it'), which uniquely identify the model versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text for all experimental conditions is provided in Appendix A (Prompts A.1.1 through A.4.6), including Single Model, Self-Reflection, Debate-Only, and Self-Reflect+Debate variants." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.2 states: 'We set the default sampling temperature to 0.0, and employ 0.8 where multiple runs are required.' Temperature is the primary hyperparameter for inference-only evaluation." 
153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The multi-agent debate framework is described in detail in §3.2 with formal notation: initial decisions, feedback exchange, final decisions, and judge adjudication. The workflow is illustrated in Figure 1 and mathematically formalized in Equations 1-3." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.1 describes the NORMAD-ETI dataset (2.6K stories, 75 countries, ternary labels from the Cultural Atlas). Table 19 provides detailed statistics per cultural group. The data is used as-is from the published benchmark." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 8 ('Limitation') provides extensive discussion spanning over a full page, covering dataset scope, LLM coverage, debate variant coverage, computational overhead, and inference variation." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed: '30-40 stories per country, which limits the generalization,' 'ternary classification task which does not account for decisions more nuanced,' 'scope of our multi-agent debate setup is as comprehensive as our computational budget allows,' 'there might be some variation across different LLM inference runs.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 8 explicitly states what was not tested: other datasets beyond NORMAD-ETI, other debate formats, assigning cultural roles to LLMs, debate with more than two agents. 
The conclusion (§7) also notes 'future work to explore optimal strategies.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "NORMAD-ETI is a publicly available benchmark, and the authors state they release their code and dataset at the GitHub repository. The underlying data can be independently verified." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes NORMAD-ETI: 2.6K stories from 75 countries derived from the Cultural Atlas social-etiquette norms, with ternary ground truth labels. The benchmark's construction is attributed to Rao et al. (2024)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants in this study. The data source is a standard public benchmark (NORMAD-ETI)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The full pipeline from input to output is documented: country + rule-of-thumb + story → initial decisions → feedback/self-reflection → final decisions → judge adjudication (§3.1-3.2). Each stage is formalized and illustrated in Figure 1." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The Acknowledgments section lists NSF Fairness in AI Grant 2147292, TRAILS (NSF Award No. 2229885), and ODNI/IARPA HIATUS Program contract #2022-22072200006." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All four authors are affiliated with University of Maryland, clearly stated in the paper header. They do not evaluate their own commercial product." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funders are NSF and ODNI/IARPA — government agencies with no financial interest in whether multi-agent debate outperforms single-model approaches." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the 7 LLMs used. The models are evaluated on NORMAD-ETI benchmark performance, and their training periods are not discussed." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether NORMAD-ETI data or the Cultural Atlas content could have appeared in the training data of any of the tested LLMs." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "NORMAD-ETI was published in 2024 (Rao et al., 2024) and the models are from 2024. No analysis of whether benchmark content could have been in training data. The Cultural Atlas (source of NORMAD-ETI) is publicly available online." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All evaluation is automated comparison of LLM outputs to ground truth labels." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates LLM outputs on an existing benchmark." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Appendix E (Table 13) reports average inference time per method (e.g., Single Model: 00:32, Debate-Only: 03:12 + 00:28 for judge) and GPU requirements (NVIDIA RTX A5000). Table 5 also reports inference time for judge LLM candidates." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 13 specifies GPU requirements (1-3 NVIDIA RTX A5000) and time budgets for each method. The paper notes all experiments use 7-9B open-weight models with a 27B judge, representing 'a more efficient alternative to closed-source, larger LLMs.'" 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Main results use temperature 0.0 (deterministic). For temperature 0.8 experiments (Self-Reflect+Debate), only 2 runs are averaged with no seed sensitivity analysis or variance reporting." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Temperature 0.0 implies deterministic single-run results. For Self-Reflect+Debate with temperature 0.8, Appendix C.5 states 'the process is repeated twice for each LLM pair, alternating the order of the options. 
Each entry is the average count across two runs.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. Fixed temperature values (0.0 and 0.8) and a single round of debate are used as defaults without reporting any search process." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The judge LLM (GEMMA-2-27B) was selected by evaluating candidate judges on the same NORMAD-ETI test data used for all reported results (Table 5). This is selection on the test set, not a held-out validation set." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Table 2 reports 'All improvements are statistically significant (p < 0.05)' across 21 LLM combinations but does not mention Bonferroni, Holm, or any multiple comparison correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement all methods (single model, self-reflection, debate variants) without acknowledging potential author-evaluation bias or having independent evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 13 compares computational cost across methods. Appendix C.4 compares multi-iteration self-reflection (matching debate compute) against debate, showing debate outperforms at comparable cost. The paper notes 'Self-Reflection and Debate-Only show comparable efficiency' (Appendix E)." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 8 discusses NORMAD-ETI's construct validity: provides ground truth labels, story-based scenarios close to real life, 75-country coverage, 'built on global community interviews with translators and rigorously validated by community experts, religious leaders, and academic researchers.' Limitations of ternary classification are acknowledged." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The debate framework IS the variable being tested. When comparing models, the same debate scaffold is used for both agents. The study's purpose is to evaluate the scaffolding strategies themselves, not to isolate model capabilities." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether NORMAD-ETI content (derived from the publicly available Cultural Atlas) existed before the training cutoffs of the tested models." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The rule-of-thumb is intentionally provided as input context, but no discussion of whether this constitutes feature leakage or how it relates to information available in real-world deployment." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Stories from the same country share the same cultural norms source (Cultural Atlas entries). No discussion of whether within-country story dependencies affect the independence assumption underlying statistical tests." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No contamination detection method is applied (no canary strings, membership inference, n-gram overlap, or decontamination pipeline)." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Self-reflection improves single-LLM accuracy by an average of 3.26% across all tested LLMs.", 365 "evidence": "Table 1 shows Self-Reflection accuracies exceeding Single Model (w/ rule-of-thumb) for all 7 LLMs, with improvements ranging from 1.13% (YI-1.5) to 5.22% (GEMMA-2).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Multi-agent debate improves individual model accuracies over single-model baselines in 19 out of 21 LLM combinations with an average improvement of 7.05%.", 370 "evidence": "Table 2 shows D(Mi) values exceeding Si(Mi) in 19/21 settings (underlined in table). Average individual accuracy increases from 66.4% to 69.1% (M1) and 67.5% to 74.2% (M2).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Multi-agent debate enables 7-9B parameter LLMs to achieve accuracy comparable to the much larger GEMMA-2-27B (79.2%).", 375 "evidence": "Table 2 shows GEMMA-2 (9B) achieving 79.6% in Self-Reflect+Debate with EXAONE-3, and LLAMA-3+GEMMA-2 debate achieving 79.7% final accuracy. Both match or exceed GEMMA-2-27B's 79.2% single-model accuracy.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Debate-Only achieves the highest cultural group parity across methods, with an average parity premium of 0.972.", 380 "evidence": "Table 3 shows Debate-Only (D) achieving 0.972 average parity, exceeding Single Model w/o (0.905), w/ (0.960), Self-Reflection (0.969), Self-Reflect+Debate (0.958), and even the judge LLM alone (0.964).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Distinct LLMs exhibit complementary cultural knowledge, with oracle model selection improving accuracy by 22.5% on average over single models.", 385 "evidence": "Table 2 'Ora' column shows oracle accuracies averaging 81.9%, compared to individual single model averages of 66.4% and 67.5%. 
The best pair (EXAONE-3+AYA-23) achieves 91.6% oracle accuracy.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Increasing the number of debate rounds beyond one does not improve performance.", 390 "evidence": "Table 7 (Appendix C.3) shows that for LLAMA-3+GEMMA-2, 1 round achieves the highest final accuracy (79.7%) compared to 2-5 rounds (78.5%-79.5%).", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Judge LLM selected on test data", 397 "detail": "GEMMA-2-27B was chosen as judge by evaluating candidate judges' single-model accuracy on the same NORMAD-ETI data used for all reported results (Table 5). This means the test set was used for a key design decision, potentially inflating reported debate accuracies." 398 }, 399 { 400 "flag": "No multiple comparison correction", 401 "detail": "The paper claims 'all improvements are statistically significant (p < 0.05)' across 21 LLM combinations (Table 2) without mentioning any correction for multiple comparisons. With 21 uncorrected comparisons at α=0.05, roughly one false positive (21 × 0.05 ≈ 1.05) would be expected by chance if every null hypothesis were true." 402 }, 403 { 404 "flag": "No contamination analysis", 405 "detail": "Seven LLMs from 2024 are evaluated on NORMAD-ETI (also 2024), which draws from the publicly available Cultural Atlas. No analysis of whether models may have seen this content during training, despite the Cultural Atlas being online." 406 }, 407 { 408 "flag": "Only two runs for stochastic experiments", 409 "detail": "Self-Reflect+Debate experiments use temperature 0.8 and are averaged over only 2 runs (Appendix C.5) with no variance reporting. Two runs provide essentially no information about result stability." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Improving factuality and reasoning in language models through multiagent debate", 415 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. 
Tenenbaum", "Igor Mordatch"], 416 "year": 2023, 417 "arxiv_id": "2305.14325", 418 "relevance": "Foundational multi-agent debate framework showing LLM collaboration reduces hallucination through 'society of minds' interaction." 419 }, 420 { 421 "title": "AI safety via debate", 422 "authors": ["Geoffrey Irving", "Paul Christiano", "Dario Amodei"], 423 "year": 2018, 424 "arxiv_id": "1805.00899", 425 "relevance": "Seminal proposal of debate as an AI alignment strategy, directly motivating the multi-agent debate approach." 426 }, 427 { 428 "title": "Debating with more persuasive LLMs leads to more truthful answers", 429 "authors": ["Akbir Khan", "John Hughes", "Dan Valentine"], 430 "year": 2024, 431 "arxiv_id": "2402.06782", 432 "relevance": "Demonstrates that LLM debate with more persuasive models improves truthfulness, directly relevant to multi-agent debate for alignment." 433 }, 434 { 435 "title": "Encouraging divergent thinking in large language models through multi-agent debate", 436 "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"], 437 "year": 2024, 438 "relevance": "Shows multi-agent debate encourages divergent thinking in LLMs for reasoning tasks, complementary approach to cultural alignment debate." 439 }, 440 { 441 "title": "Modular pluralism: Pluralistic alignment via multi-LLM collaboration", 442 "authors": ["Shangbin Feng", "Taylor Sorensen", "Yuhan Liu"], 443 "year": 2024, 444 "relevance": "Proposes multi-LLM collaboration for pluralistic alignment, directly related to using diverse models for cultural alignment." 445 }, 446 { 447 "title": "Don't hallucinate, abstain: Identifying LLM knowledge gaps via multi-LLM collaboration", 448 "authors": ["Shangbin Feng", "Weijia Shi", "Yike Wang"], 449 "year": 2024, 450 "relevance": "Uses multi-LLM collaboration to identify knowledge gaps, relevant to understanding complementary LLM strengths." 
451 }, 452 { 453 "title": "NormAD: A framework for measuring the cultural adaptability of large language models", 454 "authors": ["Abhinav Rao", "Akhila Yerukola", "Vishwa Shah"], 455 "year": 2024, 456 "arxiv_id": "2404.12464", 457 "relevance": "The primary benchmark (NORMAD-ETI) used in this study for evaluating cultural alignment of LLMs." 458 }, 459 { 460 "title": "Reflexion: Language agents with verbal reinforcement learning", 461 "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman"], 462 "year": 2023, 463 "arxiv_id": "2303.11366", 464 "relevance": "Foundational work on LLM self-reflection as a feedback mechanism, directly adapted in the Self-Reflection baseline." 465 }, 466 { 467 "title": "CultureLLM: Incorporating cultural differences into large language models", 468 "authors": ["Cheng Li", "Mengzhou Chen", "Jindong Wang"], 469 "year": 2024, 470 "arxiv_id": "2402.10946", 471 "relevance": "Trains specialized LLMs for cultural alignment, representing the single-LLM training approach that multi-agent debate aims to complement." 472 }, 473 { 474 "title": "CulturePark: Boosting cross-cultural understanding in large language models", 475 "authors": ["Cheng Li", "Damien Teney", "Linyi Yang"], 476 "year": 2024, 477 "arxiv_id": "2405.15145", 478 "relevance": "Closest prior work using LLM multi-agent communication for cultural data collection, though focused on data generation rather than inference." 479 }, 480 { 481 "title": "Multi-LLM debate: Framework, principals, and interventions", 482 "authors": ["Andrew Estornell", "Yang Liu"], 483 "year": 2024, 484 "relevance": "Formal framework for multi-LLM debate analyzing collaborative approaches, directly relevant to debate methodology." 485 }, 486 { 487 "title": "On scalable oversight with weak LLMs judging strong LLMs", 488 "authors": ["Zachary Kenton", "Noah Y. 
Siegel", "János Kramár"], 489 "year": 2024, 490 "arxiv_id": "2407.04622", 491 "relevance": "Addresses scalable oversight through LLM judging, relevant to the judge LLM component of the debate framework." 492 } 493 ], 494 "engagement_factors": { 495 "practical_relevance": { 496 "score": 1, 497 "justification": "The multi-agent debate framework requires orchestrating multiple LLMs and a judge, making it impractical for most applications; limited to cultural alignment tasks on a specific benchmark." 498 }, 499 "surprise_contrarian": { 500 "score": 1, 501 "justification": "Multi-agent collaboration improving performance is expected; the finding that small models can match larger ones through debate is somewhat notable but not strongly contrarian." 502 }, 503 "fear_safety": { 504 "score": 0, 505 "justification": "No AI safety or security concerns raised; the paper focuses on improving cultural fairness in LLM predictions." 506 }, 507 "drama_conflict": { 508 "score": 0, 509 "justification": "No controversy or dramatic claims; straightforward empirical study of debate strategies." 510 }, 511 "demo_ability": { 512 "score": 2, 513 "justification": "Code released on GitHub with open-weight models, allowing replication with accessible hardware (RTX A5000)." 514 }, 515 "brand_recognition": { 516 "score": 0, 517 "justification": "University of Maryland research lab; not a major industry AI lab or well-known brand." 518 } 519 } 520 }