scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32768B)
      1 {
      2   "paper": {
      3     "title": "Disagreements in Reasoning: How a Model's Thinking Process Dictates Persuasion in Multi-Agent Systems",
      4     "authors": [
      5       "Haodong Zhao",
      6       "Jidong Li",
      7       "Zhaomin Wu",
      8       "Tianjie Ju",
      9       "Zhuosheng Zhang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2509.21054",
     14     "doi": "10.48550/arXiv.2509.21054"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor",
     19     "data_leakage"
     20   ],
     21   "methodology_tags": [
     22     "benchmark-eval"
     23   ],
     24   "key_findings": "The paper identifies a 'Persuasion Duality' in multi-agent LLM systems: Large Reasoning Models (LRMs) using thinking mode are substantially more resistant to persuasion, while sharing their thinking content with others dramatically increases persuasive efficacy (average 21% increase on objective tasks). Crucially, the persuasiveness comes from logical coherence rather than mere verbosity — replacing thinking content with mismatched reasoning from another model actively hurts persuasion below baseline. Models are more susceptible to persuasion on subjective questions than objective ones, and multi-hop persuasion chains exhibit non-linear propagation effects.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper uses publicly available datasets: MMLU (Hendrycks et al., 2020) and samples from PersuasionBench (Durmus et al., 2024) and Perspectrum (Chen et al., 2019). All are standard public benchmarks."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Appendix A.3 mentions VLLM v0.10.0 and transformers v4.56.0, but no Python version, OS, CUDA version, GPU type, or full dependency list is provided. Not sufficient to recreate the environment."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "All heatmaps (Figures 1, 2, 13, 14) report ± values for each cell (e.g., '7.0 ± 1.6'). These appear to be confidence intervals for the proportions."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Despite making claims like 'significantly greater resistance to persuasion' and comparing model pairs extensively, no formal statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Effect sizes are reported with baseline context throughout: e.g., 'PR of 65.78%, a 19% relative improvement over the baseline (PR=46.31%)' (Section 3.3.2), 'average of 21.07%' increase for thinking content (Section 3.2), and 'average gains of -7.41%, -1.92%, and 2.07%' for different models."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "MMLU provides ~10,000 questions and 1,000 subjective claims are sampled, but no justification is given for these sample sizes. No power analysis is discussed."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The ± values reported in all heatmaps (Figures 1, 2, 13, 14) provide spread measures for the persuasion rates across the evaluation sets."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper systematically compares LRMs vs LLMs, thinking vs non-thinking modes, and with/without thinking content. Each condition serves as a baseline for the others in the 10×10 evaluation matrix."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Models include o4-mini, DeepSeek-R1, Gemini-2.5-flash, and Qwen3-32B — all are 2025 frontier models representing current state of the art."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Figure 6 presents a clear ablation: w/o thinking content (baseline), w/ native thinking content, padding (length-matched non-semantic tokens), and replace (mismatched thinking from another LRM). This isolates the contribution of logical coherence vs. length."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Three metrics are defined and used: Persuaded-Rate (PR), Remain-Rate (RR), and Other-Rate (OR), formally defined in Equations 1-3."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation is entirely automated — measuring whether models change their selected option after exposure to persuasive content. No human evaluation of persuasion quality is conducted."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No explicit discussion of held-out test sets. The paper evaluates pre-trained models without fine-tuning, but does not discuss whether any selection decisions (e.g., prompt design, experimental conditions) were informed by preliminary results on the same data."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down by every model pair (heatmaps), objective vs. subjective datasets, with vs. without thinking content, and token length settings (Figure 5). Per-model average persuasion rates are shown in Figures 3 and 4."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section C.1 and Figure 12 present a detailed case study of a model being successfully misled, tracing the thinking process. Section 4.1 analyzes the attention mechanism weakness that enables failures."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Several negative results reported: thinking mode for persuaders 'yields inconsistent changes' with some models showing negative gains (e.g., Gemini -7.41% in Figure 1a). The 'Replace' condition in Figure 6 shows thinking content can be actively harmful (PR drops below baseline to 32.42%)."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract's claims about reasoning resistance (supported by heatmaps Figures 1-2, Figure 7), thinking content boosting persuasion (Figures 1-2 w/ vs w/o), and multi-hop propagation (Figures 9, 16) are all supported by corresponding experimental results."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper makes causal claims about thinking mode causing increased resistance and thinking content causing increased persuasion. These are supported by controlled single-variable comparisons: same model with thinking on/off, same content with/without thinking block. The ablation in Figure 6 (native vs padding vs replace) provides controlled manipulation to isolate content vs length."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper tests 7 models on MMLU and 1,000 subjective claims, but makes broad claims about 'the safety, robustness, and design of future MAS' and 'the cognitive architecture of persuasion.' The title says 'Multi-Agent Systems' generally despite testing only specific models on two datasets."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 3.3.2 explicitly tests the alternative explanation that persuasiveness comes from length rather than content quality, using padding and replace conditions (Figure 6). This is a substantive alternative-explanation investigation."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper formally distinguishes between human persuasion (Definition 2.1, involving intentional belief change) and LLM persuasion (Definition 2.2, measured behaviorally through output change). The metrics (PR, RR, OR) directly measure answer-switching, and the paper is explicit that this is what is being measured."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Table 1 lists model names (o4-mini, Gemini-2.5-flash, DeepSeek-R1, etc.) but without specific API versions or snapshot dates. Per the schema, marketing names without version identifiers do not count. The open-source models link to HuggingFace pages, but some access dates are left as the unfilled placeholder 'YYYY-MM-DD' while others read '2025-09-21'."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Full prompt templates for both persuader content generation and persuadee evaluation are provided in Appendix A.4, with the actual text used. Placeholders ({question}, {claim}, etc.) are clearly defined variables. The adversarial detection prompt is shown in Figure 15."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Appendix A.3 reports: temperature=0.7, top_p=0.8, and mentions VLLM v0.10.0 and transformers v4.56.0 for local model serving."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. The experiments involve direct prompt-response interactions between models without tools, feedback loops, or agentic workflows."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 3.1 and A.1 describe preprocessing: MMLU correct answers standardized to option A, persuasion targets fixed as option D. Subjective claims mapped to A/B/C (support/neutral/oppose) with defined target-setting rules. The evaluation set S is formally defined as instances where the model's initial answer differs from the correct answer."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No dedicated limitations section exists. The conclusion (Section 5) is brief and does not include substantive discussion of limitations."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No specific threats to validity are discussed anywhere in the paper."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not explicitly state what its results do NOT show. Claims extend to 'future MAS' design without bounding to the tested models and datasets."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No raw experimental data (model responses, individual trial results) is released. Only aggregated statistics are shown in figures."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 3.1 and A.1 describe data sources (MMLU, PersuasionBench, Perspectrum), how questions were selected, and how the evaluation protocol works (initial answer → persuasion → re-evaluation)."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. Data sources are standard public benchmarks (MMLU, PersuasionBench, Perspectrum)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The pipeline is documented: get initial answer a0 from LLM → filter to instances where model initially answered correctly → generate persuasive content → present to persuadee → measure PR/RR/OR. The formal definitions in Section 2.2 make the pipeline explicit."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding source or acknowledgments section is present in the paper."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are listed on the first page: Shanghai Jiao Tong University, National University of Singapore, and Inner Mongolia Research Institute."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding is disclosed, so independence cannot be assessed."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests statement or financial disclosure is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No training data cutoff dates are stated for any of the evaluated models, despite using MMLU which has been publicly available since 2020."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether models have seen MMLU questions during training. MMLU is one of the most widely used benchmarks and is almost certainly in the training data of frontier models like o4-mini, Gemini, and DeepSeek-R1."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "MMLU was published in 2020 and is known to be contaminated in most frontier models' training data. The paper does not address this at all, despite its direct relevance — if models have memorized MMLU answers, their resistance to persuasion on those questions reflects memorization confidence, not general persuasion dynamics."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference costs are reported despite running 100 model-pair combinations across ~10,000 MMLU questions and 1,000 subjective claims — a substantial computational effort."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total computational budget, GPU hours, or API costs are mentioned."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Results are presented with ± values but these appear to be proportion confidence intervals, not variance across random seeds. No explicit seed sensitivity analysis is conducted despite using temperature=0.7 which introduces stochasticity."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs per model pair is not explicitly stated. It is unclear whether results represent single runs or averages over multiple runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Temperature=0.7 and top_p=0.8 are used without justification for why these values were chosen. No hyperparameter search is described."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The hyperparameter configuration (temperature=0.7, top_p=0.8) is stated without justification. No explanation of how these values were selected."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The paper makes numerous pairwise comparisons across 100 model pairs on two datasets with multiple conditions, but no correction for multiple comparisons is applied."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors propose the 'Persuasion Duality' concept and test it using their own experimental framework. No acknowledgment of potential author-evaluation bias in experimental design or result interpretation."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Models of vastly different sizes (7B, 8B, 32B, unknown sizes for proprietary models) are compared without discussing compute differences. The paper argues against scale as a primary factor but doesn't control for compute budgets."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper uses MMLU answer-switching as a measure of 'persuasion' without discussing whether this actually measures persuasion as defined in their framework. A model changing its MMLU answer could reflect prompt sensitivity, sycophancy, or conformity rather than genuine persuasion dynamics. The formal definitions (2.1, 2.2) don't bridge this gap."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding is involved. Models are evaluated through direct prompt-response interactions."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "MMLU was published in 2020 and all tested models were trained well after that date. The paper does not discuss temporal leakage despite this being the primary confound — models resistant to persuasion on MMLU may simply have strongly memorized the correct answers."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. The experimental setup provides the correct answer as part of the initial evaluation (standardized to option A), which could leak information depending on how models process the prompt structure."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "Not discussed. MMLU questions may overlap with training data for all models tested."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, decontamination, or temporal splits."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "The Persuasion Duality: explicit reasoning in LRMs creates a trade-off where thinking content enhances persuasive ability while thinking mode enhances resistance to persuasion.",
    376       "evidence": "Heatmaps in Figures 1-2 show thinking-mode LRMs as persuadees have lower PR (average 7.82% reduction on objective, 29.68% on subjective). As persuaders with thinking content shared, average PR increases by 21.07% on objective tasks (comparing Figures 1a and 1b).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Weaker models are more easily persuaded, but model capability has less impact on persuasiveness.",
    381       "evidence": "The heatmaps in Figures 1-2 show a clear left-to-right gradient (weaker models on the right have higher PR), while column-wise analysis shows less variation. Figures 3 and 4 visually confirm this asymmetry.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "LRM persuasiveness is driven by logical coherence, not merely length or presence of thinking content.",
    386       "evidence": "Figure 6: native thinking content yields PR=65.78%, padding tokens of equal length yield PR=62.34%, but mismatched thinking content drops PR to 32.42% (below the 46.31% baseline). This shows content quality matters beyond verbosity.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Models are more easily persuaded on subjective questions than objective ones.",
    391       "evidence": "Comparing Figures 1 and 2: subjective PRs are substantially higher across all model pairs (e.g., Meta-Llama-3-8B as persuadee: ~90% subjective vs ~46% objective).",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Simple CoT prompting enhances non-reasoning LLMs' resistance to persuasion.",
    396       "evidence": "Figure 8 shows a small reduction in PR on the objective task (57.7%→53.7% for Llama→Qwen), but on the subjective task PR increases (71.9%→80.5%), which contradicts the claim. Effects are small and inconsistent.",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Multi-hop persuasion exhibits non-linear propagation with both amplification and attenuation depending on chain composition.",
    401       "evidence": "Figures 9 and 16 show cases where chain persuasion (A→B→C) exceeds direct persuasion (A→C) and cases where it attenuates. Only 6 chain configurations tested in each figure.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "The attention mechanism reveals models prioritize confident assertions (11.1% attention) over reasoning content (0.39% attention), explaining susceptibility to persuasion.",
    406       "evidence": "Figure 10 shows attention analysis for a single case study. Average attention score of 11.1% on the short confident assertion vs 0.39% on the longer reasoning section.",
    407       "supported": "weak"
    408     },
    409     {
    410       "claim": "Adversarial argument detection prompts consistently reduce persuasion vulnerability across models.",
    411       "evidence": "Figure 11 shows PR reductions across 4 models when using the adversarial detection prompt (e.g., Qwen2.5-7B: 58.9%→15.5%, Hunyuan-7B w/o T: 46.1%→35.8%). Only tested with Llama-3-8B as persuader.",
    412       "supported": "moderate"
    413     }
    414   ],
    415   "red_flags": [
    416     {
    417       "flag": "MMLU contamination confound",
    418       "detail": "All tested frontier models (o4-mini, DeepSeek-R1, Gemini-2.5-flash, Qwen3-32B) were almost certainly trained on MMLU. A model's 'resistance to persuasion' on MMLU may reflect memorization confidence rather than general persuasion dynamics. The paper's central finding — that reasoning models are more resistant — could alternatively be explained by reasoning models having stronger memorization of correct answers. This critical confound is never discussed."
    419     },
    420     {
    421       "flag": "No formal significance testing",
    422       "detail": "The paper makes comparative claims ('significantly greater resistance', 'dramatically increases') across 100 model pairs without formal statistical tests. While ± confidence intervals are shown, no hypothesis tests determine whether observed differences are statistically significant."
    423     },
    424     {
    425       "flag": "Single case study for mechanistic explanation",
    426       "detail": "The attention analysis in Section 4.1 (Figure 10), which is presented as explaining WHY models are persuaded, is based on a single example with one model. This is far too thin to support the mechanistic claims being made."
    427     },
    428     {
    429       "flag": "No limitations section",
    430       "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. For a study making broad claims about MAS design principles, this is a significant omission."
    431     },
    432     {
    433       "flag": "Limited generalizability",
    434       "detail": "Only 7 models (3 with switchable thinking) are tested on 2 datasets (one heavily contaminated). The thinking/non-thinking comparison is limited to 3 model families (Gemini, Qwen, Hunyuan). Claims about 'the cognitive architecture of persuasion' and 'future MAS' design substantially outrun this evidence base."
    435     },
    436     {
    437       "flag": "Stochastic results without seed control",
    438       "detail": "Temperature=0.7 introduces significant stochasticity, yet the paper does not report results across multiple random seeds or clarify how many runs were conducted. The ± values appear to be proportion CIs, not cross-run variance."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "The persuasive power of large language models",
    444       "authors": [
    445         "Simon Martin Breum",
    446         "Daniel Vædele Egdal",
    447         "Victor Gram Mortensen",
    448         "Anders Giovanni Møller",
    449         "Luca Maria Aiello"
    450       ],
    451       "year": 2024,
    452       "relevance": "Early empirical study on LLM persuasion ability, framing persuasion as a function of model scale — the hypothesis this paper challenges."
    453     },
    454     {
    455       "title": "Scaling language model size yields diminishing returns for single-message political persuasion",
    456       "authors": [
    457         "Kobi Hackenburg",
    458         "Ben M Tappin",
    459         "Paul Röttger",
    460         "Scott A Hale",
    461         "Jonathan Bright",
    462         "Helen Margetts"
    463       ],
    464       "year": 2025,
    465       "relevance": "Demonstrates diminishing returns of scale for LLM persuasion, motivating this paper's shift from scale to cognitive architecture."
    466     },
    467     {
    468       "title": "Persuade me if you can: A framework for evaluating persuasion effectiveness and susceptibility among large language models",
    469       "authors": [
    470         "Nimet Beyza Bozdag",
    471         "Shuhaib Mehri",
    472         "Gokhan Tur",
    473         "Dilek Hakkani-Tür"
    474       ],
    475       "year": 2025,
    476       "arxiv_id": "2503.01829",
    477       "relevance": "Framework for evaluating LLM persuasion as persuader, persuadee, and judge — directly related to MAS safety evaluation."
    478     },
    479     {
    480       "title": "Large language models are more persuasive than incentivized human persuaders",
    481       "authors": [
    482         "Philipp Schoenegger",
    483         "Francesco Salvi",
    484         "Jiacheng Liu"
    485       ],
    486       "year": 2025,
    487       "arxiv_id": "2505.09662",
    488       "relevance": "Demonstrates LLM persuasion capabilities exceed human persuaders, raising safety concerns for AI systems."
    489     },
    490     {
    491       "title": "Lies, damned lies, and distributional language statistics: Persuasion and deception with large language models",
    492       "authors": [
    493         "Cameron R Jones",
    494         "Benjamin K Bergen"
    495       ],
    496       "year": 2024,
    497       "arxiv_id": "2412.17128",
    498       "relevance": "Foundational work on LLM persuasion and deception taxonomy, provides the definition of LLM persuasion used in this paper."
    499     },
    500     {
    501       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    502       "authors": [
    503         "Daya Guo",
    504         "Dejian Yang",
    505         "Haowei Zhang"
    506       ],
    507       "year": 2025,
    508       "arxiv_id": "2501.12948",
    509       "relevance": "Describes the architecture of one of the key Large Reasoning Models tested for persuasion dynamics."
    510     },
    511     {
    512       "title": "Red-teaming LLM multi-agent systems via communication attacks",
    513       "authors": [
    514         "Pengfei He",
    515         "Yupin Lin",
    516         "Shen Dong"
    517       ],
    518       "year": 2025,
    519       "arxiv_id": "2502.14847",
    520       "relevance": "Studies adversarial attacks on multi-agent LLM systems through communication channels, directly related to MAS robustness."
    521     },
    522     {
    523       "title": "Flooding spread of manipulated knowledge in LLM-based multi-agent communities",
    524       "authors": [
    525         "Tianjie Ju",
    526         "Yiting Wang",
    527         "Xinbei Ma"
    528       ],
    529       "year": 2024,
    530       "arxiv_id": "2407.07791",
    531       "relevance": "Studies how manipulated knowledge propagates through LLM multi-agent networks, complementary to persuasion propagation findings."
    532     },
    533     {
    534       "title": "Investigating the adaptive robustness with knowledge conflicts in LLM-based multi-agent systems",
    535       "authors": [
    536         "Tianjie Ju",
    537         "Bowen Wang",
    538         "Hao Fei"
    539       ],
    540       "year": 2025,
    541       "arxiv_id": "2502.15153",
    542       "relevance": "Examines knowledge conflicts in MAS collaborative coding tasks, studying how agents handle disagreements."
    543     },
    544     {
    545       "title": "Multiagent collaboration attack: Investigating adversarial attacks in large language model collaborations via debate",
    546       "authors": [
    547         "Alfonso Amayuelas",
    548         "Xianjun Yang",
    549         "Antonis Antoniades"
    550       ],
    551       "year": 2024,
    552       "relevance": "Investigates adversarial attacks in LLM debate frameworks, relevant to MAS safety and persuasion robustness."
    553     },
    554     {
    555       "title": "Measuring and improving persuasiveness of large language models",
    556       "authors": [
    557         "Somesh Singh",
    558         "Yaman K Singla",
    559         "Harini SI",
    560         "Balaji Krishnamurthy"
    561       ],
    562       "year": 2024,
    563       "arxiv_id": "2410.02653",
    564       "relevance": "Develops methods to measure and improve LLM persuasiveness, including the PersuasionBench evaluation framework."
    565     },
    566     {
    567       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    568       "authors": [
    569         "Sirui Hong",
    570         "Mingchen Zhuge",
    571         "Jonathan Chen"
    572       ],
    573       "year": 2023,
    574       "relevance": "Influential multi-agent collaboration framework for software development, an example of MAS where persuasion dynamics matter."
    575     },
    576     {
    577       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    578       "authors": [
    579         "Jason Wei",
    580         "Xuezhi Wang",
    581         "Dale Schuurmans"
    582       ],
    583       "year": 2022,
    584       "relevance": "Foundational work on chain-of-thought reasoning that underpins the LRM vs LLM distinction central to this paper."
    585     }
    586   ],
    587   "engagement_factors": {
    588     "practical_relevance": {
    589       "score": 2,
    590       "justification": "The adversarial argument detection prompt and thinking-mode design guidance are directly actionable for practitioners building multi-agent LLM systems."
    591     },
    592     "surprise_contrarian": {
    593       "score": 2,
    594       "justification": "The finding that sharing thinking content dramatically boosts persuasion while mismatched reasoning hurts it below baseline is counterintuitive and challenges the assumption that persuasion scales with model size."
    595     },
    596     "fear_safety": {
    597       "score": 2,
    598       "justification": "Demonstrates concrete vulnerabilities in multi-agent systems where models can be manipulated into wrong answers, with safety implications for autonomous agent deployments."
    599     },
    600     "drama_conflict": {
    601       "score": 1,
    602       "justification": "Mildly challenges the scale-centric paradigm of persuasion but doesn't directly call out specific companies or benchmarks as fraudulent."
    603     },
    604     "demo_ability": {
    605       "score": 0,
    606       "justification": "No code repository, demo, or reproducible artifacts are provided — only experimental results in the paper."
    607     },
    608     "brand_recognition": {
    609       "score": 1,
    610       "justification": "From Shanghai Jiao Tong University and NUS — recognized institutions but not household names in tech; tests well-known models (o4-mini, DeepSeek-R1, Gemini) but the lab itself lacks brand pull."
    611     }
    612   }
    613 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs