scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33975B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Disagreements in Reasoning: How a Model's Thinking Process Dictates Persuasion in Multi-Agent Systems",
      6     "authors": [
      7       "Haodong Zhao",
      8       "Jidong Li",
      9       "Zhaomin Wu",
     10       "Tianjie Ju",
     11       "Zhuosheng Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2509.21054",
     16     "doi": "10.48550/arXiv.2509.21054"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's claims about reasoning resistance (supported by heatmaps Figures 1-2, Figure 7), thinking content boosting persuasion (Figures 1-2 w/ vs w/o), and multi-hop propagation (Figures 9, 16) are all supported by corresponding experimental results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims about thinking mode causing increased resistance and thinking content causing increased persuasion. These are supported by controlled single-variable comparisons: same model with thinking on/off, same content with/without thinking block. The ablation in Figure 6 (native vs padding vs replace) provides controlled manipulation to isolate content vs length.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper tests 7 models on MMLU and 1,000 subjective claims, but makes broad claims about 'the safety, robustness, and design of future MAS' and 'the cognitive architecture of persuasion.' The title says 'Multi-Agent Systems' generally despite testing only specific models on two datasets.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 3.3.2 explicitly tests the alternative explanation that persuasiveness comes from length rather than content quality, using padding and replace conditions (Figure 6). This is a substantive alternative-explanation investigation.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper formally distinguishes between human persuasion (Definition 2.1, involving intentional belief change) and LLM persuasion (Definition 2.2, measured behaviorally through output change). The metrics (PR, RR, OR) directly measure answer-switching, and the paper is explicit that this is what is being measured.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations section exists. The conclusion (Section 5) is brief and does not include substantive discussion of limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what its results do NOT show. Claims extend to 'future MAS' design without bounding to the tested models and datasets.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed on the first page: Shanghai Jiao Tong University, National University of Singapore, and Inner Mongolia Research Institute.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: LRM vs. LLM distinction is drawn in Section 1, 'LLM Persuasion' is formally defined in Definition 2.2, and PR/RR/OR metrics are formalized in Section 2.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions are itemized in the introduction: linking reasoning process to persuasion behavior, formalizing the Persuasion Duality, extending to multi-agent chains, and proposing a prompt-level mitigation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2.1 situates the work relative to PersuasionBench and PMIYC frameworks, and Appendix B provides extended related-work coverage of multi-agent debate and computational persuasion literature.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses publicly available datasets: MMLU (Hendrycks et al., 2020) and samples from PersuasionBench (Durmus et al., 2024) and Perspectrum (Chen et al., 2019). All are standard public benchmarks.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Appendix A.3 mentions VLLM v0.10.0 and transformers v4.56.0, but no Python version, OS, CUDA version, GPU type, or full dependency list is provided. Not sufficient to recreate the environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or scripts are provided.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "All heatmaps (Figures 1, 2, 13, 14) report ± values for each cell (e.g., '7.0 ± 1.6'). These appear to be confidence intervals for the proportions.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Despite making claims like 'significantly greater resistance to persuasion' and comparing model pairs extensively, no formal statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported with baseline context throughout: e.g., 'PR of 65.78%, a 19% relative improvement over the baseline (PR=46.31%)' (Section 3.3.2), 'average of 21.07%' increase for thinking content (Section 3.2), and 'average gains of -7.41%, -1.92%, and 2.07%' for different models.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "MMLU provides ~10,000 questions and 1,000 subjective claims are sampled, but no justification is given for these sample sizes. No power analysis is discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The ± values reported in all heatmaps (Figures 1, 2, 13, 14) provide spread measures for the persuasion rates across the evaluation sets.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper systematically compares LRMs vs LLMs, thinking vs non-thinking modes, and with/without thinking content. Each condition serves as a baseline for the others in the 10×10 evaluation matrix.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Models include o4-mini, DeepSeek-R1, Gemini-2.5-flash, and Qwen3-32B — all are 2025 frontier models representing current state of the art.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Figure 6 presents a clear ablation: w/o thinking content (baseline), w/ native thinking content, padding (length-matched non-semantic tokens), and replace (mismatched thinking from another LRM). This isolates the contribution of logical coherence vs. length.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are defined and used: Persuaded-Rate (PR), Remain-Rate (RR), and Other-Rate (OR), formally defined in Equations 1-3.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Evaluation is entirely automated — measuring whether models change their selected option after exposure to persuasive content. No human evaluation of persuasion quality is conducted.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "No explicit discussion of held-out test sets. The paper evaluates pre-trained models without fine-tuning, but does not discuss whether any selection decisions (e.g., prompt design, experimental conditions) were informed by preliminary results on the same data.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by every model pair (heatmaps), objective vs. subjective datasets, with vs. without thinking content, and token length settings (Figure 5). Per-model average persuasion rates are shown in Figures 3 and 4.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section C.1 and Figure 12 present a detailed case study of a model being successfully misled, tracing the thinking process. Section 4.1 analyzes the attention mechanism weakness that enables failures.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Several negative results reported: thinking mode for persuaders 'yields inconsistent changes' with some models showing negative gains (e.g., Gemini -7.41% in Figure 1a). The 'Replace' condition in Figure 6 shows thinking content can be actively harmful (PR drops below baseline to 32.42%).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Table 1 lists model names (o4-mini, Gemini-2.5-flash, DeepSeek-R1, etc.) but without specific API versions or snapshot dates. Per the schema, marketing names without version identifiers do not count. The open-source models link to HuggingFace pages, but some access dates read 'YYYY-MM-DD' (an unfilled placeholder) while others read '2025-09-21'.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt templates for both persuader content generation and persuadee evaluation are provided in Appendix A.4, with the actual text used. Placeholders ({question}, {claim}, etc.) are clearly defined variables. The adversarial detection prompt is shown in Figure 15.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendix A.3 reports: temperature=0.7, top_p=0.8, and mentions VLLM v0.10.0 and transformers v4.56.0 for local model serving.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The experiments involve direct prompt-response interactions between models without tools, feedback loops, or agentic workflows.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.1 and A.1 describe preprocessing: MMLU correct answers standardized to option A, persuasion targets fixed as option D. Subjective claims mapped to A/B/C (support/neutral/oppose) with defined target-setting rules. The evaluation set S is formally defined as instances where the model's initial answer differs from the correct answer.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data (model responses, individual trial results) is released. Only aggregated statistics are shown in figures.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 and A.1 describe data sources (MMLU, PersuasionBench, Perspectrum), how questions were selected, and how the evaluation protocol works (initial answer → persuasion → re-evaluation).",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public benchmarks (MMLU, PersuasionBench, Perspectrum).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented: get initial answer a0 from LLM → filter to instances where model initially answered correctly → generate persuasive content → present to persuadee → measure PR/RR/OR. The formal definitions in Section 2.2 make the pipeline explicit.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the evaluated models, despite using MMLU which has been publicly available since 2020.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether models have seen MMLU questions during training. MMLU is one of the most widely used benchmarks and is almost certainly in the training data of frontier models like GPT-4o-mini, Gemini, and DeepSeek-R1.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "MMLU was published in 2020 and is known to be contaminated in most frontier models' training data. The paper does not address this at all, despite its direct relevance — if models have memorized MMLU answers, their resistance to persuasion on those questions reflects memorization confidence, not general persuasion dynamics.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference costs are reported despite running 100 model-pair combinations across ~10,000 MMLU questions and 1,000 subjective claims — a substantial computational effort.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, GPU hours, or API costs are mentioned.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Results are presented with ± values but these appear to be proportion confidence intervals, not variance across random seeds. No explicit seed sensitivity analysis is conducted despite using temperature=0.7 which introduces stochasticity.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs per model pair is not explicitly stated. It is unclear whether results represent single runs or averages over multiple runs.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Temperature=0.7 and top_p=0.8 are used without justification for why these values were chosen. No hyperparameter search is described.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The hyperparameter configuration (temperature=0.7, top_p=0.8) is stated without justification. No explanation of how these values were selected.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper makes numerous pairwise comparisons across 100 model pairs on two datasets with multiple conditions, but no correction for multiple comparisons is applied.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors propose the 'Persuasion Duality' concept and test it using their own experimental framework. No acknowledgment of potential author-evaluation bias in experimental design or result interpretation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "Models of vastly different sizes (7B, 8B, 32B, unknown sizes for proprietary models) are compared without discussing compute differences. The paper argues against scale as a primary factor but doesn't control for compute budgets.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses MMLU answer-switching as a measure of 'persuasion' without discussing whether this actually measures persuasion as defined in their framework. A model changing its MMLU answer could reflect prompt sensitivity, sycophancy, or conformity rather than genuine persuasion dynamics. The formal definitions (2.1, 2.2) don't bridge this gap.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved. Models are evaluated through direct prompt-response interactions.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "MMLU was published in 2020 and all tested models were trained well after that date. The paper does not discuss temporal leakage despite this being the primary confound — models resistant to persuasion on MMLU may simply have strongly memorized the correct answers.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "Not discussed. The experimental setup provides the correct answer as part of the initial evaluation (standardized to option A), which could leak information depending on how models process the prompt structure.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "Not discussed. MMLU questions may overlap with training data for all models tested.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, decontamination, or temporal splits.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "LRMs in thinking mode exhibit significantly greater resistance to persuasion than non-thinking counterparts.",
    457       "evidence": "Figures 1–4 show consistently lower PR and higher RR for thinking-mode models as persuadees across both objective and subjective datasets; Figure 7 plots thinking vs. non-thinking pairs showing thinking models cluster toward lower PR and higher RR.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Sharing LRM thinking content dramatically increases persuasive capability, yielding an average 21.07% increase in persuaded rate.",
    462       "evidence": "Figures 1b and 2b versus 1a and 2a show large increases in PR when thinking content is shared; the 21.07% average is stated explicitly in Section 3.2.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "LRM persuasiveness from thinking content is driven by logical coherence, not merely verbosity.",
    467       "evidence": "Figure 6 ablation: native thinking (PR 65.8%) outperforms length-matched padding (62.3%), while mismatched thinking drops to 32.4%, below the 46.3% no-thinking baseline.",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "Persuasion propagates non-linearly through multi-agent chains, sometimes exceeding direct persuasion rates.",
    472       "evidence": "Figure 9 shows multi-hop chains where A→B→C PR exceeds direct A→C PR in some configurations (e.g., Gemini-T→Llama→Hunyuan: whole 65.5% vs. direct 23.5%).",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "An adversarial argument detection prompt consistently reduces persuasion susceptibility across model types.",
    477       "evidence": "Figure 11 shows clear PR reduction and RR increase across four tested persuadee models after adding the detection prompt.",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Models prioritize confident rhetorical cues over logical reasoning, as shown by attention analysis.",
    482       "evidence": "Figure 10 shows 11.1% average attention on short confident assertion vs. 0.39% on longer reasoning section, but this is a single case study with no systematic replication.",
    483       "supported": "weak"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "observational"
    489   ],
    490   "key_findings": "The paper identifies a 'Persuasion Duality' in LRM-based multi-agent systems: explicit reasoning processes make models substantially more resistant to persuasion as persuadees (lower PR, higher RR), while sharing that same thinking content dramatically increases their persuasive capability as persuaders (avg +21% PR). Ablation experiments confirm the effect is semantic — mismatched thinking content hurts persuasion below baseline, while length-matched padding captures only partial gains. Persuasion propagates non-linearly through multi-agent chains, with amplification and attenuation depending on chain composition and reasoning modes of intermediary agents.",
    491   "red_flags": [
    492     {
    493       "flag": "MMLU contamination unaddressed",
    494       "detail": "MMLU was published in 2020 and almost certainly appears in all tested models' training data. Resistance to persuasion on MMLU questions may reflect training memorization rather than genuine reasoning robustness — this confound is never acknowledged."
    495     },
    496     {
    497       "flag": "Attention analysis is a single case",
    498       "detail": "The mechanistic explanation in Section 4.1 (Figure 10) is based on a single example with no description of how attention scores were extracted, which layer(s) were analyzed, or whether the pattern holds systematically."
    499     },
    500     {
    501       "flag": "No significance tests despite comparative claims",
    502       "detail": "All comparative claims (thinking vs. non-thinking, with/without content) are made without formal statistical tests; where error bars overlap, the reported differences may not be statistically significant."
    503     },
    504     {
    505       "flag": "Sycophancy vs. persuasion conflation",
    506       "detail": "The measured 'persuasion' is indistinguishable from known sycophancy/conformity behavior in LLMs; the paper neither cites the substantial sycophancy literature nor distinguishes its measure from it."
    507     },
    508     {
    509       "flag": "No limitations section",
    510       "detail": "There is no dedicated limitations section; discussion of threats to validity (including task scope, single-turn design, and model-specific training differences) is entirely absent."
    511     },
    512     {
    513       "flag": "Overclaiming on MAS generalization",
    514       "detail": "Conclusions invoke broad implications for 'safe and resilient MAS architectures' but experiments only test pairwise single-round persuasion with two task types, not full agentic systems."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "The persuasive power of large language models",
    520       "relevance": "Baseline work on LLM persuasion efficacy that this paper builds on and challenges with the scale-plateauing argument."
    521     },
    522     {
    523       "title": "Scaling language model size yields diminishing returns for single-message political persuasion",
    524       "relevance": "PNAS paper providing evidence for diminishing returns to scale in persuasion, directly motivating the process-centric view."
    525     },
    526     {
    527       "title": "Persuade me if you can: A framework for evaluating persuasion effectiveness and susceptibility among large language models",
    528       "relevance": "PMIYC (Persuade Me If You Can) framework providing both the dataset used and the methodological comparison point for this work."
    529     },
    530     {
    531       "title": "Encouraging divergent thinking in large language models through multi-agent debate",
    532       "relevance": "Foundational multi-agent debate framework; key prior work on LLM-LLM interaction dynamics."
    533     },
    534     {
    535       "title": "Conformity in large language models",
    536       "relevance": "Closely related work on LLM susceptibility to social influence, overlapping with the sycophancy/persuasion behavior studied here."
    537     },
    538     {
    539       "title": "Investigating the adaptive robustness with knowledge conflicts in LLM-based multi-agent systems",
    540       "relevance": "Direct predecessor from overlapping authors on MAS robustness to conflicting information."
    541     },
    542     {
    543       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    544       "relevance": "Foundation for the CoT/thinking-mode distinction central to the paper's theoretical framework."
    545     },
    546     {
    547       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    548       "relevance": "One of the primary LRM models evaluated; represents the LRM class whose properties drive the core findings."
    549     }
    550   ],
    551   "engagement_factors": {
    552     "practical_relevance": {
    553       "score": 2,
    554       "justification": "The adversarial argument detection prompt and thinking-mode design guidance are directly actionable for practitioners building multi-agent LLM systems."
    555     },
    556     "surprise_contrarian": {
    557       "score": 2,
    558       "justification": "The finding that sharing thinking content dramatically boosts persuasion while mismatched reasoning hurts it below baseline is counterintuitive and challenges the assumption that persuasion scales with model size."
    559     },
    560     "fear_safety": {
    561       "score": 2,
    562       "justification": "Demonstrates concrete vulnerabilities in multi-agent systems where models can be manipulated into wrong answers, with safety implications for autonomous agent deployments."
    563     },
    564     "drama_conflict": {
    565       "score": 1,
    566       "justification": "Mildly challenges the scale-centric paradigm of persuasion but doesn't directly call out specific companies or benchmarks as fraudulent."
    567     },
    568     "demo_ability": {
    569       "score": 0,
    570       "justification": "No code repository, demo, or reproducible artifacts are provided — only experimental results in the paper."
    571     },
    572     "brand_recognition": {
    573       "score": 1,
    574       "justification": "From Shanghai Jiao Tong University and NUS — recognized institutions but not household names in tech; tests well-known models (o4-mini, DeepSeek-R1, Gemini) but the lab itself lacks brand pull."
    575     }
    576   },
    577   "hn_data": {
    578     "threads": [
    579       {
    580         "hn_id": "43243109",
    581         "title": "An Attempt to Catch Up with JIT Compilers",
    582         "points": 203,
    583         "comments": 142,
    584         "url": "https://news.ycombinator.com/item?id=43243109",
    585         "created_at": "2025-03-03T16:06:50Z"
    586       },
    587       {
    588         "hn_id": "44433899",
    589         "title": "Converting a large mathematical software package written in C++ to C++20 modules",
    590         "points": 141,
    591         "comments": 42,
    592         "url": "https://news.ycombinator.com/item?id=44433899",
    593         "created_at": "2025-07-01T13:46:56Z"
    594       },
    595       {
    596         "hn_id": "46339300",
    597         "title": "Signaling in the Age of AI: Evidence from Cover Letters",
    598         "points": 17,
    599         "comments": 1,
    600         "url": "https://news.ycombinator.com/item?id=46339300",
    601         "created_at": "2025-12-20T20:23:28Z"
    602       },
    603       {
    604         "hn_id": "45472586",
    605         "title": "Physics of Learning: A Lagrangian perspective to different learning paradigms",
    606         "points": 3,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=45472586",
    609         "created_at": "2025-10-04T11:38:44Z"
    610       },
    611       {
    612         "hn_id": "47195084",
    613         "title": "Limitations on Safe, Trusted, Artificial General Intelligence",
    614         "points": 2,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=47195084",
    617         "created_at": "2026-02-28T13:25:35Z"
    618       },
    619       {
    620         "hn_id": "45418635",
    621         "title": "Can LLMs Be Creative? Paper: Combinatorial Creativity: A New Frontier",
    622         "points": 2,
    623         "comments": 0,
    624         "url": "https://news.ycombinator.com/item?id=45418635",
    625         "created_at": "2025-09-29T20:53:22Z"
    626       },
    627       {
    628         "hn_id": "24567265",
    629         "title": "Context-Theoretic Semantics for Natural Language: An Algebraic Framework (2007)",
    630         "points": 2,
    631         "comments": 0,
    632         "url": "https://news.ycombinator.com/item?id=24567265",
    633         "created_at": "2020-09-23T14:11:23Z"
    634       },
    635       {
    636         "hn_id": "46479718",
    637         "title": "FakeParts: A New Family of AI-Generated DeepFakes",
    638         "points": 1,
    639         "comments": 0,
    640         "url": "https://news.ycombinator.com/item?id=46479718",
    641         "created_at": "2026-01-03T18:14:11Z"
    642       },
    643       {
    644         "hn_id": "45069333",
    645         "title": "A multi-task neural network for atypical mitosis recognition under domain shift",
    646         "points": 1,
    647         "comments": 0,
    648         "url": "https://news.ycombinator.com/item?id=45069333",
    649         "created_at": "2025-08-29T21:00:57Z"
    650       }
    651     ],
    652     "top_points": 203,
    653     "total_points": 372,
    654     "total_comments": 185
    655   }
    656 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs