scan-v5.json (27160B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Learning \"Partner-Aware\" Collaborators in Multi-Party Collaboration", 6 "authors": [ 7 "Abhijnan Nath", 8 "Nikhil Krishnaswamy" 9 ], 10 "year": 2025, 11 "venue": "NeurIPS 2025", 12 "arxiv_id": "2510.22462", 13 "doi": "10.48550/arXiv.2510.22462" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "Abstract claims that ICR achieves superior common ground convergence and outperforms standard RLHF/DPO baselines are directly supported by Table 1 results across both tasks in full-press and no-press conditions.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "Causal claims that counterfactual KL regularization produces better collaboration are supported by ablation of λIntent values (Fig 1b) and the PPO-CF baseline, which isolates the contribution of the KL term from counterfactual prompting alone.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "Limitations section explicitly bounds claims to two task domains and 8B-scale models, noting untested conditions including Diplomacy, human collaborators, and larger-scale centralized training.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper does not adequately address the alternative that GPT-4o's role as both expert trajectory generator and fixed evaluation intervention agent inflates ICR performance via shared distributional priors with the base Llama-3-8B-Instruct model.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper explicitly distinguishes proxy training rewards (task accuracy only) from gold evaluation rewards (accuracy × common ground convergence), noting consensus reward was deliberately withheld during training to prevent reward hacking.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section 7 contains a dedicated multi-paragraph 'Limitations and Future Work' subsection covering compute constraints, task diversity, fixed intervention agent, and data bottlenecks.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Specific threats are named: 8B-scale training due to compute limits, only two task domains, GPT-4o's potential prior exposure to DeliData, and fixed GPT-4o intervention agent not reflecting real-world intervention diversity.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Results are explicitly limited to two domains, one intervention agent (GPT-4o), 8B models, and AI-AI collaborations; the paper explicitly notes human collaboration data as a bottleneck.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": true, 72 "justification": "Full funding disclosure in acknowledgments: DARPA FACT program (HR00112490377), NSF awards (DRL 2019805, DRL 2454151, IIS 2303019), ARO Knowledge Systems (W911NF-25-1-0096), and ARPA-H PARADIGM program.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Authors clearly affiliated with SIGNAL Lab, Department of Computer Science, Colorado State University, with institutional emails provided.", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "All funders (DARPA, NSF, ARO, ARPA-H) are US government agencies with no commercial stake in ICR or the evaluated models.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is provided anywhere in the paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Key terms are defined precisely: 'common ground' via Stalnaker (2002), 'partner-aware' as adapting to specific intervention agents, 'MAMDP' formally defined in Section 3, 'counterfactual invariance' via KL divergence in Eq. 4.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Three explicit bullet-pointed contributions are stated in the introduction: MAMDP+counterfactual invariance formulation, theoretical proofs of PPO/DPO suboptimality, and empirical ICR results on two collaborative tasks.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 2 substantively engages prior work on collaborative reasoning, preference-based RL, and safe interruptibility, explicitly contrasting ICR against PSO-INTENT and RLHF approaches rather than merely listing related papers.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "Code is released at https://github.com/csu-signal/ICR, cited in footnote 2 of the paper.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "Both evaluation benchmarks (DeliData from Karadzhov et al. 2023 and Weights Task from Khebour et al. 2024) are publicly available datasets.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "While specific libraries (PEFT, TRL SFTTrainer, bitsandbytes), GPU hardware (NVIDIA A100), and model IDs are mentioned, no requirements.txt, Dockerfile, or equivalent environment spec is provided.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "Algorithm 1 describes the pipeline conceptually and Section D details hyperparameters, but no step-by-step runnable instructions are provided; the paper says code will be in 'supplementary material' without confirming it is in the released repo.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": true, 147 "justification": "Table 1 reports standard errors (±) for all metrics across 100 evaluation dialogues; Fig 1b reports training curves across 3 random seeds.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": false, 153 "justification": "No statistical significance tests (t-test, ANOVA, etc.) are reported for comparative claims between ICR and baselines; only standard errors are provided.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Percentage improvements are reported explicitly (47% improvement over DPO on Weights Task, 14% on CG metric for DeliData, 300% difference for inequality propositions in Fig 1a).", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "100 evaluation dialogues per task is used with no power analysis or justification for sufficiency given the observed effect sizes and variance.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": true, 171 "justification": "Standard errors are reported in Table 1 across 100 dialogues and across 3 seeds in ablation Fig 1b.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Five baselines included: BC-COLLABORATOR, DPO, IPO, PPO, and PSO-INTENT, covering behavior cloning, offline preference RL, on-policy RL, and the most directly relevant prior work.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "DPO (2024), IPO (2024), and PSO-INTENT (2023) are all recent methods; PPO is the standard on-policy algorithm used as the underlying optimizer for ICR itself.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "Fig 1b ablates λIntent values (0.01, 0.2, 1.0); Appendix A includes PPO-CF (isolates KL term from counterfactual prompting) and ICR-Phrasing (tests prompt robustness).", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "DeliData uses both accuracy (ACC) and common ground gain (CG); Weights Task uses composite ACC; cumulative CG curves by proposition type are also provided (Fig 1a).", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section D.4 reports a human validation study with 2 annotators evaluating 200 intervention pairs, finding Cohen's κ=0.92 on DeliData and κ=0.58 on Weights Task.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": true, 209 "justification": "Training uses GPT-4o generated expert trajectories collected prior to evaluation; all evaluation uses 100 fresh dialogue runs not part of the training data.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Fig 1a breaks Weights Task results into equality, inequality, and order proposition types; results are reported separately for full-press vs. no-press conditions for both tasks.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Appendix E provides concrete examples of misleading interventions degrading collaborator performance, and cases where well-meaning interventions are incorrectly ignored.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "BC-COLLABORATOR shows negative CG (-0.13) in DeliData full-press; λIntent=0.01 severely hampers learning; PSO-Skeptical degrades performance relative to PSO-Intent.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": false, 235 "justification": "Llama-3-8B-Instruct is specified via HuggingFace ID, but GPT-4o is used without a specific API snapshot date, despite being central to both expert data generation and all evaluation runs.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": true, 241 "justification": "All prompts are provided verbatim in Appendix C (Figs. 2–10) for both tasks and both full-press/no-press conditions, including counterfactual prefixes and alternative phrasings (Table 4).", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section D fully reports LoRA settings (α=16, rank=8, dropout=0.05), optimizer (AdamW, cosine scheduler), learning rates, batch sizes, training steps, and sampling parameters.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": true, 252 "answer": true, 253 "justification": "The MAMDP turn-taking protocol, 15-turn structure, counterfactual prefix computation, and PPO rollout with single additional forward pass are described in detail in Sections 3–5 and Appendix C.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Algorithm 1 and Section C document the full data collection pipeline including bootstrap dialogue seeding, personality assignment, token length statistics (Table 5), and expert response parsing procedures.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": false, 267 "justification": "The GPT-4o generated expert trajectory corpus used for training is not explicitly confirmed to be released; regenerating it requires costly GPT-4o API calls with stochastic sampling making exact reproduction impossible.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Section C and Algorithm 1 describe data collection comprehensively: GPT-4o as expert for both roles, bootstrap seeding, personality sampling, 15-turn collection per dialogue, and preference annotation procedure.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": true, 278 "answer": false, 279 "justification": "Human annotators in Section D.4 are described only as 'two fluent English-speaking college undergraduates'; no recruitment procedure, compensation, or selection criteria is described.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "Algorithm 1 provides complete pseudocode of the full pipeline from dialogue seed initialization through expert trajectory collection to ICR training.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Training data cutoffs for GPT-4o and Llama-3-8B-Instruct are not stated despite both being evaluated on DeliData, a published dataset that likely predates their training.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": true, 298 "answer": true, 299 "justification": "Appendix A explicitly acknowledges 'GPT-4o's extensive pretraining on reasoning tasks, potentially including exposure to DeliData or DeliData-like problems' when interpreting GPT-4o paired results.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": true, 304 "answer": false, 305 "justification": "Contamination is acknowledged as a concern for GPT-4o baselines but no concrete mitigation (date-restricted models, held-out task variants, contamination tests) is performed for the primary evaluations.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": true, 312 "answer": false, 313 "justification": "No pre-registration is mentioned for the human annotation validation study in Section D.4.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": true, 318 "answer": false, 319 "justification": "No IRB or ethics approval is mentioned for the human annotator study in Section D.4; the NeurIPS checklist incorrectly states 'we do not conduct any human evaluations' contradicting the actual paper content.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": true, 324 "answer": false, 325 "justification": "Only minimal characterization is given ('two fluent English-speaking college undergraduates'); no age, gender, NLP background, or other demographics are reported.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": true, 330 "answer": false, 331 "justification": "No explicit inclusion/exclusion criteria are described for human annotators beyond the informal description 'fluent English-speaking college undergraduates.'", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "Randomization is not applicable to a pairwise quality annotation task with 2 annotators.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": true, 342 "answer": true, 343 "justification": "Section D.4 explicitly states annotators 'were not shown the GPT-4o reward scores or correct task solutions,' implementing effective blinding.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "With only 2 annotators completing the full study, attrition reporting is not applicable.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": false, 357 "justification": "No inference cost or GPT-4o API cost is reported despite GPT-4o being used for all expert trajectory generation and as the fixed evaluation intervention agent.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": true, 363 "justification": "Training compute is specified: ~12 GPU hours for standard baselines and ~24 hours for PPO models on NVIDIA A100s; full-press experiments require two A100s.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "ICR-trained collaborators achieve substantially higher accuracy and common ground convergence than all baselines, including a 47% improvement over DPO on the Weights Task.", 372 "evidence": "Table 1: ICR achieves 14.06±0.13 vs DPO 9.56±0.09 on Weights Task full-press; 3.35±0.19 vs 2.94±0.18 CG on DeliData full-press.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Standard preference-aligned collaborators (RLHF/DPO/IPO) are theoretically suboptimal in MAMDP settings because they treat interventions as static state information rather than causally evaluating them.", 377 "evidence": "Theorem 3.2 and Theorem B.3 formally prove this result; empirically all preference-aligned baselines underperform ICR across tasks.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Counterfactual invariance regularization produces emergent common ground convergence without explicit CG rewards during training.", 382 "evidence": "ICR trained with only task-accuracy proxy reward achieves CG=3.35 vs BC-COLLABORATOR's CG=-0.13; common ground-based rewards were deliberately withheld.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "ICR performance gains persist in language-free (no-press) conditions, demonstrating principled partner-awareness beyond language processing.", 387 "evidence": "ICR achieves 10.87±0.13 ACC in no-press vs 7.81±0.11 for PPO (next best) on Weights Task; 0.85±0.02 vs 0.78±0.03 on DeliData no-press.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Llama-3-8B trained with ICR performs comparably to GPT-4o acting as both agents, despite GPT-4o's distributional advantage.", 392 "evidence": "GPT-4o self-paired achieves 15.23±0.21 on Weights Task vs ICR (Llama-3-8B) at 14.06±0.13; gap is ~8% but GPT-4o may have DeliData contamination.", 393 "supported": "weak" 394 }, 395 { 396 "claim": "ICR is robust to counterfactual prefix phrasing variations, with near-zero mean log-probability response gaps across 6 semantic variants.", 397 "evidence": "Mean response gap of 0.0008 log-probability units (σ=0.1568) across 6 counterfactual phrasings on 50 contexts; untrained model shows 0.0247 mean gap.", 398 "supported": "moderate" 399 } 400 ], 401 "methodology_tags": [ 402 "benchmark-eval", 403 "theoretical" 404 ], 405 "key_findings": "ICR substantially outperforms all RLHF/DPO/IPO baselines on multi-party collaborative task performance and common ground convergence, with ~47% improvement over the best offline RL baseline on the Weights Task and 14% on the common ground metric for DeliData. The paper provides formal theoretical proof that preference-aligned LLMs are suboptimal in Modified-Action MDPs where interventions have causal structure, and demonstrates that counterfactual KL regularization produces emergent common ground alignment without explicit CG training rewards. Performance gains persist in language-free conditions and across alternative counterfactual prompt phrasings, suggesting the improvement reflects principled policy learning rather than language-specific surface effects.", 406 "red_flags": [ 407 { 408 "flag": "GPT-4o version unspecified", 409 "detail": "GPT-4o is used for all expert trajectory collection and as the fixed evaluation intervention agent across all 100+100 evaluation dialogues, but no API snapshot date is given, making exact reproduction impossible and contamination analysis incomplete." 410 }, 411 { 412 "flag": "No statistical significance tests", 413 "detail": "Despite strong comparative claims, the paper reports only standard errors over 100 dialogues without t-tests or other significance tests to confirm ICR improvements are statistically significant rather than within natural variation." 414 }, 415 { 416 "flag": "NeurIPS checklist inconsistency", 417 "detail": "The NeurIPS checklist (items 14-15) states 'we do not conduct any crowdsourcing or human evaluations' but Section D.4 clearly reports a human annotation study with 2 annotators evaluating 200 intervention pairs." 418 }, 419 { 420 "flag": "Training corpus reproducibility", 421 "detail": "Expert trajectories generated via GPT-4o API constitute the training corpus but are not explicitly released; regenerating requires costly API calls with stochastic outputs, making exact reproduction practically impossible." 422 }, 423 { 424 "flag": "Contamination not mitigated", 425 "detail": "GPT-4o's potential prior exposure to DeliData (a published HCI dataset from 2023) is acknowledged in the appendix but not formally tested or mitigated for the primary evaluation runs." 426 }, 427 { 428 "flag": "Human validation understaffed", 429 "detail": "The human validation study (Section D.4) uses only 2 annotators to evaluate 200 intervention pairs; this is insufficient to establish reliable inter-rater statistics and contradicts the paper's own NeurIPS checklist." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "How RL agents behave when their actions are modified", 435 "relevance": "Foundation of the MAMDP framework used throughout; establishes Bellman-optimal agents are suboptimal when actions can be modified by another strategic agent." 436 }, 437 { 438 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 439 "relevance": "Key baseline (DPO) and the primary preference-alignment paradigm the paper critiques as structurally suboptimal for collaborative settings." 440 }, 441 { 442 "title": "Safely interruptible agents", 443 "relevance": "Core safety literature motivating the 'interruptible' design goal; paper extends safe interruptibility from RL safety to multi-party collaborative dialogue." 444 }, 445 { 446 "title": "Honesty is the best policy: defining and mitigating AI deception", 447 "relevance": "Source of PSO-INTENT baseline and the 'intentionality' concept that ICR operationalizes via counterfactual KL regularization." 448 }, 449 { 450 "title": "DeliData: A dataset for deliberation in multi-party problem solving", 451 "relevance": "Primary evaluation dataset providing the Wason Card Selection collaborative task used in main experiments." 452 }, 453 { 454 "title": "When text and speech are not enough: A multimodal dataset of collaboration in a situated task", 455 "relevance": "Source of the Weights Task, the second evaluation domain for ICR experiments." 456 }, 457 { 458 "title": "Path-specific objectives for safer agent incentives", 459 "relevance": "Provides the counterfactual influence pathway concept that ICR's counterfactual regularization is derived from." 460 }, 461 { 462 "title": "Proximal Policy Optimization Algorithms", 463 "relevance": "The base RL algorithm used for ICR training and as a direct baseline; also relevant to the broader agentic LLM training literature." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "Applicable to AI tutoring and collaborative work settings, though deployment requires RL training infrastructure and GPT-4o API access for expert trajectory generation." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "The formal proof that RLHF/DPO agents are structurally suboptimal for multi-party collaboration challenges the dominant preference alignment paradigm with a principled theoretical argument." 474 }, 475 "fear_safety": { 476 "score": 1, 477 "justification": "Raises concerns about partner-aware LLMs being misused for covert manipulation, with explicit discussion linking to sleeper agents and alignment faking in limitations." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "The claim that 8B Llama trained with ICR approaches GPT-4o performance on these tasks creates modest controversy, though the comparison is carefully caveated." 482 }, 483 "demo_ability": { 484 "score": 1, 485 "justification": "Code is available on GitHub but requires RL training infrastructure, GPU access, and GPT-4o API calls; not easily demonstrable without significant setup." 486 }, 487 "brand_recognition": { 488 "score": 1, 489 "justification": "Colorado State University SIGNAL Lab is a legitimate research group but lacks the brand recognition of major AI labs; NeurIPS 2025 acceptance provides venue credibility." 490 } 491 }, 492 "hn_data": { 493 "threads": [], 494 "top_points": 0, 495 "total_points": 0, 496 "total_comments": 0 497 } 498 }