ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29436B)


      1 {
      2   "paper": {
      3     "title": "Learning \"Partner-Aware\" Collaborators in Multi-Party Collaboration",
      4     "authors": ["Abhijnan Nath", "Nikhil Krishnaswamy"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025",
      7     "arxiv_id": "2510.22462",
      8     "doi": "10.48550/arXiv.2510.22462"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper states 'Our code is available at https://github.com/csu-signal/ICR' in Section 1 (footnote 2)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available datasets: DeliData (Karadzhov et al., 2023) and the Weights Task (Khebour et al., 2024a). Bootstrap dialogues are drawn from these public sources."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Section D.1 mentions specific libraries (PEFT, TRL, bitsandbytes, SFTTrainer) and hardware (NVIDIA A100), but no requirements.txt, Dockerfile, or comprehensive environment specification with library versions is provided."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions 'Code to run the experiment will be provided in supplementary material' but no step-by-step reproduction instructions are included in the paper itself. The GitHub repo is referenced but no README or reproduction guide is described."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Table 1 reports ± standard error for all metrics across all baselines (e.g., 'ICR 14.06±0.13'). Table 2 similarly reports ± values."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Despite claiming ICR 'consistently outperforms all baselines,' no statistical significance tests (t-tests, bootstrap tests, etc.) are reported. Comparisons are based solely on point estimates with standard errors."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports percentage improvements with baseline context, e.g., 'ICR agents achieve an high accuracy of 14.06, which represents a dramatic 47% improvement over the next best performer (DPO, at 9.56)' in Section 6."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper uses 100 dialogues per task for evaluation (50 for supplementary experiments) but provides no justification for why these sample sizes are sufficient. No power analysis is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 1 reports standard error across 100 collaboration trials. Figure 1b shows ablation results across 3 random seeds. Item 7 of the NeurIPS checklist states 'we report the standard error over 100 collaboration trials.'"
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 5 describes five baselines: BC-COLLABORATOR, DPO, IPO, PPO, and PSO-INTENT. Section A adds additional baselines including GPT-4o-based comparisons."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include DPO (Rafailov et al., 2024b), IPO (Azar et al., 2024), and PPO (Schulman et al., 2017), all standard and contemporary for LLM alignment. PSO-INTENT (Ward et al., 2023) is also recent and relevant."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Figure 1b presents an ablation on λ_Intent values (0.01, 0.2, 1.0) across 3 random seeds over 8k training steps. Section A includes ablations: ICR-Masked, ICR-Small, ICR-Phrasing, PPO-CF."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper reports ACC (accuracy) and CG (common ground gain) for DeliData, and ACC for Weights Task. Figure 1a further breaks down results by proposition type (equality, inequality, order)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section D.4 describes human validation where two annotators evaluated 200 interventions (100 pairs) across both tasks, measuring agreement with LLM-Judge rankings (Cohen's κ = 0.92 on DeliData, κ = 0.58 on Weights Task)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section C states 'training/evaluation splits for both datasets are consistent with prior work [Nath et al., 2024].' Evaluation uses 100 dialogues initialized from bootstrap dialogues, separate from training data."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Figure 1a provides per-relation-type breakdowns (equality, inequality, order propositions) for the Weights Task. Table 1 breaks down by full-press vs. no-press conditions and by task."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section E discusses adoption effects including a 'Misleading Intervention → Poor Outcome' example and an 'Ignored Intervention' example showing cases where the approach produces suboptimal results."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that BC-COLLABORATOR achieves negative CG values (-0.13, -0.15), showing that behavior cloning reduces solution diversity. Footnote 8 mentions that 'rewarding agents with a consensus signal is counterproductive.' PSO-Skeptical shows degraded performance."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims ICR 'is more capable of promoting successful CG convergence and exploring more diverse solutions,' which is supported by Table 1 showing ICR outperforming all baselines on both ACC and CG metrics across both tasks."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper makes causal claims about ICR's counterfactual regularization causing improved performance. The ablation study (Fig 1b, varying λ_Intent) and controlled comparisons (PPO-CF vs ICR in Table 2) provide evidence for the causal mechanism through controlled single-variable manipulation."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title claims 'Multi-Party Collaboration' generally but results are limited to two specific tasks (Wason Card Selection and Weights Task) with one base model (Llama-3-8B-Instruct) and one fixed intervention agent (GPT-4o). The limitations section acknowledges this ('we could only test our method on two collaborative domains') but the title and abstract framing is broader."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section A discusses alternative explanations: GPT-4o's potential prior exposure to DeliData tasks, the effect of shared underlying distributions when GPT-4o serves as both agents, LLM limitations with negation (for PSO-Skeptical results), and whether prompt augmentation alone (PPO-CF) can explain results."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly distinguishes proxy training rewards (task-specific accuracy) from gold evaluation rewards (composite of correctness and common ground). Section D.2 describes this in detail: 'we compute a composite reward of task-specific accuracy and common-ground convergence since this accurately measures the quality of the collaborator, and therefore can be treated as the \"gold reward.\"'"
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper specifies 'Meta-Llama-3-8B-Instruct' and 'GPT-4o' but provides no API version or snapshot date for GPT-4o. 'GPT-4o' is a product alias whose underlying model snapshot changes over time."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The appendix (Figs. 2-10) provides complete prompt text for all experimental conditions: expert collaborator prompts, intervention agent prompts, full-press and no-press collaborator prompts, counterfactual prompts, and final submission prompts. Table 4 lists all counterfactual prefix variants."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section D.1 provides detailed hyperparameters: LoRA α=16, dropout=0.05, rank R=8, 4-bit quantization, AdamW optimizer, cosine scheduler, weight decay 0.05, 100 warmup steps, β=0.1 for DPO/IPO, learning rates, batch sizes, max token lengths. GPT-4o sampling: T=0, top-p=0.9."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The MAMDP interaction framework is described in detail in Section 3: turn-taking between collaborator and intervention agents over T=15 turns, with the collaborator responding to interventions. Algorithm 1 details the full training pipeline."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section C describes data collection: bootstrap dialogues from original task datasets, expert trajectories via GPT-4o roleplay, parsing of per-participant utterances, and preference data generation (West-of-N sampling for full-press, synthetic swaps for no-press). Section D.2 details the reward computation pipeline."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "A dedicated 'Limitations and Future Work' section appears after Section 7 (Conclusion), with substantial discussion of constraints including compute budgets, fixed intervention agent, limited domains, and lack of human collaboration data."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The limitations section identifies specific threats: only 8B-scale models tested due to compute budgets, fixed GPT-4o as intervention agent (real-world interventions vary), only two collaborative domains tested, and lack of LLM-scale human-collaboration data as a bottleneck."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states what was NOT tested: larger models, human interventions, more domains (e.g., Diplomacy), ad hoc collaborators, centralized coordination, multimodal interaction. The limitations section asks 'how would ICR perform in more challenging domains like Diplomacy?'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The expert trajectory data generated via GPT-4o is not released. Only the code repository is provided. The underlying DeliData and Weights Task datasets are public, but the generated MAMDP interaction trajectories used for training are not made available."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 5 and Section C describe data collection in detail: GPT-4o generates both intervener and collaborator responses over 15 turns, using specific prompts (Figs. 2-6), with personality traits sampled from a pool (Table 3), and bootstrap dialogues from original task datasets."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section D.4 describes human annotator recruitment: 'Two human annotators—both fluent English-speaking college undergraduates.' For data generation, the expert model (GPT-4o) and its configuration are described. No human participants were recruited for the main study."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Algorithm 1 documents the full data pipeline: Phase 1 collects expert trajectories, then per-participant utterances are parsed from continuations (Section C), preference data is generated via West-of-N sampling or synthetic swaps, and Phase 2 trains using PPO with the documented loss function."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgments section lists funding from DARPA (HR00112490377), NSF (DRL 2019805, DRL 2454151, IIS 2303019), ARO (W911NF-25-1-0096), and ARPA-H (1AY2AX000062)."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Authors are from the SIGNAL Lab, Department of Computer Science, Colorado State University. No commercial product is being evaluated—the paper proposes a new method. Affiliations are clearly stated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funding is from government agencies (DARPA, NSF, ARO, ARPA-H) which have no financial stake in whether ICR outperforms baselines. The paper includes 'Views expressed herein do not reflect the policy or position of the National Science Foundation, the Department of Defense, or the U.S. Government.'"
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper. While this is an academic paper from a university, the absence of an explicit declaration is noted."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses GPT-4o for data generation and as the fixed intervention agent but does not state GPT-4o's training data cutoff date. It also uses Llama-3-8B-Instruct without stating its training cutoff."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section A acknowledges GPT-4o's 'extensive pretraining on reasoning tasks, potentially including exposure to DeliData or DeliData-like problems' and discusses this as a confounder for the GPT-4o baseline results."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "While the paper acknowledges GPT-4o may have seen DeliData, it does not address whether the Llama-3-8B-Instruct model may have been exposed to these tasks during pretraining, nor does it apply any contamination detection methods."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in the main study. The human validation in D.4 is a small annotation task, not a human subjects experiment."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in the main study. The NeurIPS checklist confirms 'No human experiments are conducted in our work.'"
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in the main study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in the main study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in the main study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in the main study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in the main study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost or API cost is reported despite extensive use of GPT-4o for data generation and as the fixed intervention agent during evaluation across 100+ dialogues per condition."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Section D.1 states: 'Training standard baselines for 2,000 steps typically required around 12 GPU hours, while PPO models—trained over 6,000 mini-batches with an effective batch size of 8—took approximately 24 hours to converge.' Hardware is specified as NVIDIA A100 GPUs."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Figure 1b shows ablation results 'across 3 random seeds.' The shaded regions in the figure show variance across seeds. Table 1 reports standard error across 100 dialogue trials."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 5 states '100 DeliData and 100 Weights Task dialogues' for evaluation. Section 6 confirms 'over 100 collaboration runs.' Figure 1b states '3 random seeds.'"
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section D.1 mentions 'Based on early validation experiments on the DeliData task, we found β = 0.1 to yield consistently strong performance' but does not report how many configurations were tried or the search budget."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "For λ_Intent, Figure 1b shows results across three values (0.01, 0.2, 1.0) with the selection of 0.2 justified by the learning curves. For β, D.1 states it was selected via 'early validation experiments on the DeliData task.'"
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper compares ICR against 5 baselines across multiple metrics and conditions without applying any multiple comparison correction (Bonferroni, etc.)."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement all baselines (BC, DPO, IPO, PPO, PSO-INTENT) themselves and compare against their own ICR method, but do not acknowledge the bias of evaluating their own system against their own implementations of baselines."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "ICR adds an additional forward pass per sample compared to PPO (Section 4, Computational Cost) but no performance-vs-compute curve is provided. The paper does not compare methods at matched compute budgets."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper discusses why DeliData (Wason Card Selection) and Weights Task are appropriate for testing collaborative reasoning, and Section 5 explains the choice of evaluation metrics (ACC, CG) and their relationship to the claimed capabilities."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "All baselines use the same base model (Llama-3-8B-Instruct), same intervention agent (GPT-4o), same evaluation setup, and same prompt structure. The only variable is the training algorithm. The scaffold is controlled across comparisons."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Section A acknowledges 'GPT-4o's extensive pretraining on reasoning tasks, potentially including exposure to DeliData or DeliData-like problems' as a confounder, constituting awareness of temporal leakage risk."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. For example, the system prompts for evaluation include task-specific information, but no analysis of whether this constitutes feature leakage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The bootstrap dialogues for evaluation are drawn from the same datasets used for training data collection. No discussion of whether train and test dialogues share structural similarities or are independent."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination)."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "ICR-trained collaborators achieve 47% improvement over the next best performer (DPO) on the Weights Task in full-press conditions (14.06 vs 9.56 ACC).",
    363       "evidence": "Table 1, Section 6. Results averaged over 100 dialogues with 15 turns each, reported with standard error.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Standard RL/preference alignment methods (PPO, DPO) are Bellman-optimal for the underlying MDP but suboptimal in the MAMDP setting.",
    368       "evidence": "Theorem 3.2 and proof in Theorem B.3, building on Langlois and Everitt (2021). Empirical validation in Table 1 showing all standard baselines underperform ICR.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "ICR's performance improvements hold across both language-rich (full-press) and language-free (no-press) conditions.",
    373       "evidence": "Table 1 shows ICR outperforms all baselines in both full-press and no-press settings for both tasks. No-press: 10.87 ACC Weights (vs 7.81 PPO), 0.85 ACC DeliData (vs 0.79 DPO).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Llama-3-8B-Instruct trained with ICR performs comparably with GPT-4o acting as both agents.",
    378       "evidence": "Table 2, Section A. ICR (14.06 Weights, 0.88 DeliData) vs GPT-4o paired (15.23 Weights, 0.91 DeliData). Close but GPT-4o still higher.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "λ_Intent = 0.2 provides optimal learning, with too low (0.01) or too high (1.0) values degrading performance.",
    383       "evidence": "Figure 1b ablation over 8k training steps with 3 random seeds on DeliData no-press variant.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Common ground convergence arises as an emergent property of counterfactual training, without any explicit training signal for it.",
    388       "evidence": "Section 6: ICR agents trained with only proxy task rewards (no CG signal) achieve highest CG scores in Table 1 (3.35 full-press, 3.18 no-press on DeliData).",
    389       "supported": "strong"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval", "theoretical"],
    393   "key_findings": "ICR (Interruptible Collaborative Roleplayer) introduces counterfactual KL-divergence regularization to train LLM-based collaborators that selectively incorporate helpful interventions while resisting misleading ones. On the DeliData Wason Card Selection task and Weights Task, ICR consistently outperforms BC, DPO, IPO, PPO, and PSO-INTENT baselines in both task accuracy and common ground convergence. Notably, common ground convergence emerges without explicit CG-based training rewards, supporting the hypothesis that counterfactual robustness induces partner-aware collaboration. A Llama-3-8B model trained with ICR performs comparably to GPT-4o paired with itself.",
    394   "red_flags": [
    395     {
    396       "flag": "No statistical significance tests",
    397       "detail": "Despite strong claims of superiority ('consistently outperforms all baselines'), no statistical significance tests are reported. Comparisons rely on point estimates with standard errors. Given the variability across runs, some comparisons (e.g., DPO vs IPO vs PPO in DeliData) may not be statistically significant."
    398     },
    399     {
    400       "flag": "Fixed intervention agent introduces confound",
    401       "detail": "All evaluation uses a single fixed GPT-4o instance as the intervention agent. Results may not generalize to other intervention sources. The paper acknowledges this in limitations but the main claims are not bounded to this setting."
    402     },
    403     {
    404       "flag": "GPT-4o version unspecified",
    405       "detail": "GPT-4o is used extensively for data generation, as the fixed intervention agent, and as the LLM-Judge for reward modeling, but no API version or snapshot date is provided. GPT-4o behavior changes across versions."
    406     }
    407   ],
    408   "cited_papers": [
    409     {
    410       "title": "How RL agents behave when their actions are modified",
    411       "authors": ["Eric D Langlois", "Tom Everitt"],
    412       "year": 2021,
    413       "relevance": "Core theoretical foundation for the MAMDP framework used in ICR; proves Bellman-optimal policies are suboptimal in MAMDPs."
    414     },
    415     {
    416       "title": "Honesty is the best policy: defining and mitigating AI deception",
    417       "authors": ["Francis Ward", "Francesca Toni", "Francesco Belardinelli", "Tom Everitt"],
    418       "year": 2023,
    419       "relevance": "Provides the 'intentionality' concept and Path-Specific Objectives that ICR extends for counterfactual regularization in collaborative settings."
    420     },
    421     {
    422       "title": "Direct preference optimization: Your language model is secretly a reward model",
    423       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    424       "year": 2024,
    425       "relevance": "Key baseline method; ICR demonstrates limitations of DPO in collaborative MAMDP settings."
    426     },
    427     {
    428       "title": "Safely interruptible agents",
    429       "authors": ["Laurent Orseau", "M Armstrong"],
    430       "year": 2016,
    431       "relevance": "Foundational work on safe interruptibility in RL that ICR builds upon for collaborative agent design."
    432     },
    433     {
    434       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    435       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    436       "year": 2024,
    437       "relevance": "Cited in safety discussion about risks of partner-aware LLMs potentially being used for covert collusion or manipulation."
    438     },
    439     {
    440       "title": "Alignment faking in large language models",
    441       "authors": ["Ryan Greenblatt", "Carson Denison", "Benjamin Wright"],
    442       "year": 2024,
    443       "relevance": "Cited regarding risks that partner-aware agents could fake alignment; relevant to AI safety evaluation."
    444     },
    445     {
    446       "title": "Social influence as intrinsic motivation for multi-agent deep reinforcement learning",
    447       "authors": ["Natasha Jaques", "Angeliki Lazaridou", "Edward Hughes"],
    448       "year": 2019,
    449       "relevance": "Prior work on multi-agent RL with social influence that ICR extends by addressing intervention quality discrimination."
    450     },
    451     {
    452       "title": "Delidata: A dataset for deliberation in multi-party problem solving",
    453       "authors": ["Georgi Karadzhov", "Tom Stafford", "Andreas Vlachos"],
    454       "year": 2023,
    455       "relevance": "Primary evaluation dataset; provides the Wason Card Selection task dialogues used for both training and evaluation."
    456     },
    457     {
    458       "title": "Language instructed reinforcement learning for human-ai coordination",
    459       "authors": ["Hengyuan Hu", "Dorsa Sadigh"],
    460       "year": 2023,
    461       "relevance": "InstructRL method for human-AI coordination that ICR builds upon for incorporating human priors."
    462     },
    463     {
    464       "title": "From r to Q*: Your Language Model is Secretly a Q-Function",
    465       "authors": ["Rafael Rafailov", "Joey Hejna", "Ryan Park", "Chelsea Finn"],
    466       "year": 2024,
    467       "relevance": "Provides token-MDP framework showing DPO performs credit assignment; theoretical foundation for ICR's analysis of preference-aligned policy limitations."
    468     },
    469     {
    470       "title": "Path-specific objectives for safer agent incentives",
    471       "authors": ["Sebastian Farquhar", "Ryan Carey", "Tom Everitt"],
    472       "year": 2022,
    473       "relevance": "Introduces Path-Specific Objectives used to assign 'intention' to parametric agents; ICR extends this for collaborative settings."
    474     }
    475   ]
    476 }

Impressum · Datenschutz