scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27160B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Learning \"Partner-Aware\" Collaborators in Multi-Party Collaboration",
      6     "authors": [
      7       "Abhijnan Nath",
      8       "Nikhil Krishnaswamy"
      9     ],
     10     "year": 2025,
     11     "venue": "NeurIPS 2025",
     12     "arxiv_id": "2510.22462",
     13     "doi": "10.48550/arXiv.2510.22462"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims that ICR achieves superior common ground convergence and outperforms standard RLHF/DPO baselines are directly supported by Table 1 results across both tasks in full-press and no-press conditions.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Causal claims that counterfactual KL regularization produces better collaboration are supported by ablation of λIntent values (Fig 1b) and the PPO-CF baseline, which isolates the contribution of the KL term from counterfactual prompting alone.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Limitations section explicitly bounds claims to two task domains and 8B-scale models, noting untested conditions including Diplomacy, human collaborators, and larger-scale centralized training.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper does not adequately address the alternative that GPT-4o's role as both expert trajectory generator and fixed evaluation intervention agent inflates ICR performance via shared distributional priors with the base Llama-3-8B-Instruct model.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper explicitly distinguishes proxy training rewards (task accuracy only) from gold evaluation rewards (accuracy × common ground convergence), noting consensus reward was deliberately withheld during training to prevent reward hacking.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 7 contains a dedicated multi-paragraph 'Limitations and Future Work' subsection covering compute constraints, task diversity, fixed intervention agent, and data bottlenecks.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are named: 8B-scale training due to compute limits, only two task domains, GPT-4o's potential prior exposure to DeliData, and fixed GPT-4o intervention agent not reflecting real-world intervention diversity.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Results are explicitly limited to two domains, one intervention agent (GPT-4o), 8B models, and AI-AI collaborations; the paper explicitly notes human collaboration data as a bottleneck.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Full funding disclosure in acknowledgments: DARPA FACT program (HR00112490377), NSF awards (DRL 2019805, DRL 2454151, IIS 2303019), ARO Knowledge Systems (W911NF-25-1-0096), and ARPA-H PARADIGM program.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Authors clearly affiliated with SIGNAL Lab, Department of Computer Science, Colorado State University, with institutional emails provided.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All funders (DARPA, NSF, ARO, ARPA-H) are US government agencies with no commercial stake in ICR or the evaluated models.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is provided anywhere in the paper.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are defined precisely: 'common ground' via Stalnaker (2002), 'partner-aware' as adapting to specific intervention agents, 'MAMDP' formally defined in Section 3, 'counterfactual invariance' via KL divergence in Eq. 4.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit bullet-pointed contributions are stated in the introduction: MAMDP+counterfactual invariance formulation, theoretical proofs of PPO/DPO suboptimality, and empirical ICR results on two collaborative tasks.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 substantively engages prior work on collaborative reasoning, preference-based RL, and safe interruptibility, explicitly contrasting ICR against PSO-INTENT and RLHF approaches rather than merely listing related papers.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Code is released at https://github.com/csu-signal/ICR, cited in footnote 2 of the paper.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Both evaluation benchmarks (DeliData from Karadzhov et al. 2023 and Weights Task from Khebour et al. 2024) are publicly available datasets.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "While specific libraries (PEFT, TRL SFTTrainer, bitsandbytes), GPU hardware (NVIDIA A100), and model IDs are mentioned, no requirements.txt, Dockerfile, or equivalent environment spec is provided.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Algorithm 1 describes the pipeline conceptually and Section D details hyperparameters, but no step-by-step runnable instructions are provided; the paper says code will be in 'supplementary material' without confirming it is in the released repo.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Table 1 reports standard errors (±) for all metrics across 100 evaluation dialogues; Fig 1b reports training curves across 3 random seeds.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No statistical significance tests (t-test, ANOVA, etc.) are reported for comparative claims between ICR and baselines; only standard errors are provided.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Percentage improvements are reported explicitly (47% improvement over DPO on Weights Task, 14% on CG metric for DeliData, 300% difference for inequality propositions in Fig 1a).",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "100 evaluation dialogues per task are used, with no power analysis or justification that this sample is sufficient given the observed effect sizes and variance.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Standard errors are reported in Table 1 across 100 dialogues and across 3 seeds in ablation Fig 1b.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Five baselines included: BC-COLLABORATOR, DPO, IPO, PPO, and PSO-INTENT, covering behavior cloning, offline preference RL, on-policy RL, and the most directly relevant prior work.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "DPO (2024), IPO (2024), and PSO-INTENT (2023) are all recent methods; PPO is the standard on-policy algorithm used as the underlying optimizer for ICR itself.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Fig 1b ablates λIntent values (0.01, 0.2, 1.0); Appendix A includes PPO-CF (isolates KL term from counterfactual prompting) and ICR-Phrasing (tests prompt robustness).",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "DeliData uses both accuracy (ACC) and common ground gain (CG); Weights Task uses composite ACC; cumulative CG curves by proposition type are also provided (Fig 1a).",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Section D.4 reports a human validation study with 2 annotators evaluating 200 intervention pairs, finding Cohen's κ=0.92 on DeliData and κ=0.58 on Weights Task.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Training uses GPT-4o generated expert trajectories collected prior to evaluation; all evaluation uses 100 fresh dialogue runs not part of the training data.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Fig 1a breaks Weights Task results into equality, inequality, and order proposition types; results are reported separately for full-press vs. no-press conditions for both tasks.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Appendix E provides concrete examples of misleading interventions degrading collaborator performance, and cases where well-meaning interventions are incorrectly ignored.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "BC-COLLABORATOR shows negative CG (-0.13) in DeliData full-press; λIntent=0.01 severely hampers learning; PSO-Skeptical degrades performance relative to PSO-Intent.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "Llama-3-8B-Instruct is specified via HuggingFace ID, but GPT-4o is used without a specific API snapshot date, despite being central to both expert data generation and all evaluation runs.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "All prompts are provided verbatim in Appendix C (Figs. 2–10) for both tasks and both full-press/no-press conditions, including counterfactual prefixes and alternative phrasings (Table 4).",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Section D fully reports LoRA settings (α=16, rank=8, dropout=0.05), optimizer (AdamW, cosine scheduler), learning rates, batch sizes, training steps, and sampling parameters.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "The MAMDP turn-taking protocol, 15-turn structure, counterfactual prefix computation, and PPO rollout with single additional forward pass are described in detail in Sections 3–5 and Appendix C.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Algorithm 1 and Section C document the full data collection pipeline including bootstrap dialogue seeding, personality assignment, token length statistics (Table 5), and expert response parsing procedures.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "The GPT-4o generated expert trajectory corpus used for training is not explicitly confirmed to be released; regenerating it requires costly GPT-4o API calls with stochastic sampling making exact reproduction impossible.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section C and Algorithm 1 describe data collection comprehensively: GPT-4o as expert for both roles, bootstrap seeding, personality sampling, 15-turn collection per dialogue, and preference annotation procedure.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": true,
    278           "answer": false,
    279           "justification": "Human annotators in Section D.4 are described only as 'two fluent English-speaking college undergraduates'; no recruitment procedure, compensation, or selection criteria are described.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Algorithm 1 provides complete pseudocode of the full pipeline from dialogue seed initialization through expert trajectory collection to ICR training.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "Training data cutoffs for GPT-4o and Llama-3-8B-Instruct are not stated despite both being evaluated on DeliData, a published dataset that likely predates their training.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": true,
    299           "justification": "Appendix A explicitly acknowledges 'GPT-4o's extensive pretraining on reasoning tasks, potentially including exposure to DeliData or DeliData-like problems' when interpreting GPT-4o paired results.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "Contamination is acknowledged as a concern for GPT-4o baselines but no concrete mitigation (date-restricted models, held-out task variants, contamination tests) is performed for the primary evaluations.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": true,
    312           "answer": false,
    313           "justification": "No pre-registration is mentioned for the human annotation validation study in Section D.4.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "No IRB or ethics approval is mentioned for the human annotator study in Section D.4; the NeurIPS checklist incorrectly states 'we do not conduct any human evaluations' contradicting the actual paper content.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": true,
    324           "answer": false,
    325           "justification": "Only minimal characterization is given ('two fluent English-speaking college undergraduates'); no age, gender, NLP background, or other demographics are reported.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": true,
    330           "answer": false,
    331           "justification": "No explicit inclusion/exclusion criteria are described for human annotators beyond the informal description 'fluent English-speaking college undergraduates.'",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "Randomization is not applicable to a pairwise quality annotation task with 2 annotators.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": true,
    342           "answer": true,
    343           "justification": "Section D.4 explicitly states annotators 'were not shown the GPT-4o reward scores or correct task solutions,' implementing effective blinding.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "With only 2 annotators completing the full study, attrition reporting is not applicable.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost or GPT-4o API cost is reported despite GPT-4o being used for all expert trajectory generation and as the fixed evaluation intervention agent.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Training compute is specified: ~12 GPU hours for standard baselines and ~24 hours for PPO models on NVIDIA A100s; full-press experiments require two A100s.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "ICR-trained collaborators achieve substantially higher accuracy and common ground convergence than all baselines, including a 47% improvement over DPO on the Weights Task.",
    372       "evidence": "Table 1: ICR achieves 14.06±0.13 vs DPO 9.56±0.09 on Weights Task full-press; 3.35±0.19 vs 2.94±0.18 CG on DeliData full-press.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Standard preference-aligned collaborators (RLHF/DPO/IPO) are theoretically suboptimal in MAMDP settings because they treat interventions as static state information rather than causally evaluating them.",
    377       "evidence": "Theorem 3.2 and Theorem B.3 formally prove this result; empirically all preference-aligned baselines underperform ICR across tasks.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Counterfactual invariance regularization produces emergent common ground convergence without explicit CG rewards during training.",
    382       "evidence": "ICR trained with only task-accuracy proxy reward achieves CG=3.35 vs BC-COLLABORATOR's CG=-0.13; common ground-based rewards were deliberately withheld.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "ICR performance gains persist in language-free (no-press) conditions, demonstrating principled partner-awareness beyond language processing.",
    387       "evidence": "ICR achieves 10.87±0.13 ACC in no-press vs 7.81±0.11 for PPO (next best) on Weights Task; 0.85±0.02 vs 0.78±0.03 on DeliData no-press.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Llama-3-8B trained with ICR performs comparably to GPT-4o acting as both agents, despite GPT-4o's distributional advantage.",
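    392       "evidence": "GPT-4o self-paired achieves 15.23±0.21 on Weights Task vs ICR (Llama-3-8B) at 14.06±0.13, a gap of ~8%; separately, GPT-4o may have had prior exposure to DeliData, so the contamination caveat applies to DeliData comparisons rather than to this Weights Task figure.",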
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "ICR is robust to counterfactual prefix phrasing variations, with near-zero mean log-probability response gaps across 6 semantic variants.",
    397       "evidence": "Mean response gap of 0.0008 log-probability units (σ=0.1568) across 6 counterfactual phrasings on 50 contexts; untrained model shows 0.0247 mean gap.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "benchmark-eval",
    403     "theoretical"
    404   ],
    405   "key_findings": "ICR substantially outperforms all RLHF/DPO/IPO baselines on multi-party collaborative task performance and common ground convergence, with ~47% improvement over the best offline RL baseline on the Weights Task and 14% on the common ground metric for DeliData. The paper provides formal theoretical proof that preference-aligned LLMs are suboptimal in Modified-Action MDPs where interventions have causal structure, and demonstrates that counterfactual KL regularization produces emergent common ground alignment without explicit CG training rewards. Performance gains persist in language-free conditions and across alternative counterfactual prompt phrasings, suggesting the improvement reflects principled policy learning rather than language-specific surface effects.",
    406   "red_flags": [
    407     {
    408       "flag": "GPT-4o version unspecified",
    409       "detail": "GPT-4o is used for all expert trajectory collection and as the fixed evaluation intervention agent across all 100+100 evaluation dialogues, but no API snapshot date is given, making exact reproduction impossible and contamination analysis incomplete."
    410     },
    411     {
    412       "flag": "No statistical significance tests",
    413       "detail": "Despite strong comparative claims, the paper reports only standard errors over 100 dialogues without t-tests or other significance tests to confirm ICR improvements are statistically significant rather than within natural variation."
    414     },
    415     {
    416       "flag": "NeurIPS checklist inconsistency",
    417       "detail": "The NeurIPS checklist (items 14-15) states 'we do not conduct any crowdsourcing or human evaluations' but Section D.4 clearly reports a human annotation study with 2 annotators evaluating 200 intervention pairs."
    418     },
    419     {
    420       "flag": "Training corpus reproducibility",
    421       "detail": "Expert trajectories generated via GPT-4o API constitute the training corpus but are not explicitly released; regenerating requires costly API calls with stochastic outputs, making exact reproduction practically impossible."
    422     },
    423     {
    424       "flag": "Contamination not mitigated",
    425       "detail": "GPT-4o's potential prior exposure to DeliData (a published HCI dataset from 2023) is acknowledged in the appendix but not formally tested or mitigated for the primary evaluation runs."
    426     },
    427     {
    428       "flag": "Human validation understaffed",
    429       "detail": "The human validation study (Section D.4) uses only 2 annotators to evaluate 200 intervention pairs; two annotators are too few to establish reliable inter-rater statistics, and the existence of the study itself contradicts the paper's own NeurIPS checklist, which states that no human evaluations were conducted."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "How RL agents behave when their actions are modified",
    435       "relevance": "Foundation of the MAMDP framework used throughout; establishes Bellman-optimal agents are suboptimal when actions can be modified by another strategic agent."
    436     },
    437     {
    438       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    439       "relevance": "Key baseline (DPO) and the primary preference-alignment paradigm the paper critiques as structurally suboptimal for collaborative settings."
    440     },
    441     {
    442       "title": "Safely interruptible agents",
    443       "relevance": "Core safety literature motivating the 'interruptible' design goal; paper extends safe interruptibility from RL safety to multi-party collaborative dialogue."
    444     },
    445     {
    446       "title": "Honesty is the best policy: defining and mitigating AI deception",
    447       "relevance": "Source of PSO-INTENT baseline and the 'intentionality' concept that ICR operationalizes via counterfactual KL regularization."
    448     },
    449     {
    450       "title": "DeliData: A dataset for deliberation in multi-party problem solving",
    451       "relevance": "Primary evaluation dataset providing the Wason Card Selection collaborative task used in main experiments."
    452     },
    453     {
    454       "title": "When text and speech are not enough: A multimodal dataset of collaboration in a situated task",
    455       "relevance": "Source of the Weights Task, the second evaluation domain for ICR experiments."
    456     },
    457     {
    458       "title": "Path-specific objectives for safer agent incentives",
    459       "relevance": "Provides the counterfactual influence pathway concept that ICR's counterfactual regularization is derived from."
    460     },
    461     {
    462       "title": "Proximal Policy Optimization Algorithms",
    463       "relevance": "The base RL algorithm used for ICR training and as a direct baseline; also relevant to the broader agentic LLM training literature."
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 2,
    469       "justification": "Applicable to AI tutoring and collaborative work settings, though deployment requires RL training infrastructure and GPT-4o API access for expert trajectory generation."
    470     },
    471     "surprise_contrarian": {
    472       "score": 2,
    473       "justification": "The formal proof that RLHF/DPO agents are structurally suboptimal for multi-party collaboration challenges the dominant preference alignment paradigm with a principled theoretical argument."
    474     },
    475     "fear_safety": {
    476       "score": 1,
    477       "justification": "Raises concerns about partner-aware LLMs being misused for covert manipulation, with explicit discussion linking to sleeper agents and alignment faking in limitations."
    478     },
    479     "drama_conflict": {
    480       "score": 1,
    481       "justification": "The claim that 8B Llama trained with ICR approaches GPT-4o performance on these tasks creates modest controversy, though the comparison is carefully caveated."
    482     },
    483     "demo_ability": {
    484       "score": 1,
    485       "justification": "Code is available on GitHub but requires RL training infrastructure, GPU access, and GPT-4o API calls; not easily demonstrable without significant setup."
    486     },
    487     "brand_recognition": {
    488       "score": 1,
    489       "justification": "Colorado State University SIGNAL Lab is a legitimate research group but lacks the brand recognition of major AI labs; NeurIPS 2025 acceptance provides venue credibility."
    490     }
    491   },
    492   "hn_data": {
    493     "threads": [],
    494     "top_points": 0,
    495     "total_points": 0,
    496     "total_comments": 0
    497   }
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs