scan.json (28118B)
1 { 2 "paper": { 3 "title": "Mixture-of-Models: Unifying Heterogeneous Agents via N-Way Self-Evaluating Deliberation", 4 "authors": ["Tims Pecerskis", "Aivars Smirnovs"], 5 "year": 2026, 6 "venue": "arXiv.org", 7 "arxiv_id": "2601.16863", 8 "doi": "10.5281/zenodo.18234923" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "The NSED protocol enables ensembles of small (<20B) open-weight models to match or exceed 100B+ monolithic models on AIME 2025 (84% consumer, 90% high-perf) and LiveCodeBench Hard (60.2%). An empirical 'Efficiency-Fatigue' model fits with R²≈0.99, predicting optimal stopping rounds. Identity-masked quadratic voting reduces sycophancy by 40% on DarkBench, though topology alone cannot fix all safety dimensions (e.g., Sneaking).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL or code archive is provided in the paper. No GitHub/Zenodo link for the NSED implementation." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmarks used (AIME 2025, LiveCodeBench v5, DarkBench) are publicly available datasets. The paper references their public sources." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix B (Table 9) provides detailed vLLM serving configuration including dtype, KV cache settings, context lengths, tensor parallelism, attention backends, and special optimization flags for each model." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While hyperparameters and serving configs are documented, there are no runnable instructions." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section 5.1 states 'the standard error of the mean varies dynamically with model accuracy, ranging from ≈±4.2% in initial rounds to ≈±2.7% at peak convergence (p ≥ 0.90)' for AIME." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used. Claims like 'match or exceed' are made by comparing point estimates without p-values, t-tests, or bootstrap tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Absolute performance levels and improvements are reported with baselines for context, e.g., NSED Consumer 84% vs Majority Voting 54% on AIME, 60.2% vs 33.1% on LCB Hard (Table 1)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 5.1 acknowledges the small AIME dataset (N=30) and justifies using 4 independent runs (N_total=120 trials) with different random seeds to address this." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Despite running 4 independent trials, the paper reports only 'aggregated mean performance' without standard deviations, confidence intervals per round, or inter-run variance. The SE ranges mentioned are derived analytically, not empirically from run variance." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 1 compares against Majority Voting (Qwen-8B), Gemini-2.5-Pro, and DeepSeek-R1. Ablation baselines also included." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include DeepSeek-R1 and Gemini-2.5-Pro-06-05, which are contemporary state-of-the-art models as of early 2026." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.6 provides three ablations: topological displacement (removing QV and identity masking), presence penalty variation (α=1.0 vs 1.5), and homogeneous vs heterogeneous ensembles." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper evaluates on three distinct benchmarks covering math (AIME Pass@1), code (LCB Pass@1), and safety (DarkBench RMS across 6 subcategories)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. All evaluation is automated (exact-match for AIME, pass/fail for LCB, automated DarkBench scoring)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "AIME 2025 and LiveCodeBench v5 are external, standardized benchmarks not used for any tuning. The paper explicitly states the Broker used fixed profiles rather than runtime optimization." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "DarkBench results are broken down across 6 subcategories (Table 2). Per-round trajectories are shown for all benchmarks. Agent-level influence matrices and win rates are provided." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.3 discusses the 'Refactoring Risk' failure mode in code generation. Section 5.4 discusses the 'Median Voter' limit where Sneaking scores did not improve. Section 5.7 analyzes sycophancy vs noise failure modes." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results: Sneaking metric worsened (0.741 vs 0.136 baseline), late-round degradation occurs past optimal stop, α=1.0 ablation showed collapse. The 'Over-Refactoring' phenomenon in code generation is a reported failure." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 'match or exceed' 100B+ models are supported by Table 1 (84% vs DeepSeek-R1 84.2% for consumer, 90% for high-perf). R²≈0.99 claim is supported by Figure 4a. Claims are generally well-hedged." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about components (e.g., identity masking reduces sycophancy, presence penalty prevents attractor collapse) and backs them with ablation studies (Sections 5.6.1-5.6.3) that isolate single variables." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims 'Unifying Heterogeneous Agents' generally, but results are on only 3 benchmarks with 2 fixed ensembles. The abstract and conclusion make broad claims about 'decentralized AGI' and 'Hardware Arbitrage' that far exceed what was tested. The paper extrapolates to 10-20x cost savings with prior-gen hardware without evidence." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not consider alternative explanations for the performance gains. Could the improvement be due simply to increased token budget (more total compute via multiple agents)? The homogeneous ablation partially addresses this but doesn't fully control for total tokens generated." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses Pass@1 on AIME and LCB as proxies for 'reasoning capability' and frames results as evidence that topology can substitute for scale. No discussion of what these benchmarks actually measure vs. the broader claims about 'cognitive architectures' and 'AGI'." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as 'GPT-OSS-20B', 'Qwen3-8B', 'Gemma-12B-it', 'GPT-OSS-120B', 'Qwen3-80B-Next-A3B' without specific version hashes, snapshot dates, or HuggingFace model IDs. 'GPT-OSS' is not a standard model name — no version or source clarified." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4.4 provides the actual persona prompts used for each agent role (Balanced, Creative, Analytical). Section 4.6 describes the tool-calling API with specific function signatures." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix A (Table 8) reports temperature, presence penalty, and max tokens per agent. Section 4.7 documents the presence penalty selection rationale. Appendix B provides vLLM serving parameters." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The agentic scaffolding is described in extensive detail: the NSED protocol, orchestrator, broker, tool-calling API (Section 4.6), sliding history window for γ (Section 4.5), dual-mode parsing (Section 4.6), and Algorithm 1." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 5.1 describes the bootstrapping approach (4 runs × 30 = 120 trials), the evaluation harness for each benchmark, and the text-only regime with no compiler access." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated Limitations section. Section 5.7 discusses 'Thermodynamic Limits & Failure Modes' which covers some limitations within the results discussion, but this is about model behavior limits, not study limitations." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address threats like the small number of benchmarks, fixed team compositions, potential confounds from the specific models chosen, or the generalizability of the thermodynamic model." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper makes sweeping claims about AGI, cognitive architectures, and hardware arbitrage without stating what the results do NOT show. Section 5.1 notes 'Text-Only regime' exclusion of tool-use, but the conclusion extends claims far beyond tested settings." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (individual trial results, per-problem scores, voting matrices, agent traces) is released. Only aggregated results are presented." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.1 describes the evaluation procedure: 4 independent runs with different random seeds, N=120 total trials for AIME, fixed ensemble configurations." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The study uses benchmark datasets and model ensembles." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw model outputs to final scores is not fully documented. How were individual round scores aggregated? How were the 4 AIME runs combined? The consensus selection strategies are described but which was used for final reported numbers is unclear." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosure or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Authors are listed as affiliated with 'Peeramid Labs' with email addresses. 'The AI Futures Collective' is also listed." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "The authors are from Peeramid Labs, which appears to be the entity developing this technology. No funding independence is established. The paper's conclusions about 'decentralized AGI' could directly benefit Peeramid Labs commercially." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present. The DOI points to Zenodo which may indicate a product/project. No patent or equity disclosures." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the models used (GPT-OSS-20B, Qwen3-8B, Gemma-12B, etc.)." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether AIME 2025 problems or LiveCodeBench v5 tasks could have appeared in the training data of the models used." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No contamination analysis is performed. AIME 2025 problems may have been available online before model training, but this is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 7 provides detailed latency breakdown per round (generation phase, eval phase, cumulative time). Section 6 provides hardware cost analysis (Table 6). Table 1 includes qualitative cost estimates." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Section 6 provides hardware specifications (RTX 5090 cluster vs H100). Table 7 gives wall-clock times. Appendix B documents the serving infrastructure. Total time per problem is ~5 minutes." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Section 5.1 states '4 independent runs of the full benchmark for each configuration (N_total = 120 trials), utilizing different random seeds (T > 0) to capture the variance in probabilistic generation.'" 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 5.1 explicitly states 4 independent runs for AIME. 
However, number of runs for LiveCodeBench and DarkBench is not stated." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Section 4.7 mentions 'Preliminary grid searches indicated that standard values (α ≈ 0.0) led to immediate consensus collapse' but does not report how many configurations were tried or the total search budget." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The presence penalty α=1.5 was 'empirically determined' but the selection process is not described. The ensemble compositions appear hand-picked without justification for why these specific models were chosen over alternatives." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many comparisons made (3 benchmarks, multiple rounds, multiple configurations)." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors designed and evaluate their own NSED system. No acknowledgment of self-comparison bias. Baselines like Majority Voting use a single weak model (Qwen-8B) which is an unfair comparison — the NSED ensemble uses multiple larger models." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "NSED uses 7 rounds × 3 agents of generation and evaluation, far exceeding the compute of single-model baselines. This massive compute difference is not controlled for. The Majority Voting baseline uses only Qwen-8B, while NSED uses GPT-OSS-20B + Qwen-8B + Gemma-12B." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether AIME math problems and LiveCodeBench code tasks are valid proxies for the paper's claims about 'cognitive architectures', 'Hardware Arbitrage', and 'decentralized AGI'." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper compares NSED (an elaborate scaffold) against zero-shot/majority-voting baselines. The performance difference could be largely due to the scaffolding, not the specific NSED innovations. This confound is not addressed." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether AIME 2025 problems were available online before the models' training cutoffs. LiveCodeBench is designed to mitigate this but no explicit discussion." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The iterative deliberation protocol provides agents with peer solutions, which could constitute a form of information leakage not present in baseline evaluations." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between train and test data for any of the models used." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods applied. No canary strings, membership inference, or decontamination." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Ensembles of small (<20B) consumer-grade models can match or exceed 100B+ parameter models on AIME 2025", 365 "evidence": "Table 1: NSED Consumer achieves 84.0% Pass@1 vs DeepSeek-R1 84.2%. NSED High-Perf achieves 90.0%. 
Based on 4 runs × 30 problems = 120 trials (Section 5.1).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "The Efficiency-Fatigue model fits empirical data with R² ≈ 0.99", 370 "evidence": "Figure 4a shows thermodynamic model overlay with R² = 0.99 for mediocre ensemble. Table 3 reports parameters. However, the model has 4 free parameters (pg, Λ, β, pv) fit to only 7 data points.", 371 "supported": "weak" 372 }, 373 { 374 "claim": "NSED achieves 4x to 8x reduction in hardware CAPEX compared to monolithic models", 375 "evidence": "Table 6 compares RTX 5090 cluster ($6-7.5K) vs H100 setups ($30-60K). However, this is a theoretical cost comparison, not an empirical measurement of equivalent performance at matched cost.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Identity-masked topology reduces sycophancy by 40% over best single agent", 380 "evidence": "Table 2: NSED-R2 sycophancy score 0.040 vs best single agent (Qwen-8B) 0.073. However, Sneaking worsened dramatically (0.810 vs 0.136 GPT-OSS baseline).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "NSED Consumer ensemble reaches 60.2% Pass@1 on LiveCodeBench v5 Hard, matching SOTA proprietary models", 385 "evidence": "Figure 5 and Table 1. However, only the consumer ensemble result is shown in detail; number of runs for LCB not stated. No statistical comparison against baselines.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Higher-capability ensembles reach entropic saturation earlier than mediocre ones", 390 "evidence": "Table 3: High-Perf Topt=5 vs Mediocre Topt=6. Attributed to 'Sycophancy Barrier' (Table 4 shows uniform ~38% hallucination rate). 
Interesting finding but based on only 2 ensemble configurations.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Overclaiming", 397 "detail": "The paper claims to pave 'the road towards robust, verifiable and decentralized Artificial General Intelligence' based on results from 3 benchmarks with 2 fixed ensembles. The abstract and conclusion make claims about 'cognitive architectures' and 'scale-invariant recursion' that far exceed the empirical evidence." 398 }, 399 { 400 "flag": "Unfair baseline comparisons", 401 "detail": "The Majority Voting baseline uses only Qwen-8B (a single weak model), while NSED Consumer uses GPT-OSS-20B + Qwen-8B + Gemma-12B. This is not a fair comparison of topology — the NSED ensemble has access to a much stronger model. A fairer baseline would be majority voting with the same 3 models." 402 }, 403 { 404 "flag": "Overfitting thermodynamic model", 405 "detail": "The R² ≈ 0.99 claim for the Efficiency-Fatigue model is based on fitting 4 free parameters to 7 data points (rounds 1-7). With this parameter-to-data ratio, a high R² is expected and does not validate the theoretical framework. Even so, the High-Perf fit already drops to R² = 0.88." 406 }, 407 { 408 "flag": "Unspecified model identities", 409 "detail": "'GPT-OSS-20B' and 'GPT-OSS-120B' are not standard, publicly known model names. Without clarification of what these models actually are, the results cannot be reproduced or independently verified." 410 }, 411 { 412 "flag": "No variance reported despite multiple runs", 413 "detail": "The paper conducts 4 independent runs for AIME but only reports mean performance without standard deviation or confidence intervals derived from actual run variation. The ±4.2% and ±2.7% SE figures appear to be analytically derived, not empirical." 
414 }, 415 { 416 "flag": "Compute budget not controlled", 417 "detail": "NSED uses 7 rounds × 3 agents of generation + evaluation, consuming orders of magnitude more tokens than single-model baselines. The paper does not compare against baselines at matched compute (e.g., best-of-N sampling with equivalent token budget)." 418 }, 419 { 420 "flag": "Company evaluating its own product", 421 "detail": "Authors are from Peeramid Labs, which appears to develop the NSED technology. No conflict of interest disclosure. The paper's conclusions about commercial viability ('Hardware Arbitrage') directly benefit the authors' organization." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters", 427 "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 428 "year": 2024, 429 "arxiv_id": "2408.03314", 430 "relevance": "Core reference on inference-time compute scaling, which NSED extends to multi-agent ensembles." 431 }, 432 { 433 "title": "Mixture-of-Agents Enhances Large Language Model Capabilities", 434 "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun", "Ce Zhang", "James Zou"], 435 "year": 2024, 436 "arxiv_id": "2406.04692", 437 "relevance": "Primary baseline architecture that NSED claims to improve upon with recurrent topology." 438 }, 439 { 440 "title": "Chain of Agents: Large Language Models Collaborating on Long-Context Tasks", 441 "authors": ["Yusen Zhang", "Ruoxi Sun"], 442 "year": 2024, 443 "arxiv_id": "2406.02818", 444 "relevance": "Sequential multi-agent architecture that NSED contrasts with for error propagation issues." 445 }, 446 { 447 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 448 "authors": ["Qingyun Wu", "Gagan Bansal"], 449 "year": 2023, 450 "arxiv_id": "2308.08155", 451 "relevance": "Major multi-agent framework that NSED positions against as a DAG-based architecture." 
452 }, 453 { 454 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 455 "authors": ["DeepSeek-AI"], 456 "year": 2025, 457 "arxiv_id": "2501.12948", 458 "relevance": "Key baseline model for comparison; represents monolithic RL-based reasoning approach." 459 }, 460 { 461 "title": "Why Do Multi-Agent LLM Systems Fail?", 462 "authors": ["Mert Cemri", "Melissa Z. Pan"], 463 "year": 2025, 464 "arxiv_id": "2503.13657", 465 "relevance": "Directly relevant to understanding failure modes in multi-agent LLM systems." 466 }, 467 { 468 "title": "DarkBench: Benchmarking Dark Patterns in Large Language Models", 469 "authors": ["Esben Kran"], 470 "year": 2025, 471 "arxiv_id": "2503.10728", 472 "relevance": "Safety benchmark used to evaluate NSED's effect on sycophancy and manipulation." 473 }, 474 { 475 "title": "Towards Understanding Sycophancy in Language Models", 476 "authors": ["Mrinank Sharma", "Meg Tong"], 477 "year": 2023, 478 "arxiv_id": "2310.13548", 479 "relevance": "Foundational work on sycophancy that NSED's identity masking aims to address." 480 }, 481 { 482 "title": "Lost in the Middle: How Language Models Use Long Contexts", 483 "authors": ["Nelson F. Liu", "Kevin Lin"], 484 "year": 2023, 485 "arxiv_id": "2307.03172", 486 "relevance": "Motivates NSED's recurrent state design over append-only context logs." 487 }, 488 { 489 "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling", 490 "authors": ["Bradley Brown", "Jordan Juravsky"], 491 "year": 2024, 492 "arxiv_id": "2407.21787", 493 "relevance": "Alternative approach to inference-time compute scaling via repeated sampling." 494 }, 495 { 496 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 497 "authors": ["Shunyu Yao"], 498 "year": 2023, 499 "arxiv_id": "2210.03629", 500 "relevance": "Foundational agentic framework; NSED assumes ReAct-style capabilities in its agents." 
501 }, 502 { 503 "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models", 504 "authors": ["Maciej Besta"], 505 "year": 2024, 506 "relevance": "Alternative topology for LLM reasoning that NSED compares against conceptually." 507 } 508 ] 509 }