scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20029B)
      1 {
      2   "paper": {
      3     "title": "Curriculum Guided Massive Multi Agent System Solving for Robust Long Horizon Tasks",
      4     "authors": ["Indrajit Kar", "Kalathur Chenchu Kishore Kumar"],
      5     "year": 2025,
      6     "venue": "Wipro Innovation Networks (preprint)"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No dataset or benchmark data is released. The Tower of Hanoi benchmark is custom-built but not shared."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions Mistral and DeepSeek but gives no version details or software dependencies."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Figures 1.1, 1.2, 1.7, and 1.8 show 95% confidence intervals on run-rate and cumulative regret curves."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims Thompson Sampling outperforms UCB and ε-Greedy but provides no statistical significance tests to support these comparative claims."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No effect sizes (Cohen's d, percentage improvement with baseline context, etc.) are reported. Results are shown only in figures without quantified differences."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification is given for the number of experimental runs, the choice of 2000 ticks, or the 64×64 grid size. No power analysis is discussed."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The 95% confidence interval bands in the run-rate and regret figures implicitly convey variance across runs."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares Thompson Sampling against UCB and ε-Greedy (Figures 1.1, 1.2), and compares NLL+Curriculum vs. Curriculum-only vs. Base RL (Figures 1.7, 1.8). Figure 1.9 compares against traditional ML, deep learning, and RL-only agents."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The baselines are all bandit algorithm variants (UCB, ε-Greedy) and generic ML/DL/RL agents. There is no comparison against contemporary multi-agent LLM frameworks such as MAKER [19], which is discussed in the related work, or against other recent systems."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares NLL+Curriculum learning vs. Curriculum-only (Figure 1.7) vs. Base RL without NLL or curriculum (Figure 1.8), effectively ablating the NLL and curriculum components."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports run-rate (stage progression speed), cumulative regret, NLL, competence scores, and oracle usage as evaluation metrics."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Human evaluation is not relevant for this simulated multi-agent system benchmark evaluation."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The system is evaluated on a simulated Tower of Hanoi task, not on a train/test split dataset. The concept of held-out test sets does not structurally apply."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by curriculum stage (Stages 1-4), by spatial region (arm selections), and by algorithm type. Table 1 shows per-stage details."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 1.9 shows failure cases for traditional ML, deep learning, and base RL agents. The paper also discusses posterior lock-in as a limitation of Thompson Sampling (Section 5)."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper reports that traditional ML and DL pixel agents fail entirely (Figure 1.9a, 1.9b), and that base RL without curriculum shows limited success (Figure 1.9c). Posterior lock-in of suboptimal arms under Thompson Sampling is also noted."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims improved stability, reduced oracle usage, and stronger long-range reasoning. The figures show confidence intervals for stability, oracle escalation mechanisms are demonstrated, and curriculum progression results support the long-range reasoning claim."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper makes causal claims (e.g., 'NLL-based calibration prevents premature advancement', 'Thompson Sampling achieves the lowest cost because it reaches competence earliest') but the experimental design does not control for all variables simultaneously. The ablation covers some components but the simulation itself conflates multiple design choices."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract and discussion claim applicability to 'robotic manipulation and planning tasks' and 'automotive factory AI workflows' (Section 6), but the evaluation is solely on a simulated Tower of Hanoi benchmark. The title says 'Robust Long Horizon Tasks' but only one task is tested."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No alternative explanations for the results are discussed. The paper does not consider whether the improvements stem from the curriculum structure, the specific bandit algorithm, the NLL metric, or the simulation parameters."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper mentions 'Mistral' for the SLM and 'DeepSeek' for the Oracle but provides no version numbers, model sizes, or snapshot dates for either."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper describes its prompts informally and gives example fragments (e.g., 'pixel:10,50,cat:2') but does not provide the full prompt text sent to either Mistral or DeepSeek."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Key hyperparameters like η (competence learning rate), α (pity bonus coefficient), γ (SLM confidence weight), θ (verifier threshold), τ_green (green-pixel threshold), and the bandit weight coefficients w_c and w_n are defined symbolically but their actual numerical values used in experiments are not stated."
    142       },
    143       "scaffolding_described": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The agentic scaffolding is described in detail across Sections 4.2-4.10: the PixelGrid substrate, Verifier module, Oracle escalation via Router, Curriculum Manager, and Thompson Sampling meta-controller are all specified with mathematical formulations."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper does not describe how the Tower of Hanoi moves are generated, how the spiral mapping was validated, or what preprocessing occurs between raw simulation output and the reported figures."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "There is no dedicated limitations or threats-to-validity section in the paper."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No specific threats to validity are discussed. The brief mention of posterior lock-in (Section 5) concerns a property of Thompson Sampling, not a limitation of the study design."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. It generalizes freely from Tower of Hanoi to factory automation without bounding the scope of its claims."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw simulation data, logs, or intermediate results are made available."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 4.1 describes the experimental setup: 64×64 grid, 4096 agents, 20 ticks/second, Tower of Hanoi benchmark with spiral mapping. The simulation procedure is described in detail."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants; the study uses a simulated multi-agent system."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not document how simulation outputs are aggregated into the reported figures, how many runs were performed, or how confidence intervals were computed."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding source is disclosed. The authors are affiliated with Wipro Innovation Networks but no funding acknowledgment is provided."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Both authors list their affiliation as Wipro Innovation Networks on the first page."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so independence cannot be assessed. Wipro as a technology company may have commercial interest in multi-agent AI systems."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial disclosure statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "The paper uses Mistral and DeepSeek models but does not state their training data cutoff dates."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "The paper does not discuss whether the Tower of Hanoi task structure or solutions could be in the training data of Mistral or DeepSeek."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Tower of Hanoi is a well-known classical problem. The paper does not discuss whether LLMs may have seen Tower of Hanoi solutions during training, which would undermine claims about reasoning capability."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "The paper discusses reducing oracle usage conceptually but does not report actual inference costs, API costs, tokens consumed, or wall-clock time."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No total computational budget, GPU hours, API spend, or hardware specifications are reported."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "Thompson Sampling achieves the fastest run-rate and lowest cumulative regret compared to UCB and ε-Greedy for curriculum management.",
    285       "evidence": "Figures 1.1 and 1.2 show run-rate trajectories and cumulative regret with 95% confidence intervals across the three algorithms (Section 5).",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "NLL-augmented curriculum learning outperforms curriculum-only and base RL approaches.",
    290       "evidence": "Figures 1.1, 1.7, and 1.8 compare NLL+Curriculum, Curriculum-only, and Base RL run-rate trajectories. Figure 1.9 shows qualitative success/failure comparisons.",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "The framework can be transferred to robotic manufacturing systems and automotive factory AI workflows.",
    295       "evidence": "Section 6 (Discussion) draws an analogy between Tower of Hanoi dependency structure and factory workflows. No empirical evidence for this transfer is provided.",
    296       "supported": "unsupported"
    297     },
    298     {
    299       "claim": "Selective oracle escalation reduces unnecessary oracle usage while maintaining correctness.",
    300       "evidence": "The Verifier module (Section 4.6) and Oracle escalation (Section 4.7) describe the mechanism, but no quantitative measurement of oracle usage reduction is reported.",
    301       "supported": "weak"
    302     }
    303   ],
    304   "methodology_tags": ["benchmark-eval"],
    305   "key_findings": "The paper proposes a hierarchical multi-agent architecture using a 64×64 grid of lightweight agents (Mistral SLM) with selective escalation to a DeepSeek oracle, guided by a spatial curriculum and a Thompson Sampling curriculum manager. NLL-based calibration is integrated into the reward signal to ensure agents advance only when both accurate and confident. On a spatially grounded Tower of Hanoi benchmark, Thompson Sampling outperforms UCB and ε-Greedy in run-rate and cumulative regret, and the full NLL+curriculum system outperforms curriculum-only and base RL variants.",
    306   "red_flags": [
    307     {
    308       "flag": "No reproducibility artifacts",
    309       "detail": "No code, data, environment specs, or reproduction instructions are provided. Key hyperparameters are defined symbolically but never given numerical values."
    310     },
    311     {
    312       "flag": "Unbounded generalization claims",
    313       "detail": "The paper claims applicability to robotic manufacturing and automotive factory workflows (Section 6) based solely on a simulated Tower of Hanoi benchmark. The title claims 'Robust Long Horizon Tasks' but only one task is evaluated."
    314     },
    315     {
    316       "flag": "Contamination risk unaddressed",
    317       "detail": "Tower of Hanoi is a classical problem whose solutions are widely available online. The paper does not discuss whether Mistral or DeepSeek have seen Tower of Hanoi solutions during training, which could confound claims about reasoning capability."
    318     },
    319     {
    320       "flag": "No limitations section",
    321       "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries."
    322     },
    323     {
    324       "flag": "Unspecified model versions and hyperparameters",
    325       "detail": "Mistral and DeepSeek are referenced without version numbers. Key hyperparameters (η, α, γ, θ, τ_green, and the bandit weights w_c and w_n) are defined symbolically, but the actual values used in experiments are not reported."
    326     }
    327   ],
    328   "cited_papers": [
    329     {
    330       "title": "Solving a Million-Step LLM Task with Zero Errors (MAKER)",
    331       "authors": ["E. Meyerson", "G. Paolo", "R. Dailey", "H. Shahrzad", "O. Francon", "C. F. Hayes", "X. Qiu", "B. Hodjat", "R. Miikkulainen"],
    332       "year": 2025,
    333       "arxiv_id": "2511.09030",
    334       "relevance": "Directly relevant as a competing long-horizon LLM task-solving framework."
    335     },
    336     {
    337       "title": "Measuring AI Ability to Complete Long Tasks",
    338       "authors": ["T. Kwa", "B. West", "J. Becker"],
    339       "year": 2025,
    340       "relevance": "Benchmark for evaluating AI agents on long-horizon tasks, directly relevant to the survey scope."
    341     },
    342     {
    343       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    344       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    345       "year": 2023,
    346       "relevance": "Foundational work questioning LLM capability claims, relevant to methodology quality assessment."
    347     },
    348     {
    349       "title": "Faith and Fate: Limits of Transformers on Compositionality",
    350       "authors": ["N. Dziri", "X. Lu", "M. Sclar"],
    351       "year": 2023,
    352       "relevance": "Analyzes fundamental limitations of transformers on compositional reasoning tasks like Tower of Hanoi."
    353     },
    354     {
    355       "title": "The Illusion of Thinking: Understanding the Strengths and Limitations of Reasoning Models via the Lens of Problem Complexity",
    356       "authors": ["P. Shojaee", "I. Mirzadeh", "K. Alizadeh"],
    357       "year": 2025,
    358       "arxiv_id": "2506.06941",
    359       "relevance": "Evaluates reasoning model capabilities and limitations, relevant to LLM capability assessment."
    360     },
    361     {
    362       "title": "OPTIMA: Optimizing Effectiveness and Efficiency for LLM-Based Multi-Agent System",
    363       "authors": ["W. Chen", "J. Yuan", "C. Qian"],
    364       "year": 2025,
    365       "relevance": "Multi-agent LLM system optimization, directly in survey scope."
    366     },
    367     {
    368       "title": "Plan-and-Act: Improving Planning of Agents for Long-Horizon Tasks",
    369       "authors": ["L. E. Erdogan", "H. Furuta", "S. Kim"],
    370       "year": 2025,
    371       "relevance": "Long-horizon agent planning at ICML, directly relevant to agentic AI capabilities."
    372     },
    373     {
    374       "title": "MAS Failure Taxonomy: Failure Modes in LLM-Based Multi-Agent Systems",
    375       "authors": ["X. Zhang"],
    376       "year": 2025,
    377       "relevance": "Taxonomy of multi-agent system failures, relevant to AI safety and reliability."
    378     },
    379     {
    380       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    381       "authors": ["X. Wang", "J. Wei", "D. Schuurmans", "Q. Le", "E. Chi"],
    382       "year": 2022,
    383       "arxiv_id": "2203.11171",
    384       "relevance": "Foundational LLM reasoning technique relevant to the survey's coverage of reasoning methods."
    385     }
    386   ]
    387 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs