scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22963B)
      1 {
      2   "paper": {
      3     "title": "Cognitive Models and AI Algorithms Provide Templates for Designing Language Agents",
      4     "authors": ["Ryan Liu", "Dilip Arumugam", "Cedegao E. Zhang", "Sean Escola", "Xaq Pitkow", "Thomas L. Griffiths"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.22523"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code or repository URL is provided anywhere in the paper. Although the paper is theoretical/position, it formalizes agent templates and surveys implementations — analysis scripts or template specifications could have been released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or structured data (e.g., a catalog of agent templates mapped to cognitive models) is released. The survey content exists only in prose form."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a position/survey paper that does not run any experiments, so environment specifications are structurally inapplicable."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No experiments are conducted, so reproduction instructions are structurally inapplicable."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "This is a position paper that does not report any new experimental results; no statistical analysis is performed."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No new experiments are conducted, so significance tests are not applicable."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No new experiments are conducted; effect sizes are not applicable."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No new experiments are conducted; sample size justification is not applicable."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No new experiments are conducted; variance reporting is not applicable."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a position paper, not an empirical evaluation. It does not compare against prior surveys or provide a systematic evaluation of templates."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No empirical evaluation is conducted; baseline recency is not applicable."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system or method is evaluated, so ablation is structurally inapplicable."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No evaluation is conducted, so multiple metrics are not applicable."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation is conducted; human evaluation is not applicable to a position paper making no empirical claims about system outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experiments are conducted; held-out test sets are not applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper organizes its survey by category: communication (Section 4.1), reasoning and planning (Section 4.2), representation (Section 4.3), search (Section 5.1.1), divide and conquer (Section 5.1.2), policy iteration (Section 5.2.1), posterior sampling (Section 5.2.2), and information-directed sampling (Section 5.2.3)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.1 discusses alternative views and limitations of the agent template approach, including the No Free Lunch Theorem argument, the paradigm-shift argument, and the automated-discovery argument. Section 5.1.2 notes that divide-and-conquer approaches may be less necessary given modern reasoning models."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not report any negative results from its own work. It references results from cited works but does not present cases where the proposed approach (cognitive/AI-inspired templates) failed or was inferior."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that cognitive models and AI algorithms provide templates for language agents, and the paper supports this through formal definitions (Section 3) and multiple detailed examples from communication (Section 4.1), reasoning (Section 4.2), representation (Section 4.3), search (Section 5.1.1), divide-and-conquer (Section 5.1.2), and RL algorithms (Section 5.2)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper is a position paper that does not make causal claims. It argues for the value of an approach and surveys existing work rather than claiming that using templates causes improved performance."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6.1 explicitly addresses generalization limits: the No Free Lunch Theorem means no single template works for all tasks, and, on the possibility that LLMs represent a paradigm shift requiring novel templates, the paper acknowledges that 'if such a shift does exist, it has yet to make itself apparent in the agents of today.' The paper frames its claims as about the utility of templates as a design approach, not universal superiority."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6.1 ('Alternative Views') discusses three specific alternatives: (1) finding a single best template instead of a general framework, (2) the possibility that LLMs represent a paradigm shift requiring novel templates, and (3) automatically discovering templates via LLMs rather than drawing from cognitive science/AI. Each is substantively addressed."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "The paper does not run any experiments with LLMs; it surveys existing work. Model version specification is not applicable."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use prompting in any experiments; it is a position/survey paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are conducted; hyperparameter reporting is not applicable."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is implemented or evaluated; the paper describes templates at a conceptual level."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper surveys existing work but does not describe how papers were selected for inclusion. There is no systematic search methodology, inclusion/exclusion criteria, or filtering pipeline documented. Papers appear to be selected based on the authors' knowledge of the field."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6.1 ('Alternative Views') functions as a limitations section, discussing three substantive reservations about the proposed approach. While not titled 'Limitations,' it provides substantive discussion of weaknesses."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.1 discusses specific threats: the No Free Lunch Theorem implying no universal template, the possibility that LLMs require fundamentally novel architectures, and the potential for automated template discovery to supersede human-designed templates. These are specific to this work's claims."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what it does NOT cover. It does not bound which cognitive models or AI algorithms are out of scope, nor does it explain why certain relevant areas (e.g., attention mechanisms, memory consolidation models) are excluded. Section 6.2 mentions some future directions (hypothesis generation, evolutionary algorithms, game theory) but frames these as calls to action rather than explicit scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "This is a position/survey paper with no collected data. Raw data availability is structurally inapplicable."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper surveys existing work but does not describe how the surveyed papers were identified or collected. No search methodology, databases queried, or time period is stated."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved and no data source is a standard benchmark. This is a position paper surveying the literature."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No pipeline from literature collection to analysis is documented. The paper does not describe how papers were found, filtered, or categorized for inclusion in the survey."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgements section states: 'This work was supported by funds provided by the National Science Foundation and by DoD OUSD (R & E) under Cooperative Agreement PHY-2229929 (the NSF AI Institute for Artificial and Natural Intelligence) and by ONR MURI N00014-24-1-2748.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Princeton University (Departments of Computer Science and Psychology), MIT (Brain and Cognitive Sciences), Columbia University (Zuckerman Mind Brain Behavior Institute and Psychiatry), and Carnegie Mellon University (Neuroscience Institute and Machine Learning)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding comes from NSF and DoD/ONR, which are government agencies with no direct financial stake in whether cognitive models or AI algorithms are good templates for language agents."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a position/survey paper."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model is evaluated on any benchmark; contamination concerns are structurally inapplicable."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is conducted; contamination is structurally inapplicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this position paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a position/survey paper; no method with inference cost is proposed or evaluated."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a position/survey paper; no computation is performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Cognitive models and AI algorithms provide effective templates for designing language agents, with agent templates formalized as directed acyclic graphs (DAGs) over LLMs and tools.",
    286       "evidence": "Section 3 provides a formal definition of agent templates as DAGs G = (V, E) where V ⊆ F are LLMs or tools and E are connections. Sections 4 and 5 give concrete examples of templates derived from cognitive models (RSA, reasoning/planning, Language of Thought) and AI algorithms (search, divide-and-conquer, policy iteration, posterior sampling, IDS).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "RSA-inspired language agents outperform baselines and ablations for communication tasks according to human judgments.",
    291       "evidence": "Section 4.1 cites Liu et al. (2023) as demonstrating that an agent using episodic future thinking to simulate audience reactions 'outperforms baselines and ablations according to human judgments.' This is a claim from cited work, not new evidence.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The Modular Agentic Planner (MAP), inspired by prefrontal cortex models, achieves strong results on Tower of Hanoi and PlanBench.",
    296       "evidence": "Section 4.2 cites Webb et al. (2025) for MAP's results. No new evaluation is presented in this paper.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Tree of Thoughts and related search-based agent templates improve LLM reasoning by externalizing the heuristic search process.",
    301       "evidence": "Section 5.1.1 describes Tree of Thoughts (Yao et al., 2023a) and beam-search variants (Xie et al., 2023) as instantiations of search algorithms. Evidence comes from cited works.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "LLM-based PSRL agents retain the efficient exploration properties of the original PSRL algorithm, as confirmed by cumulative regret curves.",
    306       "evidence": "Section 5.2.2 cites Arumugam & Griffiths (2025) for 'cumulative regret curves confirming that this agent template can succeed' in natural-language tasks like Wordle. Evidence is from cited work.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "IDS-based language agents approximate the Bayes-optimal policy more closely than PSRL-based agents in a numerical Wordle variant.",
    311       "evidence": "Section 5.2.3 cites Arumugam & Griffiths (2025) for this finding. The authors note 'the empirical study conducted... for IDS was preliminary and limited to a single domain.'",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["theoretical"],
    316   "key_findings": "This position paper argues that cognitive models (Rational Speech Acts, reasoning/planning models, Language of Thought) and AI algorithms (search algorithms, divide-and-conquer, policy iteration, posterior sampling, information-directed sampling) provide principled templates for designing modular language agents. The authors formalize agent templates as directed acyclic graphs (DAGs) over LLMs and tools and survey existing language agents that instantiate these templates. The paper calls for researchers to explore additional cognitive and algorithmic templates, including game-theoretic models and evolutionary algorithms.",
    317   "red_flags": [
    318     {
    319       "flag": "No systematic survey methodology",
    320       "detail": "Despite surveying existing language agents and their connections to cognitive models, the paper provides no systematic search methodology, inclusion/exclusion criteria, or explanation of how surveyed works were selected. This makes the coverage appear selective rather than comprehensive."
    321     },
    322     {
    323       "flag": "Most cited empirical evidence comes from co-authored work",
    324       "detail": "Most of the concrete empirical claims (PSRL agents, IDS agents, communication agents) cite work where the paper's own authors are co-authors (Arumugam & Griffiths, 2025; Liu et al., 2023); the MAP planning results (Webb et al., 2025) are the main exception, as that work shares no authors with this paper. While not inherently problematic for a position paper, this means the evidence base is largely self-referential."
    325     },
    326     {
    327       "flag": "No quality assessment of surveyed work",
    328       "detail": "The paper reports results from cited works at face value without assessing the methodological quality of those studies. For a survey-style paper arguing for an approach, some critical evaluation of the strength of evidence would be expected."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    334       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao", "Izhak Shafran", "Thomas Griffiths", "Yuan Cao", "Karthik Narasimhan"],
    335       "year": 2023,
    336       "relevance": "Key example of search-algorithm-inspired agent design; uses BFS/DFS with LLM-based node expansion and evaluation."
    337     },
    338     {
    339       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    340       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    341       "year": 2023,
    342       "relevance": "Foundational language agent design combining reasoning and acting, relevant to agentic workflow evaluation."
    343     },
    344     {
    345       "title": "Cognitive architectures for language agents",
    346       "authors": ["Theodore Sumers", "Shunyu Yao", "Karthik Narasimhan", "Thomas Griffiths"],
    347       "year": 2023,
    348       "relevance": "CoALA framework connecting language agent research with cognitive science, directly related to evaluating agent design methodology."
    349     },
    350     {
    351       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    352       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    353       "year": 2024,
    354       "relevance": "Major language agent for automated software engineering; relevant to evaluating agentic coding tool methodology."
    355     },
    356     {
    357       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    358       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"],
    359       "year": 2024,
    360       "relevance": "Key agent architecture using verbal self-reflection for iterative improvement; relevant to agent design evaluation."
    361     },
    362     {
    363       "title": "Autogen: Enabling next-gen llm applications via multi-agent conversations",
    364       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Beibin Li", "Erkang Zhu", "Li Jiang", "Xiaoyun Zhang", "Shaokun Zhang", "Jiale Liu"],
    365       "year": 2024,
    366       "relevance": "Multi-agent conversation framework; relevant to evaluating multi-agent system design."
    367     },
    368     {
    369       "title": "Openhands: An open platform for ai software developers as generalist agents",
    370       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song", "Frank F. Xu", "Xiangru Tang", "Mingchen Zhuge"],
    371       "year": 2024,
    372       "relevance": "Open platform for AI coding agents; directly relevant to evaluating agentic software engineering tools."
    373     },
    374     {
    375       "title": "A brain-inspired agentic architecture to improve planning with llms",
    376       "authors": ["Taylor Webb", "Shanka Subhra Mondal", "Ida Momennejad"],
    377       "year": 2025,
    378       "relevance": "MAP architecture inspired by prefrontal cortex models for LLM planning; key example of cognitive-model-derived agent template."
    379     },
    380     {
    381       "title": "Large Language Models can Implement Policy Iteration",
    382       "authors": ["Ethan Brooks", "Logan Walls", "Richard L. Lewis", "Satinder Singh"],
    383       "year": 2023,
    384       "relevance": "Early work on LLM-based RL agent templates (ICPI); directly relevant to evaluating AI-algorithm-derived agent designs."
    385     },
    386     {
    387       "title": "Toward Efficient Exploration by Large Language Model Agents",
    388       "authors": ["Dilip Arumugam", "Thomas L. Griffiths"],
    389       "year": 2025,
    390       "arxiv_id": "2504.20997",
    391       "relevance": "Introduces PSRL and IDS agent templates for LLM exploration; key empirical backing for the position paper's claims."
    392     },
    393     {
    394       "title": "Why do multi-agent LLM systems fail?",
    395       "authors": ["Mert Cemri", "Michael Z. Pan", "Shuyan Yang", "Lakshya A. Agrawal"],
    396       "year": 2025,
    397       "relevance": "Studies failure modes of multi-agent LLM systems; directly relevant to evaluating agent system reliability."
    398     },
    399     {
    400       "title": "The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery",
    401       "authors": ["Cong Lu", "Chris Lu", "Robert T. Lange", "Jakob Foerster", "Jeff Clune", "David Ha"],
    402       "year": 2024,
    403       "arxiv_id": "2408.06292",
    404       "relevance": "Automated scientific discovery agent; relevant to evaluating agentic AI capabilities and methodology."
    405     }
    406   ]
    407 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs