scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24471B)
      1 {
      2   "paper": {
      3     "title": "From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review",
      4     "authors": [
      5       "Mohamed Amine Ferrag",
      6       "Norbert Tihanyi",
      7       "Merouane Debbah"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2504.19678",
     12     "doi": "10.48550/arXiv.2504.19678"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["survey_methodology"],
     16   "methodology_tags": ["meta-analysis"],
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No code repository, analysis scripts, or supplementary materials are mentioned or linked in the paper. A survey can release its search corpus, comparison data, or analysis scripts."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No dataset is released. The benchmark comparison tables (Tables II-IV) and framework tables (Tables V, VII-XI) are in the paper text only, not available as structured downloadable data."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No environment or tooling specifications are provided. The paper is a literature survey with no computational experiments."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No instructions for reproducing the survey's paper selection, taxonomy construction, or analysis process are provided."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "Survey paper that does not conduct experiments; no original statistical results to report uncertainty for."
     45       },
     46       "significance_tests": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "Survey paper with no original experiments or statistical comparisons."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "Survey paper with no original experiments. Numbers reported are from reviewed papers, not the authors' own analyses."
     55       },
     56       "sample_size_justified": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "Survey paper; no experimental sample sizes involved."
     60       },
     61       "variance_reported": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "Survey paper with no original experimental runs."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Table I provides a structured comparison of the authors' survey against 13 prior related surveys, marking coverage of benchmarks, frameworks, applications, protocols, and challenges for each."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The compared surveys are from 2024-2025 (Wang et al. 2024, Jin et al. 2024, Singh et al. 2025, Yehudai et al. 2025, Yan et al. 2025, etc.), which are contemporary with this 2025 paper."
     77       },
     78       "ablation_study": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "Survey paper with no system to ablate."
     82       },
     83       "multiple_metrics": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Survey paper with no experiments requiring evaluation metrics."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Survey paper with no system outputs to evaluate."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "Survey paper with no experiments or test sets."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper provides extensive categorization: Figure 2 classifies benchmarks into 8 categories; Table IV breaks down benchmarks by multimodal capability, task diversity, reasoning level, and agentic AI support; frameworks and applications are organized by domain (healthcare, finance, SE, etc.)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section V.B ('Why Do Multi-Agent LLM Systems Fail?') explicitly discusses 14 failure modes of multi-agent systems grouped into three categories. Section V also discusses reasoning limitations and protocol vulnerabilities."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports negative findings from reviewed benchmarks: LLMs score below 10% on HLE (Section III.D), GPT-4 achieves only 15% on GAIA (Section III.V), BBEH average accuracy is 9.8% (Section III.T), and Claude 3.5 Sonnet achieves only 26.2% pass rate on SWE-Lancer (Section III.O)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims to present a benchmark comparison (Tables II-IV), a taxonomy of ~60 benchmarks (Figure 2), a review of AI agent frameworks (Table V), applications (Tables VII-XI), and agent protocols (Section IV.C, Table XII). All are substantiated in the paper body."
    119       },
    120       "causal_claims_justified": {
    121         "applies": false,
    122         "answer": false,
    123         "justification": "The paper is a descriptive survey that makes no causal claims of its own. Claims about system performance are reported from reviewed papers."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract explicitly bounds scope: 'benchmarks developed between 2019 and 2025' and 'AI-agent frameworks introduced between 2023 and 2025.' The title's claim of 'comprehensive review' is somewhat overbroad, but the specific scope is stated."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "Pure survey/taxonomy with no original empirical results to explain."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "Survey paper with no measurements of its own."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "Survey paper that does not use any models directly."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Survey paper that does not use prompting."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Survey paper with no experiments."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "Survey paper that does not use agentic scaffolding."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "The paper does not describe how the reviewed papers were selected. There is no documentation of search queries, databases searched, screening criteria, or the filtering pipeline. The paper jumps directly to presenting benchmarks and frameworks without explaining how they were identified."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "Section V ('Challenges and Open Problems') discusses challenges in the field of AI agents, not limitations of the survey itself. Section VI (Conclusion) does not include limitations of the survey's methodology. No dedicated section discusses the survey's own shortcomings."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to the validity of the survey are discussed. There is no consideration of selection bias in paper inclusion, potential gaps in coverage, or limitations of the ad-hoc collection methodology."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The abstract states temporal boundaries: 'benchmarks developed between 2019 and 2025' and 'AI-agent frameworks introduced between 2023 and 2025.' The paper structure (Figure 1) delineates what is covered."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No supplementary data, paper lists, or analysis artifacts are made available for independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The paper does not describe how the ~60 benchmarks, frameworks, or applications were identified. No search databases, queries, or collection methodology are documented."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No description of how papers were found or selected for inclusion. The survey appears to be based on ad-hoc collection rather than a systematic search."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No documentation of the pipeline from initial paper discovery to final inclusion. No filtering stages, exclusion criteria, or counts at each stage are provided."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding information or acknowledgments section is present. Authors are affiliated with Technology Innovation Institute (UAE) and Khalifa University, but no funding source is stated."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: Guelma University (Algeria), Technology Innovation Institute (UAE), Eötvös Loránd University (Hungary), and Khalifa University (UAE)."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding is disclosed, so independence cannot be assessed. The authors' affiliations with TII and Khalifa University are noted, but without funding disclosure, funder independence is unverifiable."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interest statement is provided in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Survey paper with no human participants."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "Survey paper with no human participants."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "Survey paper with no human participants."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "Survey paper with no human participants."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "Survey paper with no human participants."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "Survey paper with no human participants."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "Survey paper with no human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper with no method of its own whose cost could be reported."
    288       },
    289       "compute_budget_stated": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Survey paper with no computational experiments."
    293       }
    294     },
    295     "survey_methodology": {
    296       "prisma_or_structured_protocol": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No PRISMA flow diagram, protocol registration, or systematic search strategy is described. The paper does not document search queries, databases used, or reproducible selection criteria. Papers appear to be collected ad-hoc."
    300       },
    301       "quality_assessment_of_sources": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The survey does not assess the methodological quality of its source papers. Table IV classifies benchmarks by features (multimodal, reasoning, agentic) but does not evaluate rigor, validity, or quality. All papers are treated equally regardless of their methodological strength."
    305       },
    306       "publication_bias_discussed": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No discussion of publication bias. The survey does not consider whether its sources are biased toward positive results, whether negative or null benchmark results are underrepresented, or whether the landscape of published frameworks is biased toward novel approaches."
    310       }
    311     }
    312   },
    313   "claims": [
    314     {
    315       "claim": "The landscape of LLM and AI agent evaluation remains fragmented and lacks a unified taxonomy or comprehensive survey.",
    316       "evidence": "Table I compares 13 prior surveys showing each covers only a subset of themes (benchmarks, frameworks, applications, protocols, challenges). Section II.E argues none integrate the full spectrum.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "This survey is the first to systematically combine benchmarks, framework design, application domains, communication protocols, and a forward-looking discussion in a single unified treatment.",
    321       "evidence": "Table I uses a checkmark matrix to show prior surveys' partial coverage vs. the authors' full coverage across all five themes.",
    322       "supported": "weak"
    323     },
    324     {
    325       "claim": "State-of-the-art LLMs perform below 10% accuracy on Humanity's Last Exam (HLE).",
    326       "evidence": "Section III.D reports this from Phan et al. [60], citing DeepSeek R1, OpenAI models, Gemini Thinking, and Anthropic Sonnet 3.5.",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "Even advanced models like Claude 3.5 Sonnet achieve only 26.2% pass rate on SWE-Lancer independent tasks.",
    331       "evidence": "Section III.O reports this from the OpenAI SWE-Lancer paper [71].",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "GPT-4 with plugins achieves only 15% on GAIA while humans achieve 92%.",
    336       "evidence": "Section III.V reports this from Mialon et al. [78].",
    337       "supported": "strong"
    338     },
    339     {
    340       "claim": "Multi-agent LLM systems continue to underperform compared to single-agent counterparts due to 14 distinct failure modes.",
    341       "evidence": "Section V.B cites Pan et al. [222] who studied 5 open-source frameworks across 150 tasks with expert human annotators.",
    342       "supported": "moderate"
    343     },
    344     {
    345       "claim": "The MCP protocol faces critical security vulnerabilities due to its decentralized design, including lack of standardized authentication.",
    346       "evidence": "Section V.F discusses vulnerabilities cited from Hou et al. [216], including uneven defenses, no standardized authentication, and deficient logging/debugging.",
    347       "supported": "moderate"
    348     }
    349   ],
    350   "key_findings": "This survey taxonomizes approximately 60 LLM and AI agent benchmarks from 2019-2025 across eight categories, reviews major agent frameworks (LangChain, LlamaIndex, CrewAI, Swarm, Agents SDK), and catalogs applications across healthcare, finance, software engineering, mathematics, chemistry, and multimedia domains. The paper also surveys three agent communication protocols (ACP, MCP, A2A) and identifies key challenges including multi-agent failure modes, reasoning limitations, and protocol security vulnerabilities. A notable gap is that the survey does not employ a systematic review methodology or assess the quality of its source papers.",
    351   "red_flags": [
    352     {
    353       "flag": "No systematic search methodology",
    354       "detail": "The survey does not describe how the ~60 benchmarks, frameworks, or applications were identified. No search databases, queries, inclusion/exclusion criteria, or PRISMA-style flow diagram are provided. Papers appear to be collected ad-hoc, making the claimed 'comprehensive' coverage unverifiable."
    355     },
    356     {
    357       "flag": "No quality assessment of sources",
    358       "detail": "The survey treats all reviewed papers equally regardless of methodological quality. Benchmarks, frameworks, and applications are described without evaluating their rigor, validity, or limitations. This launders the signal-to-noise ratio of its sources — weak papers are given equal standing with strong ones."
    359     },
    360     {
    361       "flag": "Duplicate paragraph",
    362       "detail": "Section II.A contains Jin et al. [48] described twice in nearly identical paragraphs, suggesting incomplete editing. The text 'In a complementary study, Jin et al. [48] investigate...' repeats the preceding paragraph almost verbatim."
    363     },
    364     {
    365       "flag": "Self-citation prominence",
    366       "detail": "Authors prominently feature their own benchmarks — CyberMetric [75] by Tihanyi et al., DIA [74] by Tihanyi et al., and CASTLE [79] by Dubniczky/Tihanyi/Ferrag — in the benchmark tables and discussion. While self-citation is not inherently problematic, three self-authored benchmarks are included among ~60 total (~5%) without explicit disclosure of authorship overlap."
    367     },
    368     {
    369       "flag": "Comprehensiveness claim without evidence",
    370       "detail": "The paper claims to be a 'comprehensive review' and 'the first to systematically combine' five themes, but provides no evidence of systematic coverage. Without a defined search protocol, it is impossible to assess what was missed or whether inclusion was biased toward certain subfields or author networks."
    371     },
    372     {
    373       "flag": "No survey limitations section",
    374       "detail": "Section V discusses challenges in the AI agent field, but the paper never discusses limitations of the survey itself — potential selection bias, coverage gaps, recency bias, or geographic/language biases in the reviewed literature."
    375     }
    376   ],
    377   "cited_papers": [
    378     {
    379       "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real World Freelance Software Engineering?",
    380       "authors": ["S. Miserendino", "M. Wang", "T. Patwardhan", "J. Heidecke"],
    381       "year": 2025,
    382       "arxiv_id": "2502.12115",
    383       "relevance": "Benchmark evaluating LLM agents on real-world freelance software engineering tasks with monetary payouts."
    384     },
    385     {
    386       "title": "Agent-as-a-Judge: Evaluate Agents with Agents",
    387       "authors": ["M. Zhuge", "C. Zhao", "D. Ashley"],
    388       "year": 2024,
    389       "arxiv_id": "2410.10934",
    390       "relevance": "Novel evaluation methodology using agentic systems to evaluate other agents, achieving 90% alignment with human judgments on code generation tasks."
    391     },
    392     {
    393       "title": "MultiAgentBench: Evaluating the Collaboration and Competition of LLM Agents",
    394       "authors": ["K. Zhu", "H. Du", "Z. Hong"],
    395       "year": 2025,
    396       "arxiv_id": "2503.01935",
    397       "relevance": "Benchmark for multi-agent LLM coordination across six domains including collaborative coding and research writing."
    398     },
    399     {
    400       "title": "R2E-Gym: Procedural Environments and Hybrid Verifiers for Scaling Open-Weights SWE Agents",
    401       "authors": ["N. Jain", "J. Singh", "M. Shetty"],
    402       "year": 2025,
    403       "arxiv_id": "2504.07164",
    404       "relevance": "Training environment for SWE agents achieving 51% pass rate on SWE-Bench Verified with synthetic data curation."
    405     },
    406     {
    407       "title": "Training Software Engineering Agents and Verifiers with SWE-Gym",
    408       "authors": ["J. Pan", "X. Wang", "G. Neubig"],
    409       "year": 2024,
    410       "arxiv_id": "2412.21139",
    411       "relevance": "First dedicated training environment for real-world SWE agents with 2,438 Python task instances."
    412     },
    413     {
    414       "title": "GAIA: A Benchmark for General AI Assistants",
    415       "authors": ["G. Mialon", "C. Fourrier", "T. Wolf", "Y. LeCun", "T. Scialom"],
    416       "year": 2023,
    417       "relevance": "Benchmark exposing large gap between human (92%) and SOTA model (15%) performance on general AI assistant tasks."
    418     },
    419     {
    420       "title": "Why Do Multiagent Systems Fail?",
    421       "authors": ["M. Z. Pan", "M. Cemri", "L. A. Agrawal"],
    422       "year": 2025,
    423       "relevance": "Critical analysis of 14 failure modes in multi-agent LLM systems across 150 tasks and 5 frameworks."
    424     },
    425     {
    426       "title": "MLE-bench: Evaluating Machine Learning Agents on Machine Learning Engineering",
    427       "authors": ["J. S. Chan", "N. Chowdhury", "O. Jaffe"],
    428       "year": 2025,
    429       "arxiv_id": "2410.07095",
    430       "relevance": "Benchmark evaluating ML engineering agents on Kaggle-style competitions."
    431     },
    432     {
    433       "title": "MLGym: A New Framework and Benchmark for Advancing AI Research Agents",
    434       "authors": ["D. Nathani", "L. Madaan", "N. Roberts"],
    435       "year": 2025,
    436       "arxiv_id": "2502.14499",
    437       "relevance": "Framework and benchmark for ML research agents that automate ML research workflows."
    438     },
    439     {
    440       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    441       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    442       "year": 2023,
    443       "relevance": "Foundational work on language agent self-improvement through verbal reinforcement learning."
    444     },
    445     {
    446       "title": "CASTLE: Benchmarking Dataset for Static Code Analyzers and LLMs towards CWE Detection",
    447       "authors": ["R. A. Dubniczky", "K. Z. Horvát", "T. Bisztray", "M. A. Ferrag", "N. Tihanyi"],
    448       "year": 2025,
    449       "arxiv_id": "2503.09433",
    450       "relevance": "Benchmark for software vulnerability detection comparing static analyzers, LLMs, and formal verification across 25 CWEs."
    451     },
    452     {
    453       "title": "SWE-PolyBench: A Multi-Language Benchmark for Repository Level Evaluation of Coding Agents",
    454       "authors": ["M. S. Rashid", "C. Bock", "Y. Zhuang"],
    455       "year": 2025,
    456       "relevance": "Multi-language benchmark for evaluating coding agents at repository level across programming languages."
    457     },
    458     {
    459       "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents",
    460       "authors": ["J. Wei", "Z. Sun", "S. Papay"],
    461       "year": 2025,
    462       "relevance": "Benchmark for evaluating browsing agents from OpenAI, testing information retrieval capabilities."
    463     },
    464     {
    465       "title": "Towards an AI Co-Scientist",
    466       "authors": ["J. Gottweis", "W.-H. Weng", "A. Daryin"],
    467       "year": 2025,
    468       "arxiv_id": "2502.18864",
    469       "relevance": "Google's multi-agent system for automated scientific hypothesis generation and refinement using Gemini 2.0."
    470     },
    471     {
    472       "title": "DARS: Dynamic Action Re-Sampling to Enhance Coding Agent Performance by Adaptive Tree Traversal",
    473       "authors": ["V. Aggarwal", "O. Kamal", "A. Japesh"],
    474       "year": 2025,
    475       "arxiv_id": "2503.14269",
    476       "relevance": "Method for scaling inference-time compute in coding agents, achieving 55% pass@k on SWE-Bench Lite."
    477     }
    478   ]
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs