ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (18741B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "From LLM Reasoning to Autonomous AI Agents: A Comprehensive Review",
      6     "authors": [
      7       "M. Ferrag",
      8       "Norbert Tihanyi",
      9       "M. Debbah"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2504.19678",
     14     "doi": "10.48550/arXiv.2504.19678"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract promises a side-by-side benchmark comparison, taxonomy of ~60 benchmarks, framework review, application survey, protocol survey, and future directions — all of which are delivered in the corresponding sections.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "This is a descriptive survey making no original causal claims; all performance results and causal assertions are attributed to cited papers.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper claims to be 'comprehensive' and 'the first to systematically combine' all aspects (Table I), but no systematic selection methodology is documented to validate these broad generalization claims.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "As a descriptive survey without original empirical findings, alternative explanations for results are not applicable to the paper's own contributions.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "The survey does not measure outcomes directly; benchmark results are reproduced from cited papers without making proxy-vs-outcome claims.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Section V ('Challenges and Open Problems') discusses field-level challenges but contains no limitations section for the survey itself — no discussion of coverage gaps, missed papers, or methodological weaknesses.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats to the survey's validity are discussed, including potential selection bias, the risk that cited papers disproportionately come from the authors' own network, or the undocumented inclusion criteria.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Date ranges are stated (benchmarks 2019-2025, frameworks 2023-2025) but not justified, and there is no explicit statement of what application domains or benchmark types were excluded.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment is present anywhere in the paper text.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are disclosed on the first page: Guelma University (Algeria), Technology Innovation Institute (UAE), Eötvös Loránd University (Hungary), and Khalifa University (UAE).",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement is present. All three survey authors (Tihanyi, Ferrag, Debbah) are co-creators of benchmarks reviewed positively in this survey — CyberMetric [75], CASTLE [79], and DIA [74] — a material conflict of interest not disclosed.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Terms like 'autonomous AI agents,' 'agentic AI,' and 'agent' are used throughout without formal definitions and are used interchangeably; the paper never delineates what distinguishes an 'agent' from an 'LLM' in its taxonomy.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The introduction contains an explicit six-bullet list of contributions: benchmark comparison table, taxonomy of ~60 benchmarks, framework review, application domains overview, protocol survey, and future research recommendations.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section II reviews 13 related surveys across six thematic areas, and Table I provides an explicit column-by-column comparison of topic coverage against prior work.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "survey": {
    118       "search_and_selection": {
    119         "search_strategy_reproducible": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No search strategy is described anywhere in the paper; it does not state how papers were identified, which queries were used, or how completeness was assessed.",
    123           "source": "haiku"
    124         },
    125         "inclusion_exclusion_explicit": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "No inclusion or exclusion criteria are stated; it is entirely unclear why certain benchmarks or frameworks appear while others are absent.",
    129           "source": "haiku"
    130         },
    131         "prisma_or_structured_protocol": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No structured review protocol (PRISMA or otherwise) is mentioned or followed.",
    135           "source": "haiku"
    136         },
    137         "search_terms_provided": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No search queries or keywords are provided.",
    141           "source": "haiku"
    142         },
    143         "databases_listed": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "No databases, repositories, or search engines used to identify papers are mentioned.",
    147           "source": "haiku"
    148         },
    149         "screening_process_documented": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No screening process is described; there are no paper counts at any selection stage.",
    153           "source": "haiku"
    154         },
    155         "review_scope_justified": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Date ranges (2019-2025 benchmarks, 2023-2025 frameworks) are asserted but not justified; no explanation is given for why these windows were chosen or why earlier frameworks are excluded.",
    159           "source": "haiku"
    160         }
    161       },
    162       "synthesis_quality": {
    163         "conflicting_findings_acknowledged": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Each reviewed work is presented positively with no acknowledgment of contradictory results between benchmarks or cases where different studies reach different conclusions about agent capabilities.",
    167           "source": "haiku"
    168         },
    169         "quality_assessment_of_sources": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation is applied to reviewed papers; all sources are treated as equally reliable regardless of methodology.",
    173           "source": "haiku"
    174         },
    175         "publication_bias_discussed": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Publication bias is not mentioned; the survey does not acknowledge that reviewed benchmarks and frameworks disproportionately report positive results.",
    179           "source": "haiku"
    180         },
    181         "quantitative_synthesis_present": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "The survey is entirely narrative; comparison tables are provided but no statistical aggregation, meta-analysis, vote counting, or effect size synthesis is performed.",
    185           "source": "haiku"
    186         },
    187         "recommendations_supported_by_evidence": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Section V recommendations are each derived from a single cited paper (e.g., Meta-CoT from Xiang et al., Chain-of-Tools from Wu et al.) rather than from synthesized evidence across multiple studies.",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "This survey is 'the first to systematically combine state-of-the-art benchmarks, framework design, application domains, communication protocols, and challenges' in a single unified treatment.",
    199       "evidence": "Table I comparison with 13 prior surveys; Section II related work. No systematic selection methodology is documented to validate 'first' or 'systematic' status.",
    200       "supported": "weak"
    201     },
    202     {
    203       "claim": "The LLM/agent evaluation landscape 'remains fragmented and lacks a unified taxonomy or comprehensive survey.'",
    204       "evidence": "Table I shows prior surveys each covering only 1-2 dimensions out of the five covered here, supporting the fragmentation claim.",
    205       "supported": "moderate"
    206     },
    207     {
    208       "claim": "Current state-of-the-art LLMs perform below 10% accuracy on the Humanity's Last Exam (HLE) benchmark.",
    209       "evidence": "Directly sourced from Phan et al. HLE paper with specific models named (DeepSeek R1, OpenAI o1/o3, Gemini Thinking, Sonnet 3.5); well-supported.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Multi-agent LLM systems 'continue to underperform compared to single-agent counterparts' due to 14 distinct failure modes.",
    214       "evidence": "Sourced from Pan et al. [222] study across 5 frameworks and 150 tasks with human annotators; properly attributed to a single empirical study.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "Even GPT-4o succeeds on fewer than 50% of τ-bench conversational agent tasks, indicating significant reliability gaps.",
    219       "evidence": "Sourced from Yao et al. τ-bench [81] with novel pass^k metric; specific statistics cited (pass^8 < 25% in retail domains).",
    220       "supported": "strong"
    221     },
    222     {
    223       "claim": "OctoTools outperforms AutoGen, GPT-Functions, and LangChain by up to 10.6% on varied tasks using the same toolset.",
    224       "evidence": "Sourced from Lu et al. [130]; specific benchmark results cited across 16 tasks but no independent replication.",
    225       "supported": "moderate"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "qualitative"
    230   ],
    231   "key_findings": "This narrative survey catalogs approximately 60 LLM and agentic AI evaluation benchmarks from 2019-2025 across eight categories, reviews 8 major agent frameworks (LangChain, LlamaIndex, CrewAI, Swarm, OctoTools, etc.), covers agent applications across 11 domains, and surveys three emerging inter-agent protocols (ACP, MCP, A2A). The paper's central organizing observation is that frontier LLMs still dramatically underperform humans on challenging benchmarks (below 10% on HLE, below 50% on τ-bench), and multi-agent systems exhibit 14 documented failure modes. No original empirical work is presented; all findings are synthesized from cited literature without systematic search methodology or quality assessment of sources.",
    232   "red_flags": [
    233     {
    234       "flag": "No search methodology",
    235       "detail": "The survey documents no search strategy, search terms, databases consulted, or screening process — paper selection appears entirely ad hoc, undermining any claim of comprehensiveness."
    236     },
    237     {
    238       "flag": "Undisclosed author-as-subject conflict",
    239       "detail": "All three survey authors (Tihanyi, Ferrag, Debbah) are co-creators of three benchmarks reviewed and described positively in this survey: CyberMetric [75], CASTLE [79], and DIA [74]. No conflict-of-interest statement is present."
    240     },
    241     {
    242       "flag": "Duplicate paragraph",
    243       "detail": "Section II.A contains a paragraph about Jin et al. [48] that is repeated verbatim within the same section, indicating the manuscript was not carefully reviewed prior to submission."
    244     },
    245     {
    246       "flag": "No survey limitations section",
    247       "detail": "There is no discussion of the survey's own methodological limitations, coverage gaps, selection bias, or risk that reviewed papers were chosen based on author familiarity rather than systematic criteria."
    248     },
    249     {
    250       "flag": "Comprehensiveness claim unverifiable",
    251       "detail": "The paper repeatedly claims to be 'comprehensive' and 'the first unified survey,' but without documented search methodology these claims cannot be verified or falsified by readers."
    252     },
    253     {
    254       "flag": "No quality assessment of sources",
    255       "detail": "All 200+ reviewed papers are presented as equally valid regardless of methodological quality; papers with weak empirical designs are described in the same positive register as rigorous benchmarks."
    256     },
    257     {
    258       "flag": "No conflicting findings acknowledged",
    259       "detail": "Every reviewed paper is described positively; the survey contains no discussion of failed replications, contradictory results between benchmarks, or papers that challenge the consensus narrative about agent progress."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Beyond Self-Talk: A Communication-Centric Survey of LLM-Based Multi-Agent Systems",
    265       "relevance": "Directly comparable survey of multi-agent LLM systems used in related work comparison table"
    266     },
    267     {
    268       "title": "Survey on Evaluation of LLM-Based Agents",
    269       "relevance": "Prior comprehensive survey of LLM agent evaluation methodologies; explicitly compared in Table I"
    270     },
    271     {
    272       "title": "Large Language Model Based Multi-Agents: A Survey of Progress and Challenges",
    273       "relevance": "Prior survey tracing evolution from single-agent to multi-agent LLM systems; compared in related work"
    274     },
    275     {
    276       "title": "Why Do Multiagent Systems Fail?",
    277       "relevance": "Empirical study identifying 14 failure modes in 5 multi-agent frameworks across 150 tasks; primary source for challenges section"
    278     },
    279     {
    280       "title": "GAIA: A Benchmark for General AI Assistants",
    281       "relevance": "Key benchmark showing large human-AI performance gap (92% vs 15%); illustrative of capability evaluation challenges"
    282     },
    283     {
    284       "title": "A Benchmark for Tool-Agent-User Interaction in Real-World Domains (τ-bench)",
    285       "relevance": "Key agentic benchmark showing <50% task success for GPT-4o in realistic conversational settings"
    286     },
    287     {
    288       "title": "Humanity's Last Exam",
    289       "relevance": "Expert-level benchmark showing <10% accuracy for all frontier LLMs; central evidence of remaining capability gaps"
    290     },
    291     {
    292       "title": "Model Context Protocol (MCP): Landscape, Security Threats, and Future Research Directions",
    293       "relevance": "Primary source for MCP security vulnerabilities discussed in Section V.F"
    294     },
    295     {
    296       "title": "MultiAgentBench: Evaluating the Collaboration and Competition of LLM Agents",
    297       "relevance": "Recent multi-agent evaluation benchmark covering 6 domains and 4 coordination topologies"
    298     },
    299     {
    300       "title": "Training Software Engineering Agents and Verifiers with SWE-Gym",
    301       "relevance": "Key benchmark for real-world software engineering agent training and evaluation"
    302     }
    303   ],
    304   "engagement_factors": {
    305     "practical_relevance": {
    306       "score": 3,
    307       "justification": "Serves as a dense reference guide for practitioners selecting benchmarks, frameworks, or application patterns for AI agent development across 11 domains."
    308     },
    309     "surprise_contrarian": {
    310       "score": 0,
    311       "justification": "Entirely descriptive and consensus-reinforcing; presents no findings that challenge conventional wisdom about LLM or agent capabilities."
    312     },
    313     "fear_safety": {
    314       "score": 1,
    315       "justification": "Section V.F discusses MCP security vulnerabilities (lack of authentication standards, logging gaps) and cybersecurity benchmarks, but treatment is superficial."
    316     },
    317     "drama_conflict": {
    318       "score": 0,
    319       "justification": "No controversial claims, no critical assessment of specific systems, and no discussion of disagreements in the field."
    320     },
    321     "demo_ability": {
    322       "score": 2,
    323       "justification": "References multiple immediately accessible frameworks (LangChain, LlamaIndex, CrewAI, OpenAI Agents SDK) with URLs; readers can try most reviewed tools."
    324     },
    325     "brand_recognition": {
    326       "score": 2,
    327       "justification": "Features Anthropic's MCP protocol and Claude 3.5 Computer Use, Google's A2A protocol and Gemini, OpenAI's Swarm and Agents SDK, and DeepSeek-R1 throughout."
    328     }
    329   },
    330   "hn_data": {
    331     "threads": [
    332       {
    333         "hn_id": "44407745",
    334         "title": "The Unreasonable Effectiveness of Mathematical Experiments",
    335         "points": 8,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=44407745",
    338         "created_at": "2025-06-28T20:07:21Z"
    339       },
    340       {
    341         "hn_id": "43889722",
    342         "title": "Mega Mass Assembly with JWST: The MIRI EGS Galaxy and AGN Survey",
    343         "points": 6,
    344         "comments": 0,
    345         "url": "https://news.ycombinator.com/item?id=43889722",
    346         "created_at": "2025-05-04T21:26:16Z"
    347       },
    348       {
    349         "hn_id": "44660406",
    350         "title": "Show HN: Single-agent long-horizon reasoning within one LLM run",
    351         "points": 4,
    352         "comments": 1,
    353         "url": "https://news.ycombinator.com/item?id=44660406",
    354         "created_at": "2025-07-23T15:35:17Z"
    355       },
    356       {
    357         "hn_id": "45680925",
    358         "title": "CausalRAG: Integrating Causal Graphs into RAG",
    359         "points": 2,
    360         "comments": 1,
    361         "url": "https://news.ycombinator.com/item?id=45680925",
    362         "created_at": "2025-10-23T12:10:04Z"
    363       }
    364     ],
    365     "top_points": 8,
    366     "total_points": 20,
    367     "total_comments": 2
    368   }
    369 }

Impressum · Datenschutz