ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (22057B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Five Fatal Assumptions: Why T-Shirt Sizing Systematically Fails for AI Projects",
      6     "authors": [
      7       "Raja Soundaramourty",
      8       "Ozkan Kilic",
      9       "Ramu Chenchaiah"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.17734",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims to identify five fatal assumptions and ground them in empirical literature. The paper delivers this in Section 4 with citations to specific studies for each assumption.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes causal claims throughout, e.g., 'these failures aren't about poor execution' (Section 1.1), AI development 'breaks these rules' (abstract). These causal claims about why estimation fails are argued analytically from cited literature but not tested with original data or controlled study.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 1.4 explicitly defines scope: LLM applications, agentic workflows, RAG systems, and model adaptation. Section 5.2 acknowledges 'simpler ML projects may violate fewer assumptions.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper does not consider alternative explanations for estimation failures in AI projects (e.g., team inexperience, poor requirements, organizational factors). It attributes failures entirely to the five assumptions without considering confounds.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper frames cited metrics like '39% performance degradation' and 'N(N-1) complexity' as direct evidence of estimation failure, without discussing whether these proxy measurements actually predict estimation accuracy.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 5.2 'Limitations' explicitly lists three limitations.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 5.2 lists specific limitations: (1) evidence is qualitative/analytical not from a new controlled study, (2) analysis focuses on LLM/multi-agent and may not apply to simpler ML, (3) Checkpoint Sizing is not empirically validated.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 1.4 explicitly defines what counts as an 'AI project' and Section 5.2 acknowledges the analysis focuses on LLM and multi-agent systems specifically.",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding disclosure or acknowledgments section. All authors are from Cisco Systems, a major technology company.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations at Cisco Systems are clearly stated in the header.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding statement is provided. Cisco, as a technology company developing AI products, has potential interest in outcomes related to AI project estimation methodology. The absence of disclosure is itself a concern.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement. Authors work at Cisco which develops and sells AI products and services.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Only 'AI projects' is precisely defined (1.4). Key terms like 'T-shirt sizing', 'LLMs', 'multi-agent systems', 'RAG', 'evaluation', and 'estimation failure' are used but not formally defined; relies on reader familiarity.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1.2 explicitly states four contributions: (1) identify five fatal assumptions, (2) ground in empirical literature, (3) provide quantitative evidence (N(N−1) complexity, 39% degradation), (4) propose Checkpoint Sizing framework.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Section 2 lists related work on agile estimation (2.1), AI project management (2.2), and LLM-integrated systems (2.3), but engagement is shallow. Cites papers without deeply discussing how this work builds on, differs from, or extends prior insights.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "position": {
    118       "argument_quality": {
    119         "argument_internally_consistent": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Logical structure: T-shirt sizing rests on 5 assumptions → these assumptions fail for AI → therefore T-shirt sizing fails for AI → Checkpoint Sizing is alternative. No internal contradictions or logical fallacies detected.",
    123           "source": "haiku"
    124         },
    125         "counterarguments_addressed": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Section 5.1 dismisses buffer argument ('Buffers Aren't Enough') but doesn't engage with strongest opposing views. A skeptic might argue 'this is just poor team execution' or 'AI estimation is just messier, not fundamentally different'—these are not addressed.",
    129           "source": "haiku"
    130         },
    131         "analogies_appropriate": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Analogies between traditional software (linear scaling, repeatability, fungibility) and AI (exponential curves, dataset uniqueness, sequential bottlenecks) are apt. Figures 1–13 illustrate contrasts clearly without false equivalence.",
    135           "source": "haiku"
    136         },
    137         "prescriptions_proportional": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Paper proposes Checkpoint Sizing as solution, which is proportional to the problem identified. However, the prescription itself is not validated—Section 5.2 admits 'Checkpoint Sizing... [is] not empirically validated.' Proportionality undermined by lack of evidence for the cure.",
    141           "source": "haiku"
    142         },
    143         "evidence_for_claims_cited": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Factual claims reference sources. Examples: scaling laws [11], [12]; multi-turn degradation [4]; multi-agent complexity [1], [3]; ML technical debt [10]; data work challenges [13], [14]. Citations include arXiv IDs and DOIs.",
    147           "source": "haiku"
    148         },
    149         "alternatives_discussed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Paper discusses story points and planning poker (2.1) but does not engage deeply with why these methods might succeed or fail for AI. Section 5.2 explicitly notes 'Checkpoint Sizing... [is] not empirically validated [relative to] other emerging AI estimation approaches,' indicating no comparison provided.",
    153           "source": "haiku"
    154         },
    155         "historical_context_accurate": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Historical references are accurate: Brooks's Law cited correctly (4.3.1); agile estimation history (Cohn 2005, Grenning 2002, McConnell 2006) matches publication records; ML technical debt (Sculley et al. 2015) appropriately cited.",
    159           "source": "haiku"
    160         }
    161       },
    162       "clarity_and_scope": {
    163         "key_terms_defined_precisely": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "'AI projects' is precisely defined (1.4: initiatives where primary risk is model behavior). Other key terms ('T-shirt sizing', 'estimation', 'multi-agent', 'agents') are explained contextually but not formally defined. Assumes reader familiarity.",
    167           "source": "haiku"
    168         },
    169         "engages_with_existing_literature": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Section 2 reviews agile estimation (2.1), AI project management (2.2), and LLM-integrated systems (2.3), but engagement is superficial. Papers are listed with brief summaries, not substantively discussed or integrated into the argument. Reads as background survey, not integrated analysis.",
    173           "source": "haiku"
    174         },
    175         "intended_audience_clear": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Abstract explicitly states: 'This paper is intended for engineering managers, technical leads, and product owners responsible for planning and delivering AI initiatives.' Specific, actionable audience definition.",
    179           "source": "haiku"
    180         },
    181         "assumptions_stated": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Paper derives five assumptions underlying T-shirt sizing (Section 3.1) but does not state its own assumptions. Implicit assumptions: (1) estimation happens upfront; (2) team size/composition is relatively fixed; (3) scope changes are not inherent; (4) traditional software practices are common baseline. These should be explicit.",
    185           "source": "haiku"
    186         },
    187         "scope_of_applicability_discussed": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Section 1.4 states the scope (LLM apps, agentic workflows, RAG, fine-tuning) but does not discuss edge cases or where the argument breaks down, such as simple prompt-based systems, fine-tuning on fixed datasets, or internal tools with deterministic outputs.",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "Improving LLM accuracy from 85% to 95% requires ~10x increase in effort (exponential, not linear scaling)",
    199       "evidence": "Section 4.1.2 cites scaling laws research [11], [12] showing model performance follows power-law relationships with compute and data.",
    200       "supported": "moderate"
    201     },
    202     {
    203       "claim": "LLMs exhibit 39% average performance degradation in multi-turn conversations vs. single-turn",
    204       "evidence": "Section 4.2.2 and Table 3 reference [4] (Laban et al., 'LLMs Get Lost In Multi-Turn Conversation'); stated as 'Large-scale simulations find an average 39% performance drop.'",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Multi-agent interaction complexity grows as N(N−1), not linearly with agent count",
    209       "evidence": "Section 4.1.2 and Table 1 show quadratic growth: 2 agents = 2 interactions, 3 agents = 6, 10 agents = 90. Supported by [1], [3].",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "AI development contains mandatory sequential phases (data→train→eval) that cannot be parallelized",
    214       "evidence": "Section 4.3.2 cites [13] (Paleyes et al., ML deployment survey) and [14] (Sambasivan et al., data cascades) documenting sequential bottlenecks.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Agent systems can enter infinite correction loops (Supervisor rejects Worker, Worker retries indefinitely)",
    219       "evidence": "Section 4.5.2 mentions 'Infinite correction loops' as an emergent failure mode; supported by [1] (multi-agent failure taxonomy). Illustrated in Figure 12 but no quantitative evidence provided.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Data work is the longest and most unpredictable phase in ML projects and cannot be easily parallelized",
    224       "evidence": "Section 4.3.2 cites [14] (Sambasivan et al., 'Everyone wants to do the model work, not the data work') documenting data cascades and bottlenecks.",
    225       "supported": "moderate"
    226     },
    227     {
    228       "claim": "Token starvation: one verbose agent can exhaust shared context window, starving others of reasoning capacity",
    229       "evidence": "Section 4.4.2 and Figure 10 describe scenario but no quantitative evidence provided. Illustrated conceptually but not measured in practice.",
    230       "supported": "weak"
    231     },
    232     {
    233       "claim": "Guardrail tuning requires weeks of iteration (tightening reduces utility, loosening increases risk)",
    234       "evidence": "Section 4.5.2 mentions 'weeks of RLHF or prompt engineering' but cites no source and provides no data to support duration claim.",
    235       "supported": "weak"
    236     }
    237   ],
    238   "methodology_tags": [
    239     "position",
    240     "theoretical",
    241     "case-study"
    242   ],
    243   "key_findings": "T-shirt sizing systematically fails for AI projects because five foundational assumptions that hold in traditional software break down: (1) effort scales linearly (in AI it scales exponentially), (2) past experience predicts future effort (every dataset is unique), (3) effort and duration are interchangeable (sequential bottlenecks exist), (4) tasks decompose independently (data/model/prompt layers are tightly coupled), and (5) completion criteria are deterministic (AI outcomes are probabilistic). The paper proposes Checkpoint Sizing—an iterative framework with explicit decision gates (data readiness, evaluation harness, safety/reliability, cost/latency, operationalization) where scope and feasibility are reassessed based on evidence rather than assumptions.",
    244   "red_flags": [
    245     {
    246       "flag": "No empirical validation",
    247       "detail": "Authors acknowledge (5.2) that 'assumption violations are characterized analytically rather than through a new controlled study.' All claims are theoretical or synthesized from other papers, not validated through original empirical work."
    248     },
    249     {
    250       "flag": "Proposed solution unvalidated",
    251       "detail": "Checkpoint Sizing is presented as alternative but Section 5.2 admits it is 'not empirically validated [relative to] other emerging AI estimation approaches.' The synthetic case study (5.4) does not validate the framework's effectiveness."
    252     },
    253     {
    254       "flag": "Heavy reliance on unreviewed preprints",
    255       "detail": "Primary citations [1]–[5] are all 2024–2025 arXiv preprints (not peer-reviewed). Only supporting references [11], [12], [13], [14] are published. This weakens evidence grounding."
    256     },
    257     {
    258       "flag": "Potential strawman of T-shirt sizing",
    259       "detail": "Paper assumes organizations use T-shirt sizing for AI projects as-is without adaptation. No evidence that teams actually rely on this method for AI without compensatory adjustments or other estimation techniques."
    260     },
    261     {
    262       "flag": "Conflicts of interest not disclosed",
    263       "detail": "Authors from Cisco Systems but no funding disclosure or statement on whether Cisco has business interests in estimation tools, project management software, or AI platforms. Cisco's motivation is unclear."
    264     },
    265     {
    266       "flag": "Synthetic case study only",
    267       "detail": "The 'Support Copilot with RAG + Tool Use' example (5.4) is explicitly synthetic ('Illustrative Case Study'). Outcomes are illustrative, not based on real project data. No retrospective validation against actual AI projects."
    268     },
    269     {
    270       "flag": "Limited engagement with counterarguments",
    271       "detail": "Paper dismisses buffer-based estimation (5.1) but does not deeply engage with skeptics who might argue this is poor execution, not a property of AI itself, or that teams already use different methods."
    272     },
    273     {
    274       "flag": "Generalization to all 'AI projects' may be overstated",
    275       "detail": "Analysis focuses on LLM and multi-agent systems. Simpler ML projects (fixed-data fine-tuning, deployment-only tasks) may not face all five assumption violations, but paper presents framework as universal to 'AI projects.'"
    276     }
    277   ],
    278   "cited_papers": [
    279     {
    280       "title": "Why Do Multi-Agent LLM Systems Fail?",
    281       "authors": "Cemri et al.",
    282       "year": 2025,
    283       "arxiv_id": "2503.13657",
    284       "relevance": "Validates assumptions 4 (decomposability via inter-agent coupling) and 5 (deterministic completion via verification/termination failures). Core reference for multi-agent failure modes."
    285     },
    286     {
    287       "title": "An LLM-based multi-agent framework for agile effort estimation",
    288       "authors": "Bui, Dam, Hoda",
    289       "year": 2025,
    290       "arxiv_id": "2509.14483",
    291       "relevance": "Validates assumptions 2 (repeatability via subjective inconsistency) and 5 (deterministic completion via estimation instability). Directly addresses AI effort estimation challenges."
    292     },
    293     {
    294       "title": "Towards a Science of Scaling Agent Systems",
    295       "authors": "Kim et al.",
    296       "year": 2025,
    297       "arxiv_id": "2512.08296",
    298       "relevance": "Validates assumptions 1 (non-linear scaling), 3 (effort-duration tradeoffs), and 4 (coordination overhead/error amplification). Core reference for agent scaling complexity."
    299     },
    300     {
    301       "title": "LLMs Get Lost In Multi-Turn Conversation",
    302       "authors": "Laban, Hayashi, Zhou, Neville",
    303       "year": 2025,
    304       "arxiv_id": "2505.06120",
    305       "relevance": "Validates assumption 2 (multi-turn unreliability, 39% degradation) and 5 (non-recovering error trajectories). Key evidence for conversation state drift."
    306     },
    307     {
    308       "title": "Effort and Size Estimation in Software Projects with Large Language Model-based Intelligent Interfaces",
    309       "authors": "Coelho et al.",
    310       "year": 2024,
    311       "arxiv_id": "2402.07158",
    312       "relevance": "Validates assumptions 2 (hidden uncertainty) and 5 (evolving spec/AI interface behavior). Directly addresses LLM-integrated project estimation."
    313     },
    314     {
    315       "title": "Hidden Technical Debt in Machine Learning Systems",
    316       "authors": "Sculley et al.",
    317       "year": 2015,
    318       "venue": "NIPS",
    319       "relevance": "Foundational reference on ML-specific technical challenges (data dependencies, feedback loops, configuration debt). Supports overall argument that AI differs from traditional software."
    320     },
    321     {
    322       "title": "Everyone wants to do the model work, not the data work: Data Cascades in High-Stakes AI",
    323       "authors": "Sambasivan et al.",
    324       "year": 2021,
    325       "venue": "CHI",
    326       "relevance": "Validates assumption 3 (sequential data bottlenecks cannot be parallelized). Empirical study of data work challenges in high-stakes ML projects."
    327     },
    328     {
    329       "title": "Scaling Laws for Neural Language Models",
    330       "authors": "Kaplan et al.",
    331       "year": 2020,
    332       "arxiv_id": "2001.08361",
    333       "relevance": "Foundational work on power-law relationships between compute/data and performance. Validates assumption 1 (exponential effort for marginal gains)."
    334     },
    335     {
    336       "title": "Challenges in Deploying Machine Learning: A Survey of Case Studies",
    337       "authors": "Paleyes, Urma, Lawrence",
    338       "year": 2022,
    339       "venue": "ACM Computing Surveys",
    340       "relevance": "Survey of real-world ML deployment challenges including sequential bottlenecks. Supports assumption 3 (irreducible sequential dependencies)."
    341     }
    342   ],
    343   "engagement_factors": {
    344     "practical_relevance": {
    345       "score": 2,
    346       "justification": "Checkpoint Sizing framework is conceptually actionable for engineering managers, but no tooling, templates, or implementation guide provided. Practitioners would need to operationalize the framework themselves."
    347     },
    348     "surprise_contrarian": {
    349       "score": 2,
    350       "justification": "The core insight (AI estimation differs from software) is not surprising to practitioners who have shipped AI projects. The 5-assumption framework adds structure but doesn't fundamentally challenge existing practice."
    351     },
    352     "fear_safety": {
    353       "score": 0,
    354       "justification": "Paper focuses on estimation methodology, not AI safety, risk, alignment, or societal impact. Safety is mentioned only as a checkpoint (Gate C) but not as a risk concern."
    355     },
    356     "drama_conflict": {
    357       "score": 1,
    358       "justification": "Title promises 'fatal assumptions,' but the framing is methodological rather than dramatic. Failure modes (infinite loops, guardrail oscillation) have some conflict angle but are not positioned sensationally."
    359     },
    360     "demo_ability": {
    361       "score": 1,
    362       "justification": "Position paper with no code, tooling, or interactive demo. Checkpoint Sizing could theoretically be applied to a real team, but no template or software to 'try now.' Synthetic case study is illustrative, not runnable."
    363     },
    364     "brand_recognition": {
    365       "score": 1,
    366       "justification": "Authors from Cisco Systems, a recognizable company but not a top-tier AI research lab (not OpenAI, Anthropic, DeepMind, Google, Meta). Limited brand pull in academic or AI circles."
    367     }
    368   },
    369   "hn_data": {
    370     "threads": [
    371       {
    372         "hn_id": "47279778",
    373         "title": "Nested Training for Mutual Adaptation in Human-AI Teaming",
    374         "points": 2,
    375         "comments": 0,
    376         "url": "https://news.ycombinator.com/item?id=47279778",
    377         "created_at": "2026-03-06T19:21:19Z"
    378       }
    379     ],
    380     "top_points": 2,
    381     "total_points": 2,
    382     "total_comments": 0
    383   }
    384 }

Impressum · Datenschutz