ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (21232B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "Five Fatal Assumptions: Why T-Shirt Sizing Systematically Fails for AI Projects",
      6     "authors": [
      7       "Raja Soundaramourty",
      8       "O. Kilic",
      9       "R. Chenchaiah"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.17734",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All five fatal assumptions claimed in the abstract are systematically analyzed with evidence in Sections 4.1–4.5. Checkpoint Sizing is presented in Section 5.3 with framework and pseudocode.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Causal mechanisms (e.g., 'T-shirt assumptions fail → estimation error') rely on cited literature rather than original causal studies by the authors. Paper synthesizes rather than independently validates causal claims.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Section 1.4 explicitly scopes 'AI projects' to LLM applications, agentic workflows, RAG, and model adaptation. Limitations (5.2) acknowledge analysis focuses on LLM/multi-agent systems; simpler ML may differ.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Paper attributes estimation failures to violated T-shirt sizing assumptions but does not explore alternative explanations: team inexperience, tool misuse, organizational factors, or whether the problem is inherent to AI rather than to the methodology.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Paper discusses direct concepts (effort, duration, completion criteria) without relying on problematic proxies. No confusion between measured variables and claimed outcomes.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 5.2 titled 'Limitations' explicitly states three limitations: qualitative evidence (no controlled study), generalization scope (LLM/multi-agent focus), and alternative methods not empirically validated.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Threats are specific: 'assumption violations are characterized analytically rather than through a new controlled study,' 'analysis focuses on LLM and multi-agent systems,' and 'Checkpoint Sizing effectiveness not empirically validated.'",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 1.4 explicitly bounds scope to AI projects where model/data-dependent uncertainty drives delivery risk. Limitations note simpler ML may violate fewer assumptions.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding statement provided. Paper does not disclose whether work was independently funded, sponsored, or supported by employer.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors list 'Cisco Systems, Inc.' as affiliation. However, potential conflicts of interest with product positioning or corporate interests are not discussed.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No explicit funder identified beyond employer. Independence from Cisco's interests in AI/estimation tooling cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement. No disclosure of patents, equity stakes, consulting relationships, or financial interests in estimation tools/frameworks.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: T-shirt sizing explained in introduction, 'AI projects' formally scoped in Section 1.4 (LLM applications, agentic workflows, RAG, model adaptation), Checkpoint Sizing framework presented in Section 5.3.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1.2 explicitly lists four contributions: (1) identification of five fatal assumptions, (2) empirical grounding in literature, (3) quantitative evidence, (4) alternative framework (Checkpoint Sizing).",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Section 2 surveys prior work on agile estimation and AI project management but engages superficially—mostly summarizing what prior work found rather than critically positioning new contribution relative to existing knowledge.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "position": {
    118       "argument_quality": {
    119         "argument_internally_consistent": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "Argument is logically consistent: T-shirt sizing assumes X, AI development violates X, therefore estimation fails. Each assumption is traced to failure mode. However, Checkpoint Sizing also relies on untested assumptions (that decision gates provide actionable evidence).",
    123           "source": "haiku"
    124         },
    125         "counterarguments_addressed": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "No serious engagement with counterarguments: Could teams improve AI estimation accuracy through experience? Could larger buffers solve the problem? Are unknown unknowns fixable via any methodology? Are there downsides to Checkpoint Sizing?",
    129           "source": "haiku"
    130         },
    131         "analogies_appropriate": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "Analogies are valid: linear vs. non-linear effort curves (Figure 1), circular dependency trap (Figure 3), guardrail oscillation as 'whack-a-mole' (Figure 13). Parallels are apt, not false equivalences.",
    135           "source": "haiku"
    136         },
    137         "prescriptions_proportional": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Paper prescribes abandoning T-shirt sizing and adopting Checkpoint Sizing, but provides no evidence Checkpoint Sizing works. Recommending a replacement methodology without proof it succeeds is disproportionate to the evidence presented.",
    141           "source": "haiku"
    142         },
    143         "evidence_for_claims_cited": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "Factual claims are cited: exponential scaling cites [11][12], 39% multi-turn degradation cites [4], N(N-1) agent complexity cites [1]. Appendix A documents reference validation.",
    147           "source": "haiku"
    148         },
    149         "alternatives_discussed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Paper proposes Checkpoint Sizing as the alternative framework without discussing other emerging AI estimation methodologies or competing proposals.",
    153           "source": "haiku"
    154         },
    155         "historical_context_accurate": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Historical references are accurate: T-shirt sizing correctly explained, Story Points and Planning Poker accurately characterized, Brooks's Law and Cone of Uncertainty correctly cited.",
    159           "source": "haiku"
    160         }
    161       },
    162       "clarity_and_scope": {
    163         "key_terms_defined_precisely": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Key terms lack precision: 'deterministic vs. probabilistic completion' discussed informally, 'effort' undefined, 'checkpoint readiness' criteria vague. Algorithm 1 requires 'Evidence' but does not specify what constitutes sufficient evidence.",
    167           "source": "haiku"
    168         },
    169         "engages_with_existing_literature": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Section 2 reviews literature descriptively rather than critically. Paper summarizes prior work (Amershi, Sculley, etc.) but does not substantively discuss how this contribution differs from, extends, or contradicts existing research.",
    173           "source": "haiku"
    174         },
    175         "intended_audience_clear": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Abstract explicitly states: 'This paper is intended for engineering managers, technical leads, and product owners responsible for planning and delivering AI initiatives.'",
    179           "source": "haiku"
    180         },
    181         "assumptions_stated": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Implicit assumptions are not made explicit: (1) five assumptions framework correctly diagnoses root cause, (2) Checkpoint Sizing will outperform T-shirt sizing, (3) evidence from recent arXiv papers generalizes beyond LLM/multi-agent systems.",
    185           "source": "haiku"
    186         },
    187         "scope_of_applicability_discussed": {
    188           "applies": true,
    189           "answer": false,
    190         "justification": "Scope is explicitly bounded to LLM/multi-agent systems, but the paper does not discuss applicability to other AI domains (vision, robotics), team structures, organizational maturity, or regulatory/safety-critical contexts.",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "T-shirt sizing assumes linear effort scaling, but AI systems exhibit exponential effort curves where incremental performance gains require disproportionate resources",
    199       "evidence": "Section 4.1 presents Figure 1 (linear vs. non-linear comparison) and cites [11][12] on power-law relationships between compute/data and model performance.",
    200       "supported": "strong"
    201     },
    202     {
    203       "claim": "Multi-agent interaction complexity grows combinatorially as N(N-1), not linearly with agent count",
    204       "evidence": "Section 4.1 presents Table 1 and Figure 2 documenting combinatorial growth. Cites [1] on multi-agent failure modes.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "LLMs exhibit 39% average performance degradation in multi-turn conversations compared to single-turn interactions",
    209       "evidence": "Section 4.2.2 cites [4] as primary source. Figure 5 illustrates context degradation in multi-turn settings.",
    210       "supported": "moderate"
    211     },
    212     {
    213       "claim": "AI development contains irreducible sequential dependencies (data→train→evaluate) that cannot be compressed with added headcount",
    214       "evidence": "Section 4.3.2 lists mandatory sequential phases. Cites [13][14] on ML deployment bottlenecks and data work as longest phase.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Data engineering, model architecture, and prompt engineering form a tightly coupled system where changes cascade unexpectedly",
    219       "evidence": "Section 4.4.2 discusses tight coupling across stack (Figure 9). Cites [1][3] on system interdependencies and error amplification.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "AI project completion criteria are probabilistic, not deterministic; safety/legal constraints can turn 'done' projects into major rework",
    224       "evidence": "Section 4.5 discusses moving goalpost problem, termination failures, guardrail oscillation. Cites [1][2].",
    225       "supported": "moderate"
    226     },
    227     {
    228       "claim": "Each new AI dataset introduces unique 'unknown unknowns' (data corruption, bias, distribution shifts) that emerge only during training/evaluation",
    229       "evidence": "Section 4.2.2 argues dataset uniqueness prevents reliable analogies. Cites [2][5] on hidden uncertainty in AI projects.",
    230       "supported": "moderate"
    231     },
    232     {
    233       "claim": "Checkpoint Sizing with explicit decision gates and evidence-based reassessment provides a better estimation framework for AI projects than T-shirt sizing",
    234       "evidence": "Section 5.3 proposes framework (Algorithm 1, gate checklist). Section 5.4 presents synthetic case study showing estimate evolution across gates.",
    235       "supported": "weak"
    236     }
    237   ],
    238   "methodology_tags": [
    239     "theoretical",
    240     "qualitative"
    241   ],
    242   "key_findings": "T-shirt sizing systematically fails for AI projects because five foundational assumptions do not hold: (1) linear effort scaling (AI exhibits exponential curves), (2) repeatability from prior experience (every dataset is unique), (3) effort-duration fungibility (sequential dependencies create timeline floors), (4) task decomposability (tight coupling across data/model/prompt layers), and (5) deterministic completion criteria (probabilistic, evolving constraints). The paper proposes Checkpoint Sizing as an alternative: an iterative methodology using explicit decision gates (data readiness, evaluation harness, safety/reliability, cost/latency budgets, operationalization) to reassess scope and timeline based on empirical evidence rather than analogy. Checkpoint Sizing treats initial estimates as testable hypotheses rather than commitments.",
    243   "red_flags": [
    244     {
    245       "flag": "No empirical validation of core claims",
    246       "detail": "Paper is analytical synthesis of literature. No original experiments, real case studies, or quantitative comparison between T-shirt sizing and Checkpoint Sizing on actual AI projects."
    247     },
    248     {
    249       "flag": "Checkpoint Sizing effectiveness unproven",
    250       "detail": "Proposed alternative lacks evidence of effectiveness. Synthetic case study (5.4) illustrates framework mechanics but does not prove it reduces estimation error or schedule slip, or improves outcomes."
    251     },
    252     {
    253       "flag": "Counterarguments unaddressed",
    254       "detail": "Paper does not explore alternative explanations for AI estimation failure (team inexperience, tool misuse, organizational dysfunction). Does not discuss whether Checkpoint Sizing itself introduces overhead or new failure modes."
    255     },
    256     {
    257       "flag": "Heavy reliance on recent arXiv papers",
    258       "detail": "Evidence grounded in 5 recent (2024–2025) arXiv papers [1–5]. Limited engagement with older/canonical ML systems literature. Only Sculley et al. (2015) provides established baseline; generalizability of recent papers uncertain."
    259     },
    260     {
    261       "flag": "Potential conflict of interest not disclosed",
    262       "detail": "All three authors from Cisco Systems, Inc. No statement on funding independence or conflict of interest. Unclear whether work is independent research or influenced by Cisco's interests in AI estimation/tooling products."
    263     },
    264     {
    265       "flag": "Checkpoint readiness criteria vague",
    266       "detail": "Algorithm 1 and Gate Checklist specify artifacts (data inventory, eval pipeline, etc.) but not acceptance criteria. Who decides when 'data readiness' is sufficient? How to handle disagreement? No guidance on checkpoint failure/iteration."
    267     },
    268     {
    269       "flag": "Overclaimed 'fatality' of assumptions",
    270       "detail": "Paper frames assumptions as 'fatal' without proving they're always problematic or impossible to address within T-shirt sizing (e.g., adding larger buffers, breaking projects into smaller chunks)."
    271     },
    272     {
    273       "flag": "Limited scope and generalization",
    274       "detail": "Analysis focuses on LLM/multi-agent systems. Applicability unclear for other AI domains (vision, robotics), smaller/mature teams, or non-commercial research contexts."
    275     }
    276   ],
    277   "cited_papers": [
    278     {
    279       "title": "Why Do Multi-Agent LLM Systems Fail?",
    280       "authors": "M. Cemri et al.",
    281       "year": 2025,
    282       "arxiv_id": "2503.13657",
    283       "relevance": "Primary evidence for Assumption 4 (task decomposability via inter-agent coupling) and 5 (deterministic completion via verification/termination failures)."
    284     },
    285     {
    286       "title": "An LLM-based multi-agent framework for agile effort estimation",
    287       "authors": "T.-L. Bui, H. K. Dam, R. Hoda",
    288       "year": 2025,
    289       "arxiv_id": "2509.14483",
    290       "relevance": "Validates Assumption 2 (repeatability via subjective inconsistency) and 5 (deterministic completion via estimation instability in LLM-based systems)."
    291     },
    292     {
    293       "title": "Towards a Science of Scaling Agent Systems",
    294       "authors": "Y. Kim et al.",
    295       "year": 2025,
    296       "arxiv_id": "2512.08296",
    297       "relevance": "Primary evidence for Assumption 1 (non-linear scaling), 3 (effort-duration tradeoffs), and 4 (coordination overhead/error amplification in multi-agent systems)."
    298     },
    299     {
    300       "title": "LLMs Get Lost In Multi-Turn Conversation",
    301       "authors": "P. Laban, H. Hayashi, Y. Zhou, J. Neville",
    302       "year": 2025,
    303       "arxiv_id": "2505.06120",
    304       "relevance": "Primary evidence for 39% multi-turn performance degradation (Assumption 2: repeatability) and non-recovering error trajectories (Assumption 5: deterministic completion)."
    305     },
    306     {
    307       "title": "Effort and Size Estimation in Software Projects with Large Language Model-based Intelligent Interfaces",
    308       "authors": "C. N. Coelho Jr et al.",
    309       "year": 2024,
    310       "arxiv_id": "2402.07158",
    311       "relevance": "Validates Assumption 2 (repeatability via hidden uncertainty in AI projects) and 5 (deterministic completion via evolving specifications and AI interface behavior)."
    312     },
    313     {
    314       "title": "Software Engineering for Machine Learning: A Case Study",
    315       "authors": "S. Amershi et al.",
    316       "year": 2019,
    317       "venue": "IEEE/ACM ICSE-SEIP",
    318       "relevance": "Foundational work identifying nine ML workflow characteristics (data dependencies, experimental iteration, model decay) that differ from traditional software, establishing that AI development violates software estimation assumptions."
    319     },
    320     {
    321       "title": "Hidden Technical Debt in Machine Learning Systems",
    322       "authors": "D. Sculley et al.",
    323       "year": 2015,
    324       "venue": "NeurIPS",
    325       "relevance": "Established literature on ML technical debt showing conventional software engineering intuitions systematically mislead practitioners, supporting Assumptions 2, 4, and 5."
    326     },
    327     {
    328       "title": "Challenges in Deploying Machine Learning: A Survey of Case Studies",
    329       "authors": "A. Paleyes, R.-G. Urma, N. Lawrence",
    330       "year": 2022,
    331       "venue": "ACM Computing Surveys",
    332       "relevance": "Documents sequential bottlenecks in real-world ML deployments (data→train→evaluate), supporting Assumption 3 (irreducible sequential dependencies)."
    333     }
    334   ],
    335   "engagement_factors": {
    336     "practical_relevance": {
    337       "score": 2,
    338       "justification": "Practitioners could attempt Checkpoint Sizing, but lack of empirical validation makes adoption risky. Framework is intuitive but unproven relative to alternatives."
    339     },
    340     "surprise_contrarian": {
    341       "score": 2,
    342       "justification": "Challenges widespread T-shirt sizing use, but difficulties with AI estimation are well-known to experienced practitioners. Not surprising to practitioners who have lived the problem."
    343     },
    344     "fear_safety": {
    345       "score": 1,
    346       "justification": "Mentions safety validation and hallucination rates as hidden costs, but frames as project management challenge rather than AI safety/existential risk concern."
    347     },
    348     "drama_conflict": {
    349       "score": 2,
    350       "justification": "Moderate controversy over whether T-shirt sizing is broken for AI. Practical methodology debate rather than high-drama industry conflict or heated disagreement."
    351     },
    352     "demo_ability": {
    353       "score": 1,
    354       "justification": "Checkpoint Sizing is a conceptual framework with no accompanying software, tooling, or live demo. Would require manual implementation and interpretation to evaluate."
    355     },
    356     "brand_recognition": {
    357       "score": 2,
    358       "justification": "Cisco Systems is a well-known company, but the authors are not prominent AI researchers. The work is not associated with a top-tier AI research lab or famous research group."
    359     }
    360   },
    361   "hn_data": {
    362     "threads": [
    363       {
    364         "hn_id": "47279778",
    365         "title": "Nested Training for Mutual Adaptation in Human-AI Teaming",
    366         "points": 2,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=47279778",
    369         "created_at": "2026-03-06T19:21:19Z"
    370       }
    371     ],
    372     "top_points": 2,
    373     "total_points": 2,
    374     "total_comments": 0
    375   }
    376 }

Impressum · Datenschutz