ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19008B)


      1 {
      2   "paper": {
      3     "title": "Five Fatal Assumptions: Why T-Shirt Sizing Systematically Fails for AI Projects",
      4     "authors": ["Raja Soundaramourty", "Ozkan Kilic", "Ramu Chenchaiah"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.17734"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["theoretical", "qualitative"],
     12   "key_findings": "The paper identifies five assumptions underlying T-shirt sizing that fail for AI projects: linear effort scaling, repeatability from prior experience, effort-duration fungibility, task decomposability, and deterministic completion criteria. It grounds each failure in existing empirical literature on multi-agent systems and LLM behavior, citing N(N-1) interaction complexity and 39% multi-turn performance degradation. The paper proposes Checkpoint Sizing, an iterative estimation approach with explicit decision gates, but does not empirically validate it.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code or repository link is provided. The paper contains pseudocode for Checkpoint Sizing (Algorithm 1) but no implementation."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset or analysis data is released. The paper relies on citing existing literature rather than collecting new data."
     24       },
     25       "environment_specified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "This is a theoretical/analytical paper with no computational experiments requiring an environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": false,
     32         "answer": false,
     33         "justification": "No experiments to reproduce. The paper is an analytical argument grounded in literature synthesis."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "The paper presents no original quantitative results. All numbers cited (e.g., 39% degradation, N(N-1) complexity) come from other papers."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No original experiments or statistical comparisons are performed."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No original experiments. Effect sizes cited are from referenced papers."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original data collection. This is a theoretical paper."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No original experiments to report variance for."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "T-shirt sizing itself serves as the baseline against which the proposed Checkpoint Sizing is compared, with traditional software as the control domain."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper compares against T-shirt sizing (a decades-old practice) but does not compare Checkpoint Sizing against other contemporary AI estimation approaches that may exist."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No system with components to ablate. This is a theoretical framework."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No quantitative evaluation of the proposed framework is performed."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No system output to evaluate. The paper proposes a framework without empirical testing."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No dataset or experimental evaluation."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper provides detailed per-assumption analysis across all five fatal assumptions (Sections 4.1-4.5), with each assumption analyzed separately."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The entire paper is about failure cases of T-shirt sizing. Specific failure modes include circular dependency traps (Section 4.1), context degradation (Section 4.2), observability gaps (Section 4.3), shared resource contention (Section 4.4), and guardrail oscillation (Section 4.5)."
    103       },
    104       "negative_results_reported": {
    105         "applies": false,
    106         "answer": false,
    107         "justification": "No experiments were run, so there are no negative experimental results to report."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims to identify five fatal assumptions and ground them in empirical literature. The paper delivers this in Section 4 with citations to specific studies for each assumption."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims throughout, e.g., 'these failures aren't about poor execution' (Section 1.1) and AI development 'breaks these rules' (abstract). These causal claims about why estimation fails are argued analytically from cited literature but are not tested with original data or a controlled study."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 1.4 explicitly defines scope: LLM applications, agentic workflows, RAG systems, and model adaptation. Section 5.2 acknowledges 'simpler ML projects may violate fewer assumptions.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not consider alternative explanations for estimation failures in AI projects (e.g., team inexperience, poor requirements, organizational factors). It attributes failures entirely to the five assumptions without considering confounds."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper frames cited metrics like '39% performance degradation' and 'N(N-1) complexity' as direct evidence of estimation failure, without discussing whether these proxy measurements actually predict estimation accuracy."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "No models are used. This is a theoretical paper."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No experiments with hyperparameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section 3.1 describes the assumption identification process but does not document the literature search methodology: no search queries, databases searched, inclusion/exclusion criteria, or number of papers reviewed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5.2 'Limitations' explicitly lists three limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5.2 lists specific limitations: (1) evidence is qualitative/analytical rather than from a new controlled study, (2) analysis focuses on LLM/multi-agent systems and may not apply to simpler ML, (3) Checkpoint Sizing is not empirically validated."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 1.4 explicitly defines what counts as an 'AI project' and Section 5.2 acknowledges the analysis focuses on LLM and multi-agent systems specifically."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No original data collected. The paper synthesizes arguments from existing literature."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Section 3 describes the methodology at a high level (four-step process) but does not describe how the literature was collected — no search strategy, databases, date ranges, or selection criteria."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants and no data collection requiring recruitment."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The paper arrives at '5 cited papers validate 5 assumptions' (Appendix A) without documenting how these 5 papers were selected from the broader literature or whether other evidence was considered and excluded."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure or acknowledgments section. All authors are from Cisco Systems, a major technology company."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations at Cisco Systems are clearly stated in the header."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding statement is provided. Cisco, as a technology company developing AI products, has potential interest in outcomes related to AI project estimation methodology. The absence of disclosure is itself a concern."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement. Authors work at Cisco which develops and sells AI products and services."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "No pre-trained model is evaluated on any benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No model evaluation on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Theoretical paper with no computational method to cost."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Theoretical paper with no computation."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "T-shirt sizing rests on five implicit assumptions that systematically fail in AI contexts: linear scaling, repeatability, effort-duration fungibility, decomposability, and deterministic completion.",
    295       "evidence": "Section 4 provides analytical arguments for each assumption's failure, citing literature on multi-agent failure modes [1], scaling properties [3], and multi-turn conversation degradation [4].",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Multi-agent system interaction complexity grows as N(N-1) with agent count.",
    300       "evidence": "Table 1 in Section 4.1 presents the formula and examples. This is a mathematical identity for pairwise interactions, not an empirical finding of the paper itself.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "LLMs exhibit 39% average performance degradation in multi-turn conversations vs single-turn.",
    305       "evidence": "Cited from reference [4] (Laban et al., 2025) in Sections 4.2 and 4.4. This is a pass-through claim from another paper, not independently verified.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Checkpoint Sizing with explicit decision gates is a more appropriate estimation methodology for AI projects than T-shirt sizing.",
    310       "evidence": "Section 5.3 proposes the framework with pseudocode and a synthetic case study (Section 5.4). No empirical validation is provided. The authors acknowledge this in Section 5.2.",
    311       "supported": "weak"
    312     }
    313   ],
    314   "red_flags": [
    315     {
    316       "flag": "No empirical validation",
    317       "detail": "The paper proposes Checkpoint Sizing as an alternative but provides zero empirical evidence it works better. The synthetic case study in Section 5.4 is entirely constructed by the authors, not based on real project data."
    318     },
    319     {
    320       "flag": "Circular validation methodology",
    321       "detail": "Appendix A shows that all 5 references were selected specifically to validate the 5 assumptions, yielding '100% relevance' scores. This is tautological — papers were chosen because they support the claims, then scored as supporting the claims."
    322     },
    323     {
    324       "flag": "Extremely narrow evidence base",
    325       "detail": "The entire analytical framework rests on only 5 primary references [1]-[5]. For a paper making sweeping claims about AI project estimation failures, the evidence base is remarkably thin. No industry data, project post-mortems, or estimation accuracy studies are cited."
    326     },
    327     {
    328       "flag": "Company affiliation without disclosure",
    329       "detail": "All three authors work at Cisco Systems, a major technology company with commercial AI products. No funding, competing interests, or conflict of interest statements are provided."
    330     },
    331     {
    332       "flag": "Claims outrun evidence",
    333       "detail": "The paper claims estimation failures are 'systematic' and 'fatal' but provides no data on actual estimation accuracy in AI projects. The five assumptions are plausible but the magnitude of their impact on real estimation is entirely unquantified."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Why Do Multi-Agent LLM Systems Fail?",
    339       "authors": ["M. Cemri"],
    340       "year": 2025,
    341       "arxiv_id": "2503.13657",
    342       "relevance": "Identifies 14 failure modes in multi-agent LLM systems across system design, inter-agent misalignment, and task verification."
    343     },
    344     {
    345       "title": "An LLM-based multi-agent framework for agile effort estimation",
    346       "authors": ["T.-L. Bui", "H. K. Dam", "R. Hoda"],
    347       "year": 2025,
    348       "arxiv_id": "2509.14483",
    349       "relevance": "Directly addresses LLM-based approaches to agile effort estimation, relevant to AI-assisted software engineering."
    350     },
    351     {
    352       "title": "Towards a Science of Scaling Agent Systems",
    353       "authors": ["Y. Kim"],
    354       "year": 2025,
    355       "arxiv_id": "2512.08296",
    356       "relevance": "Quantifies non-linear performance regimes and coordination trade-offs in scaling agent systems."
    357     },
    358     {
    359       "title": "LLMs Get Lost In Multi-Turn Conversation",
    360       "authors": ["P. Laban", "H. Hayashi", "Y. Zhou", "J. Neville"],
    361       "year": 2025,
    362       "arxiv_id": "2505.06120",
    363       "relevance": "Documents 39% average performance degradation in multi-turn LLM conversations, relevant to reliability of agentic systems."
    364     },
    365     {
    366       "title": "Effort and Size Estimation in Software Projects with Large Language Model-based Intelligent Interfaces",
    367       "authors": ["C. N. Coelho Jr"],
    368       "year": 2024,
    369       "arxiv_id": "2402.07158",
    370       "relevance": "Addresses estimation challenges introduced by LLM-based interfaces in software projects."
    371     },
    372     {
    373       "title": "Software Engineering for Machine Learning: A Case Study",
    374       "authors": ["S. Amershi"],
    375       "year": 2019,
    376       "relevance": "Identifies nine characteristics of ML workflows that differ from traditional software, foundational to understanding AI project estimation challenges."
    377     },
    378     {
    379       "title": "Hidden Technical Debt in Machine Learning Systems",
    380       "authors": ["D. Sculley"],
    381       "year": 2015,
    382       "relevance": "Characterizes ML-specific technical debt patterns that affect project estimation and maintenance effort."
    383     },
    384     {
    385       "title": "Scaling Laws for Neural Language Models",
    386       "authors": ["J. Kaplan"],
    387       "year": 2020,
    388       "arxiv_id": "2001.08361",
    389       "relevance": "Establishes power-law scaling relationships between compute/data and model performance, relevant to non-linear effort scaling."
    390     },
    391     {
    392       "title": "Training Compute-Optimal Large Language Models",
    393       "authors": ["J. Hoffmann"],
    394       "year": 2022,
    395       "arxiv_id": "2203.15556",
    396       "relevance": "Chinchilla scaling laws for compute-optimal training, relevant to understanding resource requirements for AI systems."
    397     },
    398     {
    399       "title": "Beyond Accuracy: Behavioral Testing of NLP Models with CheckList",
    400       "authors": ["M. T. Ribeiro", "T. Wu", "C. Guestrin", "S. Singh"],
    401       "year": 2020,
    402       "relevance": "Provides behavioral testing framework for NLP models, referenced for evaluation coverage in Checkpoint Sizing gates."
    403     }
    404   ]
    405 }

Impressum · Datenschutz