ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24436B)


      1 {
      2   "paper": {
      3     "title": "From Horizontal Layering to Vertical Integration: A Comparative Study of the AI-Driven Software Development Paradigm",
      4     "authors": ["Chi Zhang", "Zehan Li", "Ziqian Zhong", "Haibing Ma", "Dan Xiao", "Chen Lin", "Ming Dong"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.22667",
      8     "doi": "10.48550/arXiv.2601.22667"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["case-study", "qualitative"],
     13   "key_findings": "Through a multiple-case comparative study of a traditional enterprise (brownfield) and an AI-native startup (greenfield), the paper claims 8x to 33x reductions in resource consumption when transitioning from horizontal functional silos to vertically integrated 'Super-Cells' where AI-augmented engineers own end-to-end delivery. The efficiency baselines are counterfactual estimates, not observed controls. The paper proposes Human-AI Collaboration Efficacy as a new optimization metric and identifies an 'AI Distortion Effect' that suppresses returns to labor scale.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code, analysis scripts, or repository links are provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No interview transcripts, coding logs, git commit data, or other qualitative data are released. The paper references project artifacts (git logs, meeting minutes) but does not make them available."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a qualitative case study with no computational experiments requiring environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The interview guides, coding schemes, and detailed case selection procedures are not included."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a qualitative case study with no statistical analysis. The quantitative claims (8x, 33x) are simple ratios from counterfactual estimates, not statistical results."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No statistical comparisons are made; this is qualitative research."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No statistical effect sizes are relevant to this qualitative case study design."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper studies two cases but provides no justification for why two cases are sufficient, no discussion of theoretical saturation, and no reference to case study sample size guidelines (e.g., Eisenhardt recommends 4-10 cases)."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs or repeated measurements; qualitative case study design."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The 'traditional baseline' for each case is a counterfactual estimate (what would have happened under traditional methods), not an observed baseline. No actual control group or prior measured project was used for comparison."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The baselines are hypothetical counterfactuals based on function point analysis and 'historical data from similar past projects' (Section 5.1.2), but no specific past projects are identified or their data presented."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is a qualitative case study examining organizational transformation, not a system with decomposable components."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The primary metric is person-month reduction. While the paper discusses cognitive load, role transformation, and organizational complexity conceptually, these are not measured with distinct metrics — only the person-month ratio is quantified."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "There is no system output to evaluate. The paper is about organizational structure, not a technical system."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No benchmark or test set is used; this is qualitative organizational research."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 provides a cross-case comparison broken down by five dimensions: Structure, Process, Role, Efficiency, and Key Issues, comparing traditional vs. AI-driven paradigm across both cases."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.5 discusses risks including ethical concerns, workforce displacement, and dynamic instability. Section 7.4 discusses limitations including elite bias, Hawthorne effect, and long-term technical debt. The 'Black Box Effect' risk is discussed in Section 5.4."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that AI can sometimes slow experienced developers (citing Becker et al. 2025), discusses cognitive overload risks, and acknowledges that the efficiency gains may not generalize to median-level developers (Section 7.4)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims '8× to 33× reductions in resource consumption' but these are based on comparing actual project effort against counterfactual estimates, not observed controls. The abstract presents these as validated findings without qualifying the counterfactual nature of the comparison."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims throughout — 'transitioning from Horizontal Layering to Vertical Integration yields 8× to 33× reductions' — but the study design (two cases, no control group, counterfactual baselines) is inadequate for causal inference. Confounds include the Hawthorne effect (acknowledged in limitations), team selection bias, and project novelty."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper generalizes from two cases to sweeping claims about 'the new paradigm' for software engineering organizations broadly. While limitations mention elite bias and scaling concerns, the title and abstract frame results as a general 'paradigm shift' rather than bounding them to the specific contexts studied."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 7.4 discusses several alternative explanations: the Hawthorne Effect (pilot team awareness), elite bias (high-performing engineers may not be representative), Organization B's dual role introducing observer bias, and model dependency affecting generalizability."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures person-months and frames this as 'efficiency' and 'resource consumption reduction' without discussing what these proxies miss — code quality, long-term maintainability, defect rates, customer satisfaction. Section 7.4 briefly mentions maintainability concerns but does not frame person-month reduction as a proxy for broader organizational effectiveness."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not run any AI models as part of its research methodology. It studies organizations that use AI tools but does not specify or evaluate particular model versions."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting as part of its research methodology."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No AI models are run as part of the research; this is qualitative organizational research."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used in the research methodology itself."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper describes three data sources (participant observation, semi-structured interviews, project documentation) in Section 4.3 but does not describe how qualitative data was coded, analyzed, or reduced. No coding scheme, thematic analysis procedure, or inter-coder reliability is reported."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.4 is a dedicated 'Limitations' subsection covering elite bias, methodological confounds, long-term technical debt, and model dependency."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.4 discusses specific threats: the Hawthorne effect from the Seed Team's pilot status, Organization B's dual role as partner and subject introducing observer bias, elite bias from high-performing engineers, and the 3-month observation window being too short to assess burnout."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.4 states specific scope boundaries: results may not apply to median-level developers, cognitive sustainability over periods longer than 3 months is untested, scaling from a 4-person unit to enterprise-scale is not captured, and efficiency gains are contingent on current LLM capabilities."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No interview transcripts, observation notes, git logs, or project documentation are made available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.3 describes three data collection methods: immersive participant observation, semi-structured interviews with key stakeholders, and analysis of project documentation (git logs, resource reports, meeting minutes)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper states a 'Seed Team was selected from their existing workforce' (Section 4.1.1) but does not describe the selection criteria, how many candidates were considered, or whether the selection could introduce bias toward favorable outcomes."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper does not document how qualitative data was transformed from raw interviews and observations into the findings presented. No coding process, theme derivation, or analytical steps are described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed. The Declarations section explicitly states that Zhang, Li, Zhong, Xiao, and Lin are full-time employees of Organization B, and Ma serves as part-time Chief Scientist."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Organization B serves as both the research partner/subject and the full-time employer of 5 of 7 authors; a sixth author serves as its part-time Chief Scientist. The company has a direct financial interest in demonstrating the superiority of its AI-driven organizational model. The competing interests statement acknowledges this dual role, but no separate funder is disclosed and the de facto sponsor (Organization B) is clearly not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The Declarations section includes a 'Competing Interests' statement that explicitly acknowledges the employment relationship and dual role, though it claims adherence to 'academic standards of objectivity.' Dong M. is stated to have 'no employment ties to Organization B.'"
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate any pre-trained model on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No pre-registration is mentioned for this qualitative study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No IRB or ethics board approval is mentioned despite conducting interviews with employees."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "Interviewees are described only by role (Seed Team members, Company G management, AI engineers from Organization B). No demographic details, experience levels, or number of participants are provided."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No criteria for selecting interview participants are described. The 'Seed Team' selection process is not documented."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "This is a qualitative case study, not an experimental study requiring randomization."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Blinding is not feasible in qualitative case study research."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No information is provided about how many people were interviewed, whether any declined, or whether any data was excluded."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a qualitative organizational study, not a system with inference costs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No computational experiments are conducted."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Transitioning from Horizontal Layering to Vertical Integration yields 8x to 33x reductions in resource consumption.",
    296       "evidence": "Case A (Project W): 12 person-months actual vs. ~100 person-months estimated traditional (8.3x). Case B (AI-CRM): 1.5 person-months actual vs. ~50 person-months estimated traditional (33x). Sections 5.1-5.2. Traditional estimates based on function point analysis and historical data.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "AI adoption enables engineers to transition from execution-focused coding to architectural supervision roles.",
    301       "evidence": "Qualitative interview quotes from Seed Team engineer (Section 5.4): 'Previously, I spent approximately nine hours writing code and thirty minutes on planning and design. Now the ratio has inverted.' Supported by participant observation.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The AI Distortion Effect diminishes returns to labor scale while amplifying technological leverage in TFP.",
    306       "evidence": "Theoretical analysis in Section 6.3 based on Solow TFP framework. Figure 4 shows changing TFP component weights, but values are described as 'developed through expert consensus during qualitative interviews' — not empirically measured.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "Traditional cognitive bandwidth is progressively underutilized as engineers gain experience (Table 1).",
    311       "evidence": "Table 1 presents a 'theoretical conceptual model developed through expert consensus.' The note explicitly states values are 'illustrative estimations intended to visualize the trend' rather than 'precise psychometric measurements.'",
    312       "supported": "unsupported"
    313     }
    314   ],
    315   "red_flags": [
    316     {
    317       "flag": "Severe conflict of interest",
    318       "detail": "5 of 7 authors are full-time employees of Organization B, and a sixth serves as its part-time Chief Scientist. Organization B is simultaneously the 'Research Partner' providing the paradigm and one of the two case subjects. The paper is essentially Organization B evaluating Organization B's own methodology and finding it produces 8-33x efficiency gains. This is analogous to a drug company running its own clinical trial with no independent oversight."
    319     },
    320     {
    321       "flag": "Counterfactual baselines presented as empirical evidence",
    322       "detail": "The 8x and 33x efficiency claims compare actual project effort against hypothetical estimates of what a traditional team would have needed. No actual traditional team delivered the same project. The 'traditional baseline' is a thought experiment using function point analysis, not an observed comparison. The paper presents these ratios as empirical 'findings' without adequately qualifying the counterfactual nature."
    323     },
    324     {
    325       "flag": "Illustrative estimates presented as quantitative data in Table 1",
    326       "detail": "Table 1 presents specific numerical values for 'Cognitive Capacity' and 'Utilized Load' across career stages (e.g., Year 10+ expert has '50 units capacity, 2.5 utilized'). The paper's own note admits these are 'illustrative estimations' from 'expert consensus,' yet they are presented in a formal table with precise numbers, lending false precision to a conceptual argument."
    327     },
    328     {
    329       "flag": "Extreme claims from minimal evidence",
    330       "detail": "33x efficiency gain claimed from a single project done by a single engineer at the company that authored the paper. No independent verification, no code quality assessment, no long-term maintenance evaluation, no customer satisfaction data. The magnitude of the claim far exceeds what two uncontrolled case studies can support."
    331     },
    332     {
    333       "flag": "Hawthorne effect acknowledged but not controlled",
    334       "detail": "Section 7.4 acknowledges the Seed Team knew they were a pilot unit, which likely boosted motivation and performance. Despite acknowledging this confound, the paper still presents the efficiency numbers as evidence of paradigm superiority rather than as potentially inflated by participant awareness."
    335     },
    336     {
    337       "flag": "Qualitative analysis methodology absent",
    338       "detail": "Despite claiming to use triangulation and semi-structured interviews (Section 4.3), the paper provides no coding scheme, no inter-coder reliability, no thematic analysis procedure, no interview guide, and no count of participants. The qualitative methodology is described at a high level but not actually documented or executed with rigor."
    339     },
    340     {
    341       "flag": "Self-citation of unpublished work",
    342       "detail": "The paper cites 'Zhang et al. (2025). The rise of the super employee' from Zenodo, which appears to be a prior version of the same work by the same authors from the same company, used to support the 'Super Employee' concept as if it were independently established."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    348       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu", "Zhen Yang", "Kailong Wang", "Li Li", "Haoyu Wang"],
    349       "year": 2024,
    350       "relevance": "Comprehensive survey of LLMs in software engineering covering code generation, testing, and maintenance."
    351     },
    352     {
    353       "title": "AI Agentic Programming: A Survey of Techniques, Challenges, and Opportunities",
    354       "authors": ["Huanting Wang", "Jingzhi Gong", "Huawei Zhang", "Jie Xu", "Zheng Wang"],
    355       "year": 2025,
    356       "arxiv_id": "2508.11126",
    357       "relevance": "Survey of agentic AI programming techniques relevant to the AI-driven development paradigm."
    358     },
    359     {
    360       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    361       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    362       "year": 2025,
    363       "arxiv_id": "2507.09089",
    364       "relevance": "RCT finding AI tools can slow experienced developers in complex open-source tasks — counter-evidence to uniform productivity gains."
    365     },
    366     {
    367       "title": "GPTs are GPTs: Labor Market Impact Potential of LLMs",
    368       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    369       "year": 2024,
    370       "relevance": "Foundational analysis of LLMs as general-purpose technologies with labor market implications for software engineering."
    371     },
    372     {
    373       "title": "Advancing GenAI Assisted Programming – A Comparative Study on Prompt Efficiency and Code Quality between GPT-4 and GLM-4",
    374       "authors": ["Angus Yang", "Zehan Li", "Jie Li"],
    375       "year": 2024,
    376       "arxiv_id": "2402.12782",
    377       "relevance": "Empirical comparison of LLM code generation efficiency claiming 30-100x improvements over manual coding."
    378     },
    379     {
    380       "title": "A Review on Vibe Coding: Fundamentals, State-of-the-Art, Challenges and Future Directions",
    381       "authors": ["Partha Pratim Ray"],
    382       "year": 2025,
    383       "relevance": "Discusses the 'vibe coding' phenomenon where developers rely on AI output without rigorous verification."
    384     },
    385     {
    386       "title": "Understanding Code Quality: A Qualitative Evaluation of LLM-Generated vs. Human-Written Code",
    387       "authors": ["Adnan Naqvi", "Anish Jain", "Arjun Goyal", "Ankit Verma"],
    388       "year": 2025,
    389       "relevance": "Qualitative evaluation of AI-generated code quality finding subtle high-risk vulnerabilities distinct from human errors."
    390     },
    391     {
    392       "title": "Generative AI at Work",
    393       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    394       "year": 2025,
    395       "relevance": "Empirical study of AI productivity effects showing disproportionate benefits for lower-skilled workers."
    396     },
    397     {
    398       "title": "The Simple Macroeconomics of AI",
    399       "authors": ["Daron Acemoglu"],
    400       "year": 2025,
    401       "relevance": "Economic analysis of AI's impact on total factor productivity and labor markets."
    402     },
    403     {
    404       "title": "The Impact of Generative AI on Critical Thinking",
    405       "authors": ["Hao-Ping (Hank) Lee", "Advait Sarkar", "Lev Tankelevitch", "Ian Drosos", "Sean Rintel", "Richard Banks", "Nicholas Wilson"],
    406       "year": 2025,
    407       "relevance": "Survey finding GenAI reduces information gathering effort but increases cognitive demand for verification — relevant to cognitive load shifts."
    408     }
    409   ]
    410 }

Impressum · Datenschutz