scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24684B)
      1 {
      2   "paper": {
      3     "title": "Towards AI-Native Software Engineering (SE 3.0): A Vision and a Challenge Roadmap",
      4     "authors": [
      5       "Ahmed E. Hassan",
      6       "Gustavo A. Oliva",
      7       "Dayi Lin",
      8       "Boyuan Chen",
      9       "Zhen Ming (Jack) Jiang"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv",
     13     "arxiv_id": "2410.06107"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No source code or repository link is provided in the paper. The paper describes a technology stack (Compiler.next, Runtime.next, etc.) but does not release any implementation artifacts."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No dataset is released. The paper is a vision paper that draws on surveys of literature, discussions with industry leaders, and the authors' practical experience, but none of these inputs are made publicly available."
     26       },
     27       "environment_specified": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "This is a vision/position paper with no experiments to reproduce. Environment specifications are not applicable."
     31       },
     32       "reproduction_instructions": {
     33         "applies": false,
     34         "answer": false,
     35         "justification": "This is a vision/position paper with no experiments. There are no results to reproduce."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "This is a vision/position paper that does not present its own quantitative experimental results. No statistical analysis is performed."
     43       },
     44       "significance_tests": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No comparative experiments are conducted in this paper, so significance tests are not applicable."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No experiments are conducted in this paper. Some numbers are mentioned from companion papers (e.g., '30% latency improvement', '50% fewer requests routed'), but these are results from other papers, not from this paper's own analysis."
     53       },
     54       "sample_size_justified": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experiments with sample sizes are conducted in this paper."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No experiments are conducted in this paper. Variance reporting is not applicable."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "This is a vision paper that does not conduct experiments. SE 2.0 is discussed as a conceptual contrast to SE 3.0, but no empirical baseline comparison is made."
     70       },
     71       "baselines_contemporary": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No experiments are conducted, so baseline recency is not applicable."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No system is evaluated in this paper; ablation studies are not applicable."
     80       },
     81       "multiple_metrics": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No experiments are conducted in this paper."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No system outputs are evaluated in this paper. Human evaluation is not applicable to a vision paper."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No experiments are conducted in this paper."
     95       },
     96       "per_category_breakdown": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "No experimental results are presented that could be broken down by category."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 2.2 critically discusses the limitations and failure cases of SE 2.0, including cognitive overload (Section 2.2.1), inefficient model training (Section 2.2.2), and suboptimal code quality/additive bias (Section 2.2.3). Section 2.3 discusses limitations of autonomous software engineers."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper acknowledges negative aspects: SE 2.0 limitations, SWE-bench intrinsic limitations (Section 2.3), the fragility of prompts (Section 4.5), and the fact that the SE 3.0 challenges remain unsolved. Section 4.6 also lists open questions the authors have not yet developed a vision for."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims the paper proposes a vision for SE 3.0 with intent-centric development and outlines key components and challenges. The body of the paper delivers on these claims: Section 3 presents the vision and technology stack, and Section 4 presents challenges. The claims are appropriately framed as visionary rather than empirical."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes several causal claims without adequate empirical backing, such as the claim that SE 2.0 causes 'cognitive overload on developers' (Section 2.2.1) and the claim that AI-generated code 'contaminates future training data, creating a feedback loop that further degrades model quality' (Section 2.2.3). These are stated as established facts but are supported only by reasoning and selective citations, not by the authors' own controlled studies."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper makes broad claims about the future of software engineering without clearly bounding them. For example, the abstract states SE 3.0 will feature 'AI systems evolving beyond task-driven copilots into intelligent collaborators' without qualifying this as speculative or conditional. The title frames SE 3.0 as a general paradigm shift. While Section 4 does acknowledge challenges, the vision itself is presented in sweeping, unbounded terms."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not substantively discuss alternative visions or explanations for the future of SE. Section 2.3 briefly considers autonomous software engineers as an alternative but dismisses them. No consideration is given to alternative paradigms (e.g., that SE 2.0 may improve sufficiently, or that different organizational models might address the same problems without the proposed technology stack)."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "No models are used experimentally in this paper. Model names are mentioned in passing (e.g., 'GPT 5.2', 'DeepSeek V3.2') but only as contextual references, not as models the authors run experiments with."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No prompting is performed in this paper."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No experiments are conducted in this paper."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used experimentally in this paper. The proposed technology stack describes conceptual scaffolding but does not implement or evaluate it."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The paper states its vision was 'identified based on (i) surveys of academic and gray literature, (ii) in-depth discussions with industrial and academic leaders...' (Section 1) but provides no documentation of how these inputs were collected, filtered, or synthesized. No systematic methodology is described for the literature survey or expert consultations."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations or threats-to-validity section. Section 4 discusses challenges for realizing SE 3.0, but these are open research problems for the proposed vision, not limitations of the paper's own methodology or analysis."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed. The paper does not address potential biases in its methodology (e.g., the authors' affiliation with Huawei and their vested interest in the proposed technology stack)."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the vision does NOT cover or where its claims should not be applied. Section 4.6 lists open questions 'for which we have not yet developed a thorough vision,' which is a partial acknowledgment, but no explicit scope boundaries are drawn for the overall vision."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper references surveys, discussions, and practical experience as inputs to its vision, but none of this source material is made available. The literature survey results, meeting notes, and customer feedback that informed the vision are not provided."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "Section 1 lists five sources of input (surveys, discussions, customer meetings, practical experience, industry partners) but provides no detail on how data was collected from any of these. No timeframes, instruments, or sampling methods are described."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are recruited for a study. The expert discussions mentioned are informal consultations, not a structured study with participants."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No data pipeline is documented. The transformation from the five input sources (Section 1) to the SE 3.0 vision is not described in any systematic way."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source is disclosed. The acknowledgments section includes a disclaimer that the opinions expressed do not reflect Huawei's, but it does not state who funded the research."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: three authors are from the Centre for Software Excellence at Huawei Canada, one from Queen's University, and one from York University."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "Three of the five authors are from Huawei Canada, and the paper's proposed technology stack (Compiler.next, Runtime.next, FMArts, FMware) references Huawei-affiliated work extensively. The funder (implicitly Huawei) has a direct commercial interest in the proposed SE 3.0 vision being adopted."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper. Three authors work at Huawei, and several referenced companion papers [28, 45, 98, 114] are authored by the same group and describe Huawei-related systems."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This paper does not evaluate any pre-trained model on a benchmark."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "This paper does not evaluate any pre-trained model on a benchmark."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper does not evaluate any pre-trained model on a benchmark."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human subjects study is conducted in this paper."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human subjects study is conducted in this paper."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human subjects study is conducted in this paper."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human subjects study is conducted in this paper."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human subjects study is conducted in this paper."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects study is conducted in this paper."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects study is conducted in this paper."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a vision/position paper. It does not implement or run its own method. Cost reporting is not applicable."
    281       },
    282       "compute_budget_stated": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a vision/position paper with no computational experiments of its own."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "SE 2.0 imposes high cognitive overload on developers because the human drives the code creation loop, requiring continuous problem decomposition, prompting, evaluation, and debugging.",
    292       "evidence": "Section 2.2.1 describes the programming workflow in detail and cites Vaithilingam et al. [97] on debugging 'rabbit holes'. Also cites He et al. [46] on Cursor increasing code complexity.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "AI coding assistants exhibit additive bias, leading to bloated codebases that harm long-term maintainability.",
    297       "evidence": "Section 2.2.3 cites He et al. [46] finding Cursor leads to higher code complexity and static analysis warnings. Also cites Sergeyuk et al. [89] survey on why developers reject AI suggestions.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "SE 3.0 will enable intent-centric, conversation-oriented development where AI drives the code creation loop instead of humans.",
    302       "evidence": "Section 3 describes this vision conceptually (Sections 3.1-3.6) with a proposed technology stack. No empirical evidence is provided for the vision itself.",
    303       "supported": "unsupported"
    304     },
    305     {
    306       "claim": "Curriculum engineering is more efficient than unstructured internet-scale data for training FMs.",
    307       "evidence": "Section 3.6 argues for this philosophically, citing the phi family of models [15] and InstructLab [51, 91] as examples, but provides no direct empirical comparison between curriculum-engineered and data-driven training in this paper.",
    308       "supported": "weak"
    309     },
    310     {
    311       "claim": "A multi-agent ToM-enhanced conversational system significantly improved intent clarification, requirement completeness, and human-AI alignment across 150 diverse scenarios.",
    312       "evidence": "Section 4.1 references Gallaba et al. [36] as 'complementary work' providing evidence. This is a separate paper, not results from the current paper.",
    313       "supported": "weak"
    314     },
    315     {
    316       "claim": "Runtime.next's Fusion Runtime shows approximately 30% latency improvement compared to Ray Serve.",
    317       "evidence": "Section 4.3 states 'Preliminary results show a latency improvement in the order of 30% compared to Ray Serve,' referencing the companion paper [45]. No details or methodology are provided in this paper.",
    318       "supported": "weak"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "theoretical"
    323   ],
    324   "key_findings": "This paper proposes Software Engineering 3.0 (SE 3.0), an AI-native paradigm shift from the current AI-assisted SE 2.0. The key vision is intent-centric, conversation-oriented development where AI teammates drive the code creation loop rather than human developers. The proposed technology stack includes five components: Teammate.next (personalized AI partners), IDE.next (intent-centric IDEs), Compiler.next (multi-objective code synthesis), Runtime.next (SLA-aware execution), and FM.next (curriculum-engineered models). The paper identifies six major challenges and eight additional open questions that must be addressed to realize this vision.",
    325   "red_flags": [
    326     {
    327       "flag": "Vendor-affiliated vision paper",
    328       "detail": "Three of five authors are from Huawei Canada's Centre for Software Excellence. The proposed SE 3.0 technology stack heavily references the authors' own companion papers (Compiler.next [28], FMware [45], Runtime.next [114], RAR [98], Watson [85], SPICE [70], conversational development [36]) and Huawei-related systems (FMArts, Fusion Runtime). The vision is substantially a description of Huawei's research agenda framed as a community roadmap."
    329     },
    330     {
    331       "flag": "No empirical validation of core vision",
    332       "detail": "The central claims about SE 3.0 being superior to SE 2.0 are purely speculative. All empirical results cited are from companion papers (often under review), not from this paper itself. The paper acknowledges 'the SE 3.0 vision can only be truly assessed and validated as a whole once prototypes have been developed for all components of the stack' (Section 5)."
    333     },
    334     {
    335       "flag": "No limitations section",
    336       "detail": "For a paper making sweeping claims about the future of software engineering, the absence of a limitations section discussing potential failure modes, risks, or conditions under which the vision might not hold is a significant methodological gap."
    337     },
    338     {
    339       "flag": "Unsystematic methodology for vision development",
    340       "detail": "The paper lists five sources of input (surveys, expert discussions, customer meetings, practical experience, industry partnerships) but provides no systematic methodology for how these were gathered, analyzed, or synthesized into the proposed vision. This makes the conclusions difficult to evaluate or reproduce."
    341     },
    342     {
    343       "flag": "Undisclosed conflicts of interest",
    344       "detail": "The paper includes no competing-interests statement despite three of its five authors being employed by Huawei (which would commercially benefit from adoption of the proposed stack), membership in the OPEA alliance with 40+ companies, and multiple self-citations to companion papers under review."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Does AI-Assisted Coding Deliver? A Difference-in-Differences Study of Cursor's Impact on Software Projects",
    350       "authors": ["Hao He", "Courtney Miller", "Shyam Agarwal", "Christian Kästner", "Bogdan Vasilescu"],
    351       "year": 2025,
    352       "arxiv_id": "2511.04427",
    353       "relevance": "Empirical study finding that Cursor boosts short-term velocity but increases code complexity and static analysis warnings, directly relevant to AI coding tool productivity evaluation."
    354     },
    355     {
    356       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    357       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    358       "year": 2023,
    359       "arxiv_id": "2302.06590",
    360       "relevance": "Key study on AI coding assistant productivity impact, directly relevant to the survey's assessment of empirical evidence on LLM programming tools."
    361     },
    362     {
    363       "title": "Measuring GitHub Copilot's Impact on Productivity",
    364       "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X. Alice Li", "Andrew Rice"],
    365       "year": 2024,
    366       "doi": "10.1145/3633453",
    367       "relevance": "Empirical measurement of Copilot's productivity impact, relevant to evaluating claims about AI coding assistant effectiveness."
    368     },
    369     {
    370       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    371       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    372       "year": 2024,
    373       "doi": "10.1145/3695988",
    374       "relevance": "Comprehensive systematic review of LLMs applied to software engineering tasks, directly relevant as a survey of the LLM-SE space."
    375     },
    376     {
    377       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    378       "authors": ["Agnia Sergeyuk", "Yaroslav Golubev", "Timofey Bryksin", "Iftekhar Ahmed"],
    379       "year": 2025,
    380       "doi": "10.1016/j.infsof.2024.107610",
    381       "relevance": "Survey of ~500 developers on reasons for not using AI coding assistants, providing empirical evidence on developer trust and perception issues."
    382     },
    383     {
    384       "title": "Rethinking Software Engineering in the Foundation Model Era: A Curated Catalogue of Challenges in the Development of Trustworthy FMware",
    385       "authors": ["Ahmed E. Hassan", "Dayi Lin", "Gopi Krishnan Rajbahadur"],
    386       "year": 2024,
    387       "arxiv_id": "2402.15943",
    388       "relevance": "Companion paper from the same group cataloguing challenges in foundation-model-powered software development."
    389     },
    390     {
    391       "title": "Agentic Software Engineering: Foundational Pillars and a Research Roadmap",
    392       "authors": ["Ahmed E. Hassan", "Hao Li", "Dayi Lin"],
    393       "year": 2025,
    394       "arxiv_id": "2509.06216",
    395       "relevance": "Related roadmap paper on agentic software engineering, directly relevant to the survey's scope of agentic AI in SE."
    396     },
    397     {
    398       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    399       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    400       "year": 2025,
    401       "relevance": "Open platform for AI software engineering agents, relevant to the survey's coverage of agentic coding tools."
    402     },
    403     {
    404       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    405       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"],
    406       "year": 2022,
    407       "doi": "10.1145/3491101.3519665",
    408       "relevance": "User study evaluating LLM code generation usability, finding developers can get trapped in debugging rabbit holes."
    409     },
    410     {
    411       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    412       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    413       "year": 2024,
    414       "arxiv_id": "2406.18665",
    415       "relevance": "Approach for routing requests between LLMs, relevant to the survey's coverage of efficient LLM deployment and cost optimization."
    416     },
    417     {
    418       "title": "Hidden Technical Debt in Machine Learning Systems",
    419       "authors": ["D. Sculley", "Gary Holt", "Daniel Golovin"],
    420       "year": 2015,
    421       "relevance": "Foundational paper on ML systems engineering challenges, cited by the authors as inspiration for their systems-level view of AI engineering."
    422     },
    423     {
    424       "title": "Compiler.next: A Search-Based Compiler to Power the AI-Native Future of Software Engineering",
    425       "authors": ["Filipe R. Cogo", "Gustavo A. Oliva", "Ahmed E. Hassan"],
    426       "year": 2025,
    427       "arxiv_id": "2510.24799",
    428       "relevance": "Companion paper presenting the search-based code synthesis component of the proposed SE 3.0 stack, with initial benchmark evaluation."
    429     }
    430   ]
    431 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs