scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21587B)
      1 {
      2   "paper": {
      3     "title": "Agentic AI for Software: thoughts from Software Engineering community",
      4     "authors": [
      5       "Abhik Roychoudhury"
      6     ],
      7     "year": 2025,
      8     "venue": "arXiv preprint",
      9     "arxiv_id": "2508.17343"
     10   },
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "The paper explicitly states 'Code — N.A.' in the header. No repository or code archive is provided."
     17       },
     18       "data_released": {
     19         "applies": false,
     20         "answer": false,
     21         "justification": "This is a position/perspective paper that does not use or produce any dataset. The header lists 'Datasets — SWE-bench' but the paper runs no experiments on SWE-bench — it merely discusses the benchmark as context for the agentic AI landscape. For a theoretical/position paper, data release is structurally inapplicable."
     22       },
     23       "environment_specified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "This is a position/perspective paper with no original experiments — no code to run and no environment to specify."
     27       },
     28       "reproduction_instructions": {
     29         "applies": false,
     30         "answer": false,
     31         "justification": "This is a position/perspective paper with no original experiments, so there are no results to reproduce."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "This is a position/perspective paper that presents no new quantitative experiments or statistical results."
     39       },
     40       "significance_tests": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "No comparative statistical claims are made; the paper is a conceptual discussion without original empirical data."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "No quantitative effect measurements are reported; the paper is a conceptual position paper."
     49       },
     50       "sample_size_justified": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No new data collection or sample is involved; the paper is theoretical/position in nature."
     54       },
     55       "variance_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No experimental runs are conducted; the paper presents no numerical results requiring variance reporting."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "This is a position/perspective paper; it discusses various systems (Devin, SWE-agent, OpenHands, AutoCodeRover) but does not conduct any comparative evaluation of its own."
     66       },
     67       "baselines_contemporary": {
     68         "applies": false,
     69         "answer": false,
     70         "justification": "No baseline comparison is conducted in this position paper."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "No experiments are conducted in this position paper, so ablation studies are not applicable."
     76       },
     77       "multiple_metrics": {
     78         "applies": false,
     79         "answer": false,
     80         "justification": "No empirical evaluation is conducted; the paper is a position/perspective piece."
     81       },
     82       "human_evaluation": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No evaluation of system outputs is conducted; the paper is a position/perspective piece."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No new experiments are run; the paper does not collect or split any datasets."
     91       },
     92       "per_category_breakdown": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "No empirical evaluation is conducted that would yield per-category results."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The paper briefly discusses trust deficit and overfitting in program repair, but does not present concrete failure cases of the systems it discusses (AutoCodeRover, Devin, etc.). There is no systematic error analysis."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The paper acknowledges challenges (overfitting, trust deficit) in passing but does not report any negative experimental results since no experiments are conducted."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The abstract claims AI agents have 'shown significant promise in software engineering' and discusses AutoCodeRover's integration into SonarQube as evidence of real-life usage, but no quantitative results or benchmarks are provided to support the claim of 'significant promise.' The paper relies on citing external benchmarks without presenting them."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper makes implicit causal claims — e.g., 'working on program representations' improves trust and enables better intent inference — but provides no controlled evidence. The claim that intent inference 'combats the overfitting problem' from program repair is asserted without empirical support in this paper."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper makes broad generalizations about 'agentic AI' in software engineering from discussion of a single system (AutoCodeRover) on a single benchmark (SWE-bench). The title 'thoughts from Software Engineering community' implies broad community consensus, but the paper represents one researcher's perspective."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper presents the intent-inference approach as the key direction without seriously considering alternative explanations for AutoCodeRover's success or alternative architectural approaches that might achieve the same goals through different means."
    128       }
    129     },
    130     "setup_transparency": {
    131       "model_versions_specified": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "This is a position paper with no new experiments; no LLM API calls are made by the paper's authors, so no model version specifications are needed."
    135       },
    136       "prompts_provided": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "This is a position paper that does not run LLM experiments; no prompts are used by the paper itself."
    140       },
    141       "hyperparameters_reported": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No experiments are conducted in this position paper, so no hyperparameters are relevant."
    145       },
    146       "scaffolding_described": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper describes AutoCodeRover's approach at a conceptual level (code search, fault localization, patch generation stages) but does not provide sufficient detail about the agent scaffolding — tool descriptions, retry logic, feedback mechanisms, or context management are not specified."
    150       },
    151       "data_preprocessing_documented": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No new data is collected or preprocessed; the paper references publicly available datasets used by the systems it discusses."
    155       }
    156     },
    157     "limitations_and_scope": {
    158       "limitations_section_present": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "There is no dedicated limitations or threats-to-validity section. The paper acknowledges challenges (trust deficit, vulnerability risks) inline, but these are not organized as a substantive limitations discussion."
    162       },
    163       "threats_to_validity_specific": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No specific threats to validity are discussed. The paper does not acknowledge limitations of SWE-bench as an evaluation benchmark, potential biases in AutoCodeRover's design choices, or limitations of the intent-inference paradigm."
    167       },
    168       "scope_boundaries_stated": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The paper does not state explicit scope boundaries. It does not clarify what claims it is not making — e.g., it does not state whether the argument applies to all programming languages, problem types, or agent architectures beyond the ones discussed."
    172       }
    173     },
    174     "data_integrity": {
    175       "raw_data_available": {
    176         "applies": false,
    177         "answer": false,
    178         "justification": "Position paper with no original data to verify. The paper references SWE-bench but does not use it in any experiment. There is no 'underlying data' produced by this paper that could be independently verified."
    179       },
    180       "data_collection_described": {
    181         "applies": false,
    182         "answer": false,
    183         "justification": "This position paper does not collect new data; all data references are to external public benchmarks."
    184       },
    185       "recruitment_methods_described": {
    186         "applies": false,
    187         "answer": false,
    188         "justification": "No participants or samples are recruited; this is a position paper discussing concepts and referencing published systems."
    189       },
    190       "data_pipeline_documented": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No data pipeline exists in this position paper; results from referenced systems are cited without the authors running their own pipeline."
    194       }
    195     },
    196     "conflicts_of_interest": {
    197       "funding_disclosed": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants, institutional support, or sponsors."
    201       },
    202       "affiliations_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper discloses the author's dual affiliation: 'Full-time involvement as Professor at NUS, while being Senior Advisor at SonarSource SA.' This is relevant given that AutoCodeRover has been integrated into SonarQube, a SonarSource product."
    206       },
    207       "funder_independent_of_outcome": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The author discloses being a Senior Advisor at SonarSource SA, the company that produces SonarQube, into which AutoCodeRover has been integrated. This creates a non-independent relationship between the author's commercial affiliation and the paper's positive portrayal of AutoCodeRover/SonarQube. No independent funder is disclosed."
    211       },
    212       "financial_interests_declared": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No competing interests statement is provided. The author's role as Senior Advisor at SonarSource (which commercializes SonarQube, the product that integrates AutoCodeRover) is disclosed as affiliation but not explicitly framed as a financial interest or conflict of interest."
    216       }
    217     },
    218     "contamination": {
    219       "training_cutoff_stated": {
    220         "applies": false,
    221         "answer": false,
    222         "justification": "This is a position/perspective paper that does not benchmark any pre-trained model's capability; it discusses systems at a conceptual level without running benchmark evaluations itself."
    223       },
    224       "train_test_overlap_discussed": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "No benchmark evaluation of pre-trained models is conducted in this paper; contamination is not applicable."
    228       },
    229       "benchmark_contamination_addressed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "The paper references SWE-bench as an existing benchmark but does not run its own evaluations; contamination is not applicable."
    233       }
    234     },
    235     "human_studies": {
    236       "pre_registered": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No human subjects study is conducted; this is a position paper."
    240       },
    241       "irb_or_ethics_approval": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human subjects study is conducted; IRB approval is not applicable."
    245       },
    246       "demographics_reported": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants are involved in this position paper."
    250       },
    251       "inclusion_exclusion_criteria": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participant selection is involved in this position paper."
    255       },
    256       "randomization_described": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No experimental study with human participants is conducted."
    260       },
    261       "blinding_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No experimental study with human participants is conducted."
    265       },
    266       "attrition_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved; attrition is not applicable."
    270       }
    271     },
    272     "cost_and_practicality": {
    273       "inference_cost_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is a position/perspective paper and does not run its own experiments; cost reporting is not applicable."
    277       },
    278       "compute_budget_stated": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a position/perspective paper with no computational experiments; compute budget is not applicable."
    282       }
    283     }
    284   },
    285   "claims": [
    286     {
    287       "claim": "Agentic AI represents the second major shift in the software industry in fifty years, comparable in significance to the SaaS revolution.",
    288       "evidence": "The paper provides a historical narrative (Historical Retrospective section) but no quantitative evidence for this comparative significance claim.",
    289       "supported": "weak"
    290     },
    291     {
    292       "claim": "Working on program representations (rather than treating code as text) is the key to building effective AI software engineering agents, enabling intent inference.",
    293       "evidence": "The paper provides conceptual argumentation and cites AutoCodeRover's design as evidence, referencing its integration into SonarQube. However, no controlled comparison against text-based agents is presented in this paper itself.",
    294       "supported": "weak"
    295     },
    296     {
    297       "claim": "AutoCodeRover's intent-inference approach via code search combats the overfitting problem seen in test-based automated program repair.",
    298       "evidence": "The paper draws an analogy from the program repair literature (Le Goues et al. 2019) and claims intent inference addresses overfitting, but no new experimental evidence is provided to support this specific causal claim.",
    299       "supported": "weak"
    300     },
    301     {
    302       "claim": "AutoCodeRover has been integrated into the widely used SonarQube tool for enterprise code quality and security.",
    303       "evidence": "Stated directly in the paper ('has now been integrated into the widely used SonarQube tool') with citation to Gaudin and Mallet 2010. The author's SonarSource affiliation corroborates this, though neither user adoption metrics nor independent confirmation is provided.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Agentic AI can be used for formal verification of code by interpreting formal proof representations and communicating with theorem provers.",
    308       "evidence": "This is presented as a future research direction with a citation to Cadar and Roychoudhury 2025, not as a demonstrated result. No experimental evidence is provided.",
    309       "supported": "weak"
    310     }
    311   ],
    312   "methodology_tags": [
    313     "theoretical",
    314     "case-study"
    315   ],
    316   "key_findings": "This position paper argues that intent inference — using program analysis on program representations rather than treating code as text — is the key capability distinguishing effective AI software engineering agents from simpler code assistants. The AutoCodeRover system, which performs code search and fault localization to infer intended program behavior, is presented as the primary exemplar of this approach and has been integrated into SonarQube. The paper posits that future agentic workflows will need to include AI-based verification and validation (V&V) to address the 'trust deficit' created by the explosion of automatically generated code.",
    317   "red_flags": [
    318     {
    319       "flag": "Conflict of interest not declared",
    320       "detail": "The author discloses being a Senior Advisor at SonarSource SA (the company behind SonarQube) while writing positively about AutoCodeRover's integration into SonarQube. This commercial relationship is disclosed as affiliation but not explicitly acknowledged as a potential conflict of interest that could bias the paper's conclusions."
    321     },
    322     {
    323       "flag": "Claims outrun evidence",
    324       "detail": "The paper makes broad claims about agentic AI representing a paradigm shift and about intent inference being the key to effective AI software engineering agents, but presents no original empirical evidence. All supporting evidence comes from referenced external work, and no comparative evaluation is conducted."
    325     },
    326     {
    327       "flag": "Self-promotion without independent evaluation",
    328       "detail": "The paper prominently discusses AutoCodeRover and SpecRover, both developed by the author's research group. The positive portrayal is not supported by independent evaluation or comparison with competing approaches on equal footing in this paper."
    329     },
    330     {
    331       "flag": "No limitations section",
    332       "detail": "A position paper advocating a particular approach (intent inference via program analysis) should acknowledge its limitations and scope boundaries. The paper does not discuss when this approach might fail, its computational overhead, or settings where simpler text-based approaches might be adequate."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    338       "authors": [
    339         "Carlos E. Jimenez"
    340       ],
    341       "year": 2024,
    342       "relevance": "The primary benchmark used to evaluate AI software engineering agents, central to the paper's discussion of issue resolution as a key challenge."
    343     },
    344     {
    345       "title": "AutoCodeRover: Autonomous Program Improvement",
    346       "authors": [
    347         "Yuntong Zhang",
    348         "Haifeng Ruan",
    349         "Zhiyu Fan",
    350         "Abhik Roychoudhury"
    351       ],
    352       "year": 2024,
    353       "relevance": "The primary system discussed in this paper as an exemplar of intent-inference-based agentic software engineering."
    354     },
    355     {
    356       "title": "SpecRover: Code Intent Extraction via LLMs",
    357       "authors": [
    358         "Haifeng Ruan",
    359         "Yuntong Zhang",
    360         "Abhik Roychoudhury"
    361       ],
    362       "year": 2025,
    363       "relevance": "Follow-up to AutoCodeRover making intent inference more explicit via LLM-based specification extraction."
    364     },
    365     {
    366       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    367       "authors": [
    368         "John Yang"
    369       ],
    370       "year": 2024,
    371       "relevance": "A prominent AI software engineering agent that the paper compares conceptually against intent-inference approaches."
    372     },
    373     {
    374       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    375       "authors": [
    376         "Xingyao Wang"
    377       ],
    378       "year": 2025,
    379       "relevance": "Open-source AI software engineering agent platform discussed as a representative tool-use approach."
    380     },
    381     {
    382       "title": "Automated Program Repair",
    383       "authors": [
    384         "Claire Le Goues",
    385         "Michael Pradel",
    386         "Abhik Roychoudhury"
    387       ],
    388       "year": 2019,
    389       "relevance": "Survey of automated program repair techniques that provides foundational context for the intent-inference approach discussed."
    390     },
    391     {
    392       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    393       "authors": [
    394         "Hammond Pearce"
    395       ],
    396       "year": 2025,
    397       "relevance": "Empirical study on security vulnerabilities in AI-generated code, used to motivate the need for V&V of AI-generated code."
    398     },
    399     {
    400       "title": "RepoAudit: An Autonomous LLM-Agent for Repository-Level Code Auditing",
    401       "authors": [
    402         "Jinyao Guo",
    403         "Chengpeng Wang",
    404         "Xiangzhe Xu",
    405         "Zian Su",
    406         "Xiangyu Zhang"
    407       ],
    408       "year": 2025,
    409       "relevance": "Recent work on LLM-based security auditing of code repositories, cited as evidence of agentic AI for V&V."
    410     },
    411     {
    412       "title": "AI for Program Verification",
    413       "authors": [
    414         "Cristian Cadar",
    415         "Abhik Roychoudhury"
    416       ],
    417       "year": 2025,
    418       "relevance": "Discusses using AI for formal program verification, cited as a future direction for agentic AI in software engineering."
    419     }
    420   ]
    421 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs