ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18317B)


      1 {
      2   "paper": {
      3     "title": "Agentic AI Software Engineers: Programming with Trust",
      4     "authors": [
      5       "Abhik Roychoudhury",
      6       "Corina Păsăreanu",
      7       "Michael Pradel",
      8       "Baishakhi Ray"
      9     ],
     10     "year": 2025,
     11     "venue": "ACM (opinion piece, September 2025)",
     12     "arxiv_id": "2502.13767"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": false,
     18         "answer": false,
     19         "justification": "This is a 5-page opinion/position piece with no experiments or implementation. There is no code to release."
     20       },
     21       "data_released": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "No dataset is collected or used in this opinion piece. The criterion does not apply."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No computational experiments are conducted, so there is no environment or dependencies to specify."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "No experiments are run. There is nothing to reproduce."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a theoretical opinion piece with no empirical results or numerical measurements."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative empirical claims are made that would require significance testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical results are reported; effect sizes are not applicable."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No samples or experimental participants are used; the criterion does not apply."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted; variance reporting is not applicable."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a position paper with no experimental evaluation; baselines are not applicable."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No baselines are used in this opinion piece."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system is built and evaluated; ablation studies are not applicable."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No evaluation metrics are reported in this theoretical opinion piece."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No empirical evaluation of any system is conducted."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No experiments or test sets exist in this position paper."
     94       },
     95       "per_category_breakdown": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No empirical results to break down by category."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "This is a pure position/opinion paper with no system built or evaluated. Failure cases are structurally inapplicable — there is no approach whose failures could be examined. The scan agent instructions state that for theoretical/position papers, 'Most empirical checklist items will have applies: false.'"
    104       },
    105       "negative_results_reported": {
    106         "applies": false,
    107         "answer": false,
    108         "justification": "No experiments are run from which negative results could be reported."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract frames this as an opinion piece commenting on how LLM agents can enable trustworthy AI software engineering. The body of the paper provides conceptual discussion consistent with this framing; no empirical claims in the abstract are contradicted by the body."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal empirical claims. It argues normatively (what should be done) rather than causally (X causes Y in measured data)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad claims about 'AI software engineers' and 'future development workflows' without bounding these to specific systems, languages, or settings. The paper acknowledges it is an opinion piece but does not bound the scope of its prescriptions."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": false,
    129         "answer": false,
    130         "justification": "The paper presents no empirical results, so there are no alternative explanations of observed data to discuss. The criterion is not applicable."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No models are used in experiments. This is a position piece that discusses models conceptually."
    138       },
    139       "prompts_provided": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No prompting experiments are conducted."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No computational experiments are run."
    148       },
    149       "scaffolding_described": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No new agentic system is built or evaluated in this paper; scaffolding description is not applicable."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No data is collected or preprocessed."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "There is no dedicated limitations or threats-to-validity section. The paper is 5 pages long and ends with an 'Outlook' section that does not address limitations of the arguments made."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No threats to validity are discussed. The paper does not acknowledge limitations of its conceptual arguments, such as the thin evidentiary basis for the claim that trust is the key barrier to adoption."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what its arguments do not cover. It speaks broadly about 'AI software engineers' and 'future development workflows' without bounding the scope to specific domains, model types, or organizational contexts."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": false,
    180         "answer": false,
    181         "justification": "No data is collected. This is an opinion piece."
    182       },
    183       "data_collection_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No data collection is performed."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No participants or samples are recruited."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No data pipeline exists in this position paper."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No funding or acknowledgments section is present in the paper. A footnote notes that one author (Roychoudhury) is 'Senior Advisor at SonarSource', but no funding disclosure is made."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are clearly listed on the first page: National University of Singapore, Carnegie Mellon University / KBR Inc. / NASA Ames, CISPA Helmholtz Center / University of Stuttgart, and Columbia University."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding is disclosed. Additionally, one author (Roychoudhury) is identified as Senior Advisor at SonarSource (which acquired AutoCodeRover, a system discussed favorably in the paper), creating a potential conflict of interest that is not declared as such."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No competing interests statement is present. Notably, one author is Senior Advisor at SonarSource, which acquired AutoCodeRover — a system prominently featured and described favorably in the paper — but this relationship is not disclosed as a potential conflict of interest."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This is a position paper; no pre-trained model is evaluated on a benchmark. Contamination questions are not applicable."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is conducted."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation is conducted."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this opinion piece."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human subjects research is conducted."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a theoretical opinion piece; cost reporting is not applicable."
    280       },
    281       "compute_budget_stated": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No computational experiments are run."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "The key barrier to AI adoption in software engineering is trust — developers are wondering if they can trust AI systems.",
    291       "evidence": "Attributed to a Forbes blog post by behavioral scientist Lindsay Kohler (reference [3]). Section 1, p.1.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "LLM agents (as opposed to raw LLMs) offer a path toward integrating code generation with analysis tools to increase trust.",
    296       "evidence": "Conceptual argument in Sections 3-4. The paper describes three aspects of LLM agents (LLMs as backends, tool interaction, autonomy) and outlines trust mechanisms (testing, formal proofs, guardrails). No empirical support is provided.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Developer hesitation in accepting AI-generated code comes from the volume of code that can be quickly generated, overwhelming human developers.",
    301       "evidence": "Described as 'real-life anecdotes communicated by clients' of the AutoCodeRover team (Section 4). Explicitly acknowledged as anecdotal.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Programming with AI will shift from scale to trust, requiring effective delegation between human and AI.",
    306       "evidence": "Stated as a normative argument in the Outlook section (Section 5). No empirical evidence provided.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": [
    311     "theoretical"
    312   ],
    313   "key_findings": "This is a 5-page opinion piece arguing that trust — both technical and human — is the primary barrier to deploying AI software engineers. The authors propose LLM agents augmented with testing, formal verification, guardrails, and explainability mechanisms as the path to establishing this trust. The paper surveys existing SE agents (Devin, RepairAgent, AutoCodeRover, SWE-agent) at a high level and calls for review-parity practices (treating AI-generated code the same as human contributions). No new empirical results are presented.",
    314   "red_flags": [
    315     {
    316       "flag": "Undisclosed conflict of interest",
    317       "detail": "Author Roychoudhury is identified in a footnote as 'Senior Advisor at SonarSource', which acquired AutoCodeRover — a system described favorably multiple times in the paper. This relationship is not disclosed in a conflicts-of-interest statement, and no competing interests section exists."
    318     },
    319     {
    320       "flag": "Anecdote cited as evidence",
    321       "detail": "In Section 4, the claim about developer hesitation due to overwhelming code volume is supported only by 'real-life anecdotes communicated by clients' of AutoCodeRover — explicitly acknowledged as anecdotal, but then used to motivate design recommendations without empirical validation."
    322     },
    323     {
    324       "flag": "Trust barrier claim relies on non-peer-reviewed source",
    325       "detail": "The central premise — that 'the key barrier to AI adoption is trust' — is supported solely by a Forbes newsletter/blog post (reference [3]), not empirical research."
    326     },
    327     {
    328       "flag": "Broad generalizations without scope bounding",
    329       "detail": "The paper makes sweeping prescriptions about 'AI software engineers' and 'future development workflows' without specifying which domains, languages, organizational sizes, or model types these recommendations apply to."
    330     }
    331   ],
    332   "cited_papers": [
    333     {
    334       "title": "RepairAgent: An autonomous, LLM-based agent for program repair",
    335       "authors": [
    336         "Islem Bouzenia",
    337         "Premkumar Devanbu",
    338         "Michael Pradel"
    339       ],
    340       "year": 2025,
    341       "relevance": "An LLM agent for automated program repair, published at ICSE 2025; directly relevant to the agentic AI SE survey."
    342     },
    343     {
    344       "title": "Towards Neural Synthesis for SMT-assisted Proof-Oriented Programming",
    345       "authors": [
    346         "Saikat Chakraborty",
    347         "Gabriel Ebner",
    348         "Siddharth Bhat",
    349         "Sarah Fakhoury",
    350         "Sakina Fatima",
    351         "Shuvendu Lahiri",
    352         "Nikhil Swamy"
    353       ],
    354       "year": 2025,
    355       "relevance": "Combines LLMs with formal verification (proof-oriented programming); relevant to trustworthy AI code generation."
    356     },
    357     {
    358       "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?",
    359       "authors": [
    360         "Samuel Miserendino",
    361         "Michele Wang",
    362         "Tejal Patwardhan",
    363         "Johannes Heidecke"
    364       ],
    365       "year": 2025,
    366       "arxiv_id": "2502.12115",
    367       "relevance": "Benchmark evaluating frontier LLMs on real-world SE tasks; highly relevant to AI SE capability assessment."
    368     },
    369     {
    370       "title": "A systematic review on fostering appropriate trust in Human-AI interaction: Trends, opportunities and challenges",
    371       "authors": [
    372         "Siddharth Mehrotra",
    373         "Chadha Degachi",
    374         "Oleksandra Vereschak",
    375         "Catholijn M Jonker",
    376         "Myrthe L Tielman"
    377       ],
    378       "year": 2024,
    379       "relevance": "Systematic review on human-AI trust; relevant to the human factors dimension of AI SE adoption."
    380     },
    381     {
    382       "title": "SpecRover: Code Intent Extraction via LLMs",
    383       "authors": [
    384         "Haifeng Ruan",
    385         "Yuntong Zhang",
    386         "Abhik Roychoudhury"
    387       ],
    388       "year": 2025,
    389       "relevance": "LLM-based specification inference for code explainability; relevant to trustworthy agentic SE."
    390     },
    391     {
    392       "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM",
    393       "authors": [
    394         "Gabriel Ryan",
    395         "Siddhartha Jain",
    396         "Mingyue Shang",
    397         "Shiqi Wang",
    398         "Xiaofei Ma",
    399         "Murali Krishna Ramanathan",
    400         "Baishakhi Ray"
    401       ],
    402       "year": 2024,
    403       "relevance": "Empirical study of LLM-based test generation; directly relevant to automated software engineering quality."
    404     },
    405     {
    406       "title": "SWE-agent: Agent-computer Interfaces Enable Automated Software Engineering",
    407       "authors": [
    408         "John Yang",
    409         "Carlos E. Jimenez",
    410         "Alexander Wettig",
    411         "Kilian Lieret",
    412         "Shunyu Yao",
    413         "Karthik Narasimhan",
    414         "Ofir Press"
    415       ],
    416       "year": 2024,
    417       "relevance": "Major LLM agent for SE, evaluated on SWE-bench; a key system in the agentic AI SE landscape."
    418     },
    419     {
    420       "title": "AutoCodeRover: Autonomous Program Improvement",
    421       "authors": [
    422         "Yuntong Zhang",
    423         "Haifeng Ruan",
    424         "Zhiyu Fan",
    425         "Abhik Roychoudhury"
    426       ],
    427       "year": 2024,
    428       "relevance": "LLM agent combining program analysis with autonomous issue resolution; prominent system in agentic SE."
    429     }
    430   ]
    431 }

Impressum · Datenschutz