scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23482B)
      1 {
      2   "paper": {
      3     "title": "Automated structural testing of LLM-based agents: methods, framework, and case studies",
      4     "authors": [
      5       "Jens Kohl",
      6       "Otto Kruse",
      7       "Youssef Mostafa",
      8       "Andre Luckow",
      9       "Karsten Schroer",
     10       "Thomas Riedl",
     11       "Ryan French",
     12       "David Katz",
     13       "Manuel P. Luitz",
     14       "Tanrajbir Takher",
     15       "Ken E. Friedl",
     16       "Céline Laurent-Winter"
     17     ],
     18     "year": 2026,
     19     "venue": "arXiv preprint",
     20     "arxiv_id": "2601.18827"
     21   },
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper states 'We provide an open-source implementation under Apache 2 license at GitHub' (Section VI) and mentions a GitHub link for a travel agent use case (Section V). An open source reference implementation is provided."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The case studies involve proprietary BMW driver assistance and AWS cloud incident agents. No datasets are released. The travel agent example on GitHub is a reference implementation, but the case study data (customer data, vehicle data, cloud incident data) are not released."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'Amazon Bedrock Converse API' and 'Generative AI Toolkit' but provides no requirements.txt, Dockerfile, or detailed environment/dependency specifications with library versions."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper references a GitHub repository and provides code listings, but no step-by-step reproduction instructions are included in the paper itself. The code snippets are illustrative rather than complete runnable instructions."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "The paper presents a methods/framework contribution with qualitative case studies. No quantitative experimental results with numerical metrics are reported, so confidence intervals are not applicable."
     50       },
     51       "significance_tests": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No quantitative comparisons or claims of statistically significant differences are made. The evaluation is entirely qualitative."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No quantitative effect sizes are reported. The paper describes qualitative benefits observed during case studies, not measured effects."
     60       },
     61       "sample_size_justified": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "No quantitative experiments with sample sizes are conducted. The paper is a methods/framework paper with qualitative case study observations."
     65       },
     66       "variance_reported": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No quantitative experiments are run, so there is no variance to report. The evaluation is qualitative."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper does not compare its structural testing approach against any baseline testing method (e.g., acceptance-only testing) with quantitative metrics. It discusses acceptance testing conceptually as the current approach but provides no comparative evaluation."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No baselines are included in the evaluation, so contemporaneity is not assessable. The paper lists existing testing frameworks (Section II) but does not compare against them experimentally."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The framework has multiple components (traces, mocking, assertions) but no ablation study examines the individual contribution of each component to testing effectiveness."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The paper reports no quantitative evaluation metrics. Benefits are described qualitatively (e.g., 'higher observability', 'faster root cause analysis', 'higher test coverage') without measurement."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "The paper claims qualitative benefits observed by developers but reports no structured human evaluation, user study, or systematic developer feedback collection."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "This is a methods/framework paper, not a benchmark evaluation. There is no dataset to split into training and test sets."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No per-category or per-case-study quantitative breakdown is provided. Results are described qualitatively across case studies without numeric comparisons."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "The paper does not discuss cases where structural testing failed, was insufficient, or produced false negatives/positives. Only successful applications are shown."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "No negative results or unsuccessful attempts are reported. Every described application of the methods is presented as beneficial."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract claims 'these methods reduce testing costs and improve agent quality through higher coverage, reusability, and earlier defect detection.' These are stated as accomplished facts, but the paper's Section VI acknowledges the evaluation is qualitative and a 'large-scale quantitative study' is needed. The abstract overstates what the evidence shows."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes causal claims such as 'structural testing enables... faster root-cause analysis' and 'reduce testing costs.' These are based on qualitative observations from a few case studies with no controlled comparison, which is insufficient for causal inference. The paper itself acknowledges this limitation in Section VI."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title and abstract present the methods as general-purpose for 'LLM-based agents,' but the case studies are limited to Amazon Bedrock Converse API with BMW/AWS use cases. The paper mentions migration to other environments 'is fairly straightforward' (Section I) but this is untested. Generalization is not adequately bounded."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for the observed qualitative benefits. For example, the benefits could stem from the increased test effort itself rather than structural testing specifically, or from developer familiarity with the systems. No such alternatives are considered."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper mentions 'Amazon Bedrock Converse API' but does not specify which LLM model versions were used in the case studies. No specific model names, versions, or snapshot dates are given."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper does not provide the system prompts or user prompts used with the LLM-based agents in the case studies. Code listings show test case inputs but not the agent prompts themselves."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the case study agents."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The paper describes the agent architecture in detail (Section I, Fig. 1), including the agent core, perception module, brain module, action module, tools, memory, and knowledge bases. The driver assistance agent's RAG setup is shown in Fig. 3, and the root cause analysis agent's architecture is shown in Fig. 5."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No data preprocessing steps are described. The case studies reference tools and databases but do not document how data was prepared or preprocessed for the agents."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section VI states 'We acknowledge two limitations' and describes them: the evaluation is qualitative rather than quantitative, and the implementation is tied to Amazon Bedrock Converse API."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The two limitations are specific to this study: (1) 'our evaluation is qualitative; a large-scale quantitative study comparing development with and without structural testing would provide stronger evidence of benefits' and (2) 'our implementation uses Amazon Bedrock Converse API as a unified interface to multiple LLM.' These are specific rather than boilerplate."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges the need for quantitative study but does not bound the claims to specific agent types, domains, or scales. The general framing of 'LLM-based agents' in the title is not narrowed to the two proprietary case studies actually tested."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No raw data from the case studies is available. The case studies involve proprietary BMW and AWS systems, and no test traces, results, or data are released."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The paper describes the case study setups but does not detail how the qualitative observations from developers were collected. Section V.C mentions 'we observed several qualitative benefits for developers' without describing the observation methodology."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants are recruited for a study. The developers mentioned are the authors' own team members, not study participants. This is a framework paper, not a human subjects study."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No data pipeline is documented. The qualitative observations are not systematically collected or documented with any methodology."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source is disclosed. The paper has an Acknowledgements section but it only thanks individuals, not funding sources. Authors are from BMW Group and Amazon Web Services but no explicit funding disclosure is made."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: BMW Group, Munich, Germany and Amazon Web Services. These are shown directly under the author names."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "Multiple authors are affiliated with Amazon Web Services, and the paper promotes Amazon Bedrock Converse API and the Generative AI Toolkit (an AWS product). AWS has a financial interest in demonstrating the value of agent testing frameworks built on their platform. This represents a non-independent funder/employer."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is provided. Authors work for BMW and AWS, both of which have commercial interests in the agent development ecosystem being promoted."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It presents a testing framework with case studies; no model performance benchmarking is involved."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation is conducted, so train/test overlap is not applicable."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No benchmark evaluation is conducted, so contamination is not applicable."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human subjects study is conducted. The paper presents a framework with developer-authored case studies, not a study involving human participants."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human subjects study is conducted."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study is conducted."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study is conducted."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study is conducted."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study is conducted."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human subjects study is conducted."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper discusses cost savings from mocking (avoiding LLM API costs) and structural testing (fewer expensive acceptance tests) but reports no actual cost figures, latency measurements, or per-example costs."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No computational budget is stated. The paper does not report hardware used, total API spend, or execution times for the case studies."
    293       }
    294     }
    295   },
    296   "claims": [
    297     {
    298       "claim": "Structural testing methods (traces, mocking, assertions) enable the test automation pyramid, regression testing, test-driven development, and multi-language testing for LLM-based agents.",
    299       "evidence": "Sections III and IV describe the methods conceptually with code examples in Section V showing test cases for driver assistance and cloud incident agents. Demonstrated via code listings (Listings 1-2) and architectural diagrams (Figs. 2-5).",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Structural testing reduces testing costs and improves agent quality through higher coverage, reusability, and earlier defect detection.",
    304       "evidence": "Section V.C reports qualitative observations from developers: 'higher observability,' 'fast root cause analysis,' 'higher quality from the first deployments.' However, no quantitative evidence is provided. The paper itself acknowledges this limitation in Section VI.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "Mocking enables reproducible and faster LLM testing while saving costs compared to cloud-based LLM usage.",
    309       "evidence": "Section III.A.3 describes the mocking approach conceptually. No quantitative speedup or cost comparison is provided.",
    310       "supported": "weak"
    311     },
    312     {
    313       "claim": "Traces based on OpenTelemetry facilitate detailed observability and root cause analysis for LLM-based agents.",
    314       "evidence": "Section III.A.1 describes the tracing approach. Section V.B describes how traces enable root cause analysis for cloud incidents. Evidence is from qualitative case study observations.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "case-study"
    320   ],
    321   "key_findings": "The paper proposes three technical methods for structural testing of LLM-based agents: OpenTelemetry-based tracing to capture agent trajectories, LLM mocking for reproducible test conditions, and trace-based assertions for automated verification. These methods enable adapting software engineering best practices (test automation pyramid, TDD, regression testing) to LLM agent development. The evaluation consists of two qualitative case studies at BMW (driver assistance agent) and AWS (cloud incident root cause analysis agent), with the authors acknowledging that quantitative evidence of benefits is needed.",
    322   "red_flags": [
    323     {
    324       "flag": "No quantitative evaluation",
    325       "detail": "All claimed benefits (reduced costs, higher coverage, faster root cause analysis) are stated as qualitative observations without any measurement or comparison. The paper acknowledges this limitation but the abstract presents these as established facts."
    326     },
    327     {
    328       "flag": "Conflict of interest: AWS employees promoting AWS tools",
    329       "detail": "Multiple authors are from Amazon Web Services, and the paper heavily promotes Amazon Bedrock Converse API and the Generative AI Toolkit. The open-source implementation is specifically built for AWS services. Although author affiliations are listed, no competing-interests statement discloses or acknowledges this conflict."
    330     },
    331     {
    332       "flag": "Abstract overclaims relative to evidence",
    333       "detail": "The abstract states 'these methods reduce testing costs and improve agent quality' as accomplished facts, but the evidence is limited to qualitative impressions from the authors' own teams with no controlled comparison."
    334     },
    335     {
    336       "flag": "No failure cases or negative results",
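    337       "detail": "Every application of the methods is presented as successful. Beyond the two limitations acknowledged in Section VI (qualitative evaluation, reliance on Amazon Bedrock Converse API), no weaknesses of the structural testing approach itself, scenarios where it fails, or cases where acceptance testing outperforms structural testing are discussed."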
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "From LLMs to LLM-based agents for software engineering: a survey of current, challenges and future",
    343       "authors": ["H. Jin", "L. Huang", "H. Cai", "J. Yan", "B. Li", "H. Chen"],
    344       "year": 2024,
    345       "arxiv_id": "2408.02479",
    346       "relevance": "Survey of LLM-based agents for software engineering, directly relevant to understanding the scope of agent testing needs."
    347     },
    348     {
    349       "title": "Why do multi-agent LLM systems fail?",
    350       "authors": ["M. Cemri"],
    351       "year": 2025,
    352       "arxiv_id": "2503.13657",
    353       "relevance": "Taxonomy of failures in multi-agent LLM systems, directly relevant to understanding what testing should catch."
    354     },
    355     {
    356       "title": "Defining and detecting the defects of the large language model-based autonomous agents",
    357       "authors": ["K. Ning"],
    358       "year": 2024,
    359       "arxiv_id": "2412.18371",
    360       "relevance": "Defines defect types in LLM-based agents, relevant to agent quality evaluation."
    361     },
    362     {
    363       "title": "Survey on evaluation of LLM-based agents",
    364       "authors": ["A. Yehudai"],
    365       "year": 2025,
    366       "arxiv_id": "2503.16416",
    367       "relevance": "Survey of agent evaluation methods, directly relevant to understanding the landscape of agent testing approaches."
    368     },
    369     {
    370       "title": "TrustAgent: towards safe and trustworthy LLM-based agents",
    371       "authors": ["W. Hua", "X. Yang", "M. Jin", "Z. Li", "W. Cheng", "R. Tang", "Y. Zhang"],
    372       "year": 2024,
    373       "arxiv_id": "2402.01586",
    374       "relevance": "Addresses safety and trustworthiness of LLM-based agents, relevant to agent quality assurance."
    375     },
    376     {
    377       "title": "Software testing of generative AI systems: challenges and opportunities",
    378       "authors": ["A. Aleti"],
    379       "year": 2023,
    380       "relevance": "Discusses challenges and opportunities in testing generative AI systems, directly relevant to the survey scope of AI software quality."
    381     },
    382     {
    383       "title": "Generative AI Toolkit – a framework for increasing the quality of LLM-based applications over their whole life cycle",
    384       "authors": ["J. Kohl"],
    385       "year": 2024,
    386       "arxiv_id": "2412.14215",
    387       "relevance": "Prior work by same authors on LLM application quality framework, the basis for the testing framework described in this paper."
    388     },
    389     {
    390       "title": "Test-Driven Development and LLM-based code generation",
    391       "authors": ["N. S. Mathews", "M. Nagappan"],
    392       "year": 2024,
    393       "relevance": "Explores TDD for LLM-based code generation, relevant to understanding testing practices for AI-generated code."
    394     },
    395     {
    396       "title": "Judging LLM-as-a-judge with MT-bench and Chatbot arena",
    397       "authors": ["L. Zheng"],
    398       "year": 2023,
    399       "relevance": "Foundational work on LLM-as-a-judge evaluation, relevant to understanding limitations of automated evaluation that structural testing aims to address."
    400     },
    401     {
    402       "title": "Methodology for quality assurance testing of LLM-based multi-agent systems",
    403       "authors": ["I. Shamim", "R. Singhal"],
    404       "year": 2025,
    405       "relevance": "Directly relevant methodology for QA testing of multi-agent LLM systems."
    406     },
    407     {
    408       "title": "Evil geniuses: delving into the safety of LLM-based agents",
    409       "authors": ["Y. Tian", "X. Yang", "J. Zhang", "Y. Dong", "H. Su"],
    410       "year": 2023,
    411       "arxiv_id": "2311.11855",
    412       "relevance": "Examines safety vulnerabilities of LLM-based agents, relevant to understanding testing requirements for agent safety."
    413     },
    414     {
    415       "title": "LLMs get lost in multi-turn conversation",
    416       "authors": ["P. Laban", "H. Hayashi", "Y. Zhou", "J. Neville"],
    417       "year": 2025,
    418       "arxiv_id": "2505.06120",
    419       "relevance": "Demonstrates LLM performance degradation in multi-turn conversations, a key quality issue that structural testing aims to catch."
    420     }
    421   ]
    422 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs