scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30447B)
      1 {
      2   "paper": {
      3     "title": "STELLAR: A Search-Based Testing Framework for Large Language Model Applications",
      4     "authors": [
      5       "Lev Sorokin",
      6       "Ivan Vasilev",
      7       "Ken E. Friedl",
      8       "Andrea Stocco"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2601.00497",
     13     "doi": "10.48550/arXiv.2601.00497"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Replication package released at https://github.com/ast-fortiss-tum/STELLAR (reference [35]), explicitly cited in Section IX: 'The pipeline used to obtain the results discussed in this work and the results are available in our replication package.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "They use publicly available datasets (BeaverTails [16], Yelp [51], MultiWOZ 2.2 [15]) and state results are in the replication package. NaviQA-II data is proprietary but NaviQA-I uses open-source data."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions OpenSBT, NSGA-II, Azure, Ollama/HuggingFace, and ALL-MINILM-L6-V2 for embeddings, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions in the paper. The replication package is referenced but the paper itself does not include commands to run or a 'Reproducing Results' section."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Figures 2 and 3 show standard deviation bands for failure ratio over time (bottom plots), and box plots showing distribution spread for failure counts (top plots). Results averaged over 6 runs."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section IV.F.2: 'we applied the Mann–Whitney U test [60] (α = 0.05) and quantified effect size using Vargha–Delaney A12 [61]. The improvements of STELLAR over all baselines are statistically significant, with large effect sizes.'"
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "They report Vargha-Delaney A12 effect sizes alongside significance tests, and provide context with relative improvements: '2.2 times higher failure rate' (SafeQA), '2.5 times higher failure rate' (overall), 'up to 4.3×' improvement."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "They run 6 repetitions per configuration but provide no justification for why 6 runs and no power analysis. The choice of 10 human raters is also unjustified."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Figures 2 and 3 show 'Mean ratio between failures found and in total generated test cases with standard deviation' across 6 runs. Box plots show distribution spread."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "They compare against Random Search (RS), ASTRAL (state-of-the-art automated LLM testing), and T-WISE (combinatorial 4-wise feature interaction) across all experiments."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "ASTRAL is from ISSTA '25, a recent and directly relevant automated LLM testing approach. T-WISE is a standard combinatorial method. Both are appropriate and current baselines."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No ablation study removing individual STELLAR components (e.g., crossover vs. mutation, RAG-based generation, duplicate elimination, constraint rules). The system has multiple components but their individual contributions are not measured."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "They use multiple metrics: absolute failure count, failure ratio (failures/total tests), and cluster coverage for diversity (RQ2). NaviQA uses two fitness functions (f1 response quality, f2 content accuracy)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section IV.E.2: 10 BMW participants rated 30 question-answer pairs for judge validation (300 annotations). Section VI: domain expert with 7 years experience evaluated failure types and answered structured questions about severity and novelty."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Hyperparameters (population size, search time, mutation/crossover rates) were tuned via 'preliminary experiments' without clear separation from the final evaluation AUTs. No explicit train/validation/test separation for parameter tuning."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results broken down per LLM model (Figures 2, 3), per case study (SafeQA, NaviQA-I, NaviQA-II), and per failure type (Table V with 9 failure categories for NaviQA-II)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section VI provides detailed qualitative evaluation of failure types in NaviQA-II (Table V). Nine failure types identified with examples: endpoint failure, incorrect rating, name misinterpretation, language misclassification, technical output, etc."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "They report that locally deployed LLMs (LLAMA 3, QWEN 2.5-7B, DEEPSEEK-V2-16B) had >90% failure rates on NaviQA and were excluded. ASTRAL only outperforms RS/T-WISE in 2 of 6 LLM configurations. RS/T-WISE occasionally match STELLAR's diversity."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims 'up to 4.3× (average 2.5×) more failures' are supported by results in Figures 2 and 3 and the statistical analysis. All key claims in the abstract are backed by the experimental results."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Comparative claims ('STELLAR detects more failures') are supported by controlled experiments: same testing budget, same LLMs, same evaluation criteria. Statistical significance tests with effect sizes provide adequate justification for the comparison."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "External validity section explicitly states: 'the scope is still limited. Generalization to other domains, larger systems, or additional LLM families should be made with caution.' The paper also notes budget constraints limited the number of models tested."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Threats to validity (Section V) discusses specific alternatives: threshold sensitivity may alter results, LLM non-determinism, API call duration variability, generator occasionally producing misaligned utterances, offline RAG configuration for ASTRAL vs. online."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures failure count and failure rate, and claims testing effectiveness. The measurements directly match the claims — no proxy gap exists. They additionally validate with diversity metrics and qualitative expert evaluation to triangulate."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Models listed by marketing names only: GPT-4O, GPT-5-CHAT, DEEPSEEK-V3, MISTRAL 7B, DEEPSEEK-V2-16B, QWEN 2.5-7B, GPT-4O-MINI, DOLPHIN 3. No API version IDs, snapshot dates, or exact model identifiers provided."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Figure 1 shows an 'illustrative' prompt template with placeholders ({{content}}, {{style}}, {{perturbation}}, {{rag_examples}}, {{examples}}). Full prompts referenced in replication package [35] but not in the paper. Template with unfilled placeholders does not count per schema."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Table I reports population size (20), search time (2h/3h), crossover threshold (0.7), mutation threshold (0.12/0.07), similarity threshold (0.8), number of fitness functions. Temperature values stated: 0/0 for SafeQA generator/judge, 0.2/0 for NaviQA."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "STELLAR is a search-based testing framework, not an agentic system. It uses LLMs for test generation and evaluation in a pipeline (no tool use, retry logic, memory, or agent scaffolding). The framework architecture is described via Algorithm 1."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Feature discretization is documented (Section III.A), numerical encoding explained, constraint rules described (Section III.B), duplicate elimination via cosine similarity with 0.8 threshold (Section III.F), embedding model specified (ALL-MINILM-L6-V2)."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section V 'Threats to Validity' contains three detailed subsections: Internal Validity, Construct Validity, and External Validity, with substantive discussion in each."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Specific threats discussed: threshold values may alter results (internal), LLM non-determinism despite repeated runs (internal), generator may produce misaligned utterances (internal), custom judge dimensions introduce subjectivity (construct), offline RAG for ASTRAL may differ from online (construct)."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "External validity section: 'Although the systems and models differ in nature, the scope is still limited. Generalization to other domains, larger systems, or additional LLM families should be made with caution.' Future work explicitly lists unaddressed areas (multi-modal, multi-turn)."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section IX: 'The pipeline used to obtain the results discussed in this work and the results are available in our replication package [35].' Results include generated test inputs and system outputs."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Test input generation pipeline fully described (Section III): feature sampling → encoding → constraint application → prompt generation → LLM generation → execution → evaluation. Judge validation procedure described with sampling from BeaverTails."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "For the 10-participant BMW judge validation study, only 'from BMW' is stated with no recruitment method, selection criteria, or potential bias discussion. The domain expert for qualitative evaluation is described only as having '7 years of working experience.'"
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Algorithm 1 documents the full pipeline: random sampling → encoding → constraint application → prompt generation → LLM generation → AUT execution → fitness evaluation → evolutionary optimization → failure extraction. Each step is detailed in Sections III.A-III.G."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Acknowledgements: 'This research was funded by the Bavarian Ministry of Economic Affairs, Regional Development and Energy, and by the BMW Group.'"
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Three authors (Sorokin, Vasilev, Friedl) listed with BMW Group affiliation. Andrea Stocco listed with TU Munich and fortiss GmbH. Affiliations clearly stated in author block."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "BMW Group both funds the research and provides the industrial system under test (NaviQA-II). BMW has a commercial interest in the results — the funder is not independent of the outcome."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial disclosure statement. Three authors are BMW employees evaluating a BMW system. No patent or equity disclosures."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper evaluates a testing tool (STELLAR) rather than model knowledge on a benchmark. Models are tested for robustness to adversarial inputs, not for knowledge retrieval. Test inputs are dynamically generated, not drawn from a pre-existing benchmark."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "STELLAR generates new test inputs dynamically via evolutionary search, so there is no fixed test set that could overlap with model training data. The paper tests a tool/defense rather than evaluating model knowledge."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Test inputs are generated at runtime, not drawn from a pre-existing benchmark. The paper itself acknowledges data contamination as a limitation of static benchmarks (Section I) and positions STELLAR as an alternative to contamination-prone static evaluation."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No mention of pre-registration for the 10-participant judge validation study or the expert interview."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No mention of IRB or ethics board approval for the human evaluation study involving 10 BMW participants and the domain expert."
    256       },
    257       "demographics_reported": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "The 10 participants are described only as 'from BMW.' No experience level, role, gender, age, or other demographics reported. The domain expert is described only as having '7 years of working experience in the testing of AI-enabled systems.'"
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "No inclusion or exclusion criteria stated for selecting the 10 BMW participants. No screening process described."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "The human study is a simple annotation/rating task where all participants rated the same items. No experimental conditions requiring randomization."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "The human study is an annotation task evaluating response quality, not a comparative experiment requiring blinding between conditions."
    276       },
    277       "attrition_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No explicit reporting of how many participants started vs. finished. The total of 300 annotations (10 × 30) implies full completion, but attrition is not explicitly discussed."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Search time budgets reported: 2 hours for SafeQA, 3 hours for NaviQA. Section III.G: 'more than 234,000 tests (1,000 tests per run), for a total execution time of more than 24 days.' Table III reports per-LLM inference times for the judges."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Total execution time stated as 'more than 24 days' for 234,000+ tests. Azure used for cloud models, Ollama/HuggingFace for local models. Per-run budget of 2-3 hours with 6 repetitions per configuration."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Results reported across 6 independent runs per configuration with standard deviation shown in Figures 2 and 3. Box plots show run-to-run variability. This captures sensitivity to stochastic initialization of the evolutionary algorithm."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Explicitly stated: 'Results averaged over 6 runs' in Figures 2 and 3 captions. Diversity analysis repeated 10 times with averaged coverage results."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Parameters described as 'informed by preliminary experiments' (population size, search time) and 'default settings introduced by Abdessalem et al.' (mutation/crossover). No reporting of how many configurations were tried or total compute spent on tuning."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Configuration selected via unspecified 'preliminary experiments' and defaults from prior work. No description of how many configurations were evaluated or what criterion was used to select the final parameters."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Mann-Whitney U tests performed across multiple baselines (3-4), multiple LLMs (3-6), and multiple case studies (2-3), resulting in many comparisons. No mention of Bonferroni, Holm, or other family-wise error rate corrections."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors developed STELLAR and compare it against their own implementations of baselines (RS, T-WISE) and ASTRAL. No discussion of author-evaluation bias or whether baseline implementations are fair."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Figures 2 and 3 (bottom) show failure ratio as a function of time for all approaches, directly comparing performance at matched compute budgets. All approaches run under the same time budget (2h/3h)."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "No discussion of whether failure count is the right measure for testing effectiveness, or whether the LLM judge (F1=0.71-0.79) introduces systematic measurement error that could differentially affect approaches."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "The comparison is between testing approaches (STELLAR vs. baselines), not between models in different scaffolds. The testing framework IS the thing being tested. No scaffolding confound applies."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "While STELLAR generates new test inputs dynamically, the RAG examples drawn from BeaverTails [16] (2023) and MultiWOZ [15] (2020) could have been in the training data of models tested. This temporal overlap is not discussed."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the feature space design or RAG examples leak information about expected failure modes. The 5 malicious examples in SafeQA prompts could bias the types of unsafe outputs generated."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of independence between generated test inputs across runs or between the judge training/evaluation data. RAG examples may create systematic dependencies in generated inputs."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection or prevention method applied. The paper acknowledges contamination risk in static benchmarks (Section I) but does not apply any detection method to their own evaluation."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "STELLAR exposes up to 4.3× (average 2.5×) more failures than existing baseline approaches across both SafeQA and NaviQA case studies.",
    368       "evidence": "Figures 2 and 3 show failure counts and failure ratios across 6 LLMs and 3 AUTs. Mann-Whitney U test (α=0.05) with Vargha-Delaney A12 confirms statistical significance with large effect sizes (Section IV.F.2).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "GPT-4O-MINI demonstrates the best balance between performance and cost efficiency as an LLM judge, achieving F1-scores of 0.79 (SafeQA) and 0.71 (NaviQA).",
    373       "evidence": "Table III reports F1 scores and inference times across 7 models for both binary and continuous judgment tasks, averaged over 5 runs (Section IV.F.1).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "STELLAR achieves higher failure diversity than ASTRAL, reaching up to 98% cluster coverage on GPT-5-CHAT.",
    378       "evidence": "Table IV shows cluster coverage for SafeQA and NaviQA. Coverage differences between STELLAR and ASTRAL are statistically significant with large effect sizes (Section IV.F.3).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "STELLAR uncovered two novel failure types (F3: Name Misinterpretation, F5: Technical Output) in NaviQA-II that had not been detected through prior testing.",
    383       "evidence": "Section VI: domain expert with 7 years experience confirmed F3 and F5 were previously undetected. However, this is based on a single expert's assessment.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Smaller local LLMs (MISTRAL 7B, DEEPSEEK-V2-16B) exhibit substantially more failures than large cloud-based models (GPT-5-CHAT).",
    388       "evidence": "Figures 2 and 3 show consistent pattern: MISTRAL 7B has highest failure counts (~1000 in SafeQA), GPT-5-CHAT lowest (~200). This holds across all testing approaches (Section IV.F.2).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "The majority of identified failures in NaviQA-II are of high severity.",
    393       "evidence": "Table V classifies 5 of 9 failure types as High criticality based on domain expert assessment (Section VI, Q3). However, this is a single expert's judgment.",
    394       "supported": "weak"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval", "case-study"],
    398   "key_findings": "STELLAR, a search-based testing framework using evolutionary optimization, finds on average 2.5× (up to 4.3×) more failures than random, combinatorial, and coverage-based baselines when testing LLM applications. The framework was evaluated on safety testing of 6 LLMs (SafeQA) and navigational QA testing of an open-source and industrial BMW system (NaviQA). Results show statistical significance with large effect sizes across all comparisons. Qualitative evaluation on the industrial system uncovered 9 distinct failure types, including two previously undetected categories, with the majority classified as high severity.",
    399   "red_flags": [
    400     {
    401       "flag": "Conflict of interest",
    402       "detail": "Three of four authors are BMW employees. BMW funds the research and provides the industrial system under test (NaviQA-II). The qualitative evaluation is validated by a single BMW domain expert. This creates a dual incentive: demonstrating the tool works well AND showing the BMW system has findable flaws (justifying testing investment)."
    403     },
    404     {
    405       "flag": "Single-expert qualitative validation",
    406       "detail": "The qualitative evaluation of failure types (Section VI) and severity assessment relies entirely on one domain expert with 7 years experience. No inter-rater reliability for the failure type classification. Novel failure types F3 and F5 validated by this single expert."
    407     },
    408     {
    409       "flag": "No ablation study",
    410       "detail": "STELLAR has multiple components (evolutionary search, crossover/mutation, RAG-based generation, duplicate elimination, feature constraints) but no ablation study isolates the contribution of each. It's unclear which component drives the improvement over baselines."
    411     },
    412     {
    413       "flag": "Model versions unspecified",
    414       "detail": "All LLMs identified by marketing names (GPT-4O, GPT-5-CHAT, DEEPSEEK-V3, etc.) without API version IDs or snapshot dates. Model behavior changes across versions, making results non-reproducible."
    415     },
    416     {
    417       "flag": "Small human evaluation sample",
    418       "detail": "LLM judge validation uses only 10 BMW participants rating 30 items. No demographics, no recruitment method, no ethics approval. The judge achieves only 0.71 F1 on NaviQA; F1 is the harmonic mean of precision and recall rather than accuracy, so it does not give an exact error rate, but it indicates that a substantial share of judgments may be incorrect, which could systematically bias the comparison between approaches."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Astral: A tool for the automated safety testing of large language models",
    424       "authors": ["M. Ugarte", "P. Valle", "J. A. Parejo", "S. Segura", "A. Arrieta"],
    425       "year": 2025,
    426       "relevance": "State-of-the-art automated LLM testing baseline using coverage-based feature combination; directly compared against STELLAR."
    427     },
    428     {
    429       "title": "Jailbreaking leading safety-aligned llms with simple adaptive attacks",
    430       "authors": ["M. Andriushchenko", "F. Croce", "N. Flammarion"],
    431       "year": 2025,
    432       "relevance": "Demonstrates LLM jailbreaking via adaptive suffix modification, achieving up to 100% attack rates on safety-aligned LLMs."
    433     },
    434     {
    435       "title": "METAL: Metamorphic Testing Framework for Analyzing Large-Language Model Qualities",
    436       "authors": ["S. Hyun", "M. Guo", "M. A. Babar"],
    437       "year": 2024,
    438       "relevance": "Metamorphic testing framework for LLM robustness and fairness; provides character-level perturbation features used in STELLAR."
    439     },
    440     {
    441       "title": "Beavertails: Towards improved safety alignment of llm via a human-preference dataset",
    442       "authors": ["J. Ji", "M. Liu", "J. Dai"],
    443       "year": 2023,
    444       "relevance": "Safety alignment dataset used for RAG examples and judge evaluation in STELLAR's SafeQA case study."
    445     },
    446     {
    447       "title": "SORRY-bench: Systematically evaluating large language model safety refusal",
    448       "authors": ["T. Xie", "X. Qi", "Y. Zeng"],
    449       "year": 2025,
    450       "relevance": "Safety benchmark providing 13 content categories used as features in STELLAR's safety testing case study."
    451     },
    452     {
    453       "title": "Addressing Data Leakage in HumanEval Using Combinatorial Test Design",
    454       "authors": ["J. S. Bradbury", "R. More"],
    455       "year": 2025,
    456       "relevance": "Addresses data contamination in LLM benchmarks, a key motivation for dynamic test generation approaches like STELLAR."
    457     },
    458     {
    459       "title": "How toxic can you get? search-based toxicity testing for large language models",
    460       "authors": ["S. Corbo", "L. Bancale", "V. D. Gennaro"],
    461       "year": 2025,
    462       "relevance": "Search-based approach for LLM toxicity testing; closely related work in evolutionary LLM testing."
    463     },
    464     {
    465       "title": "Mortar: Metamorphic multi-turn testing for llm-based dialogue systems",
    466       "authors": ["G. Guo", "A. Aleti", "N. Neelofar", "C. Tantithamthavorn"],
    467       "year": 2024,
    468       "relevance": "Framework for testing LLM dialogue systems via metamorphic relations at the turn level; related automated testing approach."
    469     },
    470     {
    471       "title": "A survey on code generation with llm-based agents",
    472       "authors": ["Y. Dong", "X. Jiang", "J. Qian"],
    473       "year": 2025,
    474       "relevance": "Survey of LLM-based code generation agents, relevant to the broader context of LLM application testing."
    475     },
    476     {
    477       "title": "From llms to llm-based agents for software engineering: A survey of current, challenges and future",
    478       "authors": ["H. Jin", "L. Huang", "H. Cai"],
    479       "year": 2025,
    480       "relevance": "Survey of LLM-based agents for software engineering, providing context for why systematic testing of LLM applications is needed."
    481     },
    482     {
    483       "title": "LLMs in software security: A survey of vulnerability detection techniques and insights",
    484       "authors": ["Z. Sheng", "Z. Chen", "S. Gu"],
    485       "year": 2025,
    486       "relevance": "Survey on LLM vulnerability detection relevant to security testing of LLM applications."
    487     }
    488   ]
    489 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs