scan.json (29207B)
1 { 2 "paper": { 3 "title": "AGENTS-LLM: Augmentative GENeration of Challenging Traffic Scenarios with an Agentic LLM Framework", 4 "authors": [ 5 "Yu Yao", 6 "Salil Bhatnagar", 7 "Markus Mazzola", 8 "Vasileios Belagiannis", 9 "Igor Gilitschenski", 10 "Luigi Palmieri", 11 "Simon Razniewski", 12 "Marcel Hallgarten" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2507.13729" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The conclusion states 'The code for this paper will be made available,' which is a promise of future release, not a working URL or archive. A GitHub link (https://github.com/mh0797/Agents-LLM/) is listed in the header but no code is confirmed available at time of submission." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses the publicly available interPlan scenario dataset and nuPlan benchmark. Both are referenced as standard public benchmarks, and the authors did not collect proprietary data — they use interPlan's 50 human-augmented scenarios as the test set." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, conda environment, or library version listing is provided. The paper mentions using GPT-4o, Gemini-1.5-Flash, and Llama3.1-70B but does not specify any software dependencies or environment details beyond the LLM names." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README commands, or scripts to replicate experiments are provided. The paper describes the method at a high level but a competent researcher could not reproduce experiments from the paper text alone." 
39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "Table IV reports 95% confidence intervals (via bootstrapping) for Elo ratings for all nine variants. For example, interPlan has Elo 1042 with CI -9/+11. CIs are not provided for displacement error results." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No formal statistical significance tests (p-values, t-tests, etc.) are applied to the displacement error comparisons. The Elo rating system uses CI-based ranking rather than hypothesis testing. Claims like 'significantly larger error' for LCTGen are made without statistical tests." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "Effect sizes are not formally reported. Displacement error differences are shown in figures without quantifying the magnitude as a standardized effect size. The Elo point differences provide some sense of magnitude but no formal effect size metric is computed." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The 50-scenario test set and 9 expert judges are not justified with a power analysis or explicit reasoning for why these numbers are sufficient. The paper acknowledges expert ranking is 'very time consuming' but does not justify whether 9 judges and 5760 comparisons yield sufficient statistical power." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Figures 3 and 4 show displacement error distributions only graphically (box plots/scatter), and Table V reports only mean driving scores without any variance measure. No numeric standard deviation or IQR is reported for mean displacement errors or driving scores, preventing assessment of result stability." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "LCTGen [23] is included as a baseline for comparison on displacement error (Figure 3). The human-generated interPlan scenarios serve as ground truth for the Elo comparison (Table IV). Multiple prompting strategy variants also serve as internal baselines." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "LCTGen was published in 2023 (CoRL 2023) and represents a recent language-based scenario generation approach. The interPlan baseline (IROS 2024) is very recent. The baselines are appropriate and contemporary for the task." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Table II and Section V.B present four prompting strategy variants (OTM, FC, tQA, vQA) that systematically add or swap components (lane representation format, QA loop type), effectively constituting an ablation study over framework components." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Three distinct metrics are used: (1) displacement error in meters (placement accuracy), (2) Elo rating from human expert pairwise comparisons (visual quality), and (3) nuPlan mean driving score (how challenging the scenarios are for planners)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Nine expert judges from the autonomous driving research community conducted 5760 pairwise comparisons of BEV images in a blind setup, with the identities of the generating models hidden from the judges. Elo ratings with 95% CI are computed from these judgments." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "The same 50 interPlan scenarios are used for development (identifying common failure cases, designing prompting strategies) and final evaluation. There is no explicit held-out test set separated before any selection decisions were made." 
98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Figure 3 shows displacement error broken down by five scenario types (approaching construction zone, nudge around parked vehicle, overtake parked vehicle, jaywalking pedestrian, avoid crashed cars). Table III shows error counts by category (position, heading, logic errors) per LLM." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section V.A describes three error categories (position, heading, logic errors) and Table III quantifies them per model. Figure 5 provides qualitative examples of failures, e.g., 'Gemini placed the accident vehicles at the correct distance, but with an unrealistically large overlap.'" 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Several negative results are reported: vQA does not improve displacement error over tQA on average for Gemini (Section V.B); smaller models significantly underperform in OTM; LCTGen failed to generate traffic agents for 'jaywalker' and 'construction site' types." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims that human evaluation demonstrates high quality comparable to manual scenarios. Table IV shows GPT-4o OTM achieves Elo 1039 vs. interPlan's 1042 (overlapping CIs, same rank 1), supporting this claim. The claim about smaller models achieving comparable performance via agentic design is also supported by the Elo results for Gemini vQA." 
120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The causal claim that the agentic framework enables smaller models to achieve comparable performance is supported by controlled ablation experiments (Table II, Figures 3-4) that systematically vary prompting strategy while holding the model constant, allowing causal attribution of performance changes to the framework components." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper tests on only 50 scenarios from one dataset (interPlan) covering five scenario types on the nuPlan map format, yet the title and abstract make broad claims about 'autonomous driving planners' and 'LLM-agent based framework' without clearly bounding results to this specific dataset and scenario type." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not substantively discuss alternative explanations for the observed performance differences. For example, the finding that vQA improves Elo but not displacement error could be due to evaluator bias toward visually similar images, but this is not discussed. The conclusion section offers no alternative interpretations." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper uses 'GPT-4o', 'Gemini-1.5-Flash', and 'Llama3.1-70B' without specifying API version numbers or snapshot dates. Model behavior changes across versions — 'GPT-4o' without 'gpt-4o-2024-05-13' or similar is insufficient per the schema criterion." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "Figure 2 shows the SMA prompt with actual values for one example, but the QA agents' prompts (Text QA, Visual QA Engineer, Visual QA Agent) are only described in natural language. 
The 'list of common problems' given to the Text QA Agent is referenced but not provided. The reader cannot reconstruct all prompts used across tQA and vQA variants." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The paper states models are 'used in their pretrained form without any problem-specific fine-tuning' but does not specify sampling settings used during inference." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The agentic scaffold is described in detail in Section III: the Scenario Modifier Agent (SMA), Text QA Agent, Visual QA Engineer and Visual QA Agent are described with their roles, inputs, outputs, rating mechanisms (1-5 scale), feedback loops, and retry logic. Figure 1 provides a workflow diagram." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section III.C and Table I document how traffic scenarios are represented as text vectors (entity types, attributes) and Section IV describes how the 50 interPlan scenarios are used. The input format transformation from raw scenarios to the text-based representation is described sufficiently." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion contains one sentence: 'One limitation is the dependence on commercial frontier LLMs, which are only accessible through commercial APIs.' This is insufficient — a single sentence in the conclusion does not constitute substantive limitations discussion." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats-to-validity discussion is present. 
The one limitation mentioned (API dependence) is a practical concern, not a validity threat. Specific threats like the small 50-scenario test set, potential evaluator fatigue or bias, or the fact that the same scenarios were used for development and evaluation are not discussed." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that results are limited to the five interPlan scenario types, the nuPlan map format, or the specific LLMs tested. No explicit out-of-scope statements are made." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The raw generated scenarios and expert judgment data (5760 pairwise comparisons) are not released. The underlying expert responses cannot be independently verified without access to the raw data. The code is also not yet released." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section IV describes the data: 50 human-augmented scenarios from interPlan covering five scenario types, reconstructed using the proposed framework. The scenario representation format is documented in Table I. The source (interPlan) is properly cited." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The nine expert judges are described only as 'experts from the autonomous driving research community' with no description of how they were recruited (internal Bosch employees? external collaborators? a specific research group?), what criteria qualified them as experts, or whether their recruitment could introduce systematic bias." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The full pipeline is documented: interPlan scenarios as input → text-based vector representation → LLM-based modification → QA loop (optional) → output scenarios → nuPlan simulation for evaluation. The process of pairwise comparison data collection and Elo computation is also described." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No acknowledgments section or funding disclosure is present in the paper. The authors' affiliations include Robert Bosch GmbH (a major industry player), but no explicit statement about funding sources is made." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed at the top of the paper: Robert Bosch GmbH, Friedrich-Alexander-Universität Erlangen-Nürnberg, University of Toronto, ScaDS.AI & TU Dresden, University of Tübingen, and Vector Institute. The industry affiliation with Bosch is transparent." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Multiple authors are affiliated with Robert Bosch GmbH, and the framework is designed for autonomous driving testing — an area directly relevant to Bosch's commercial interests. The funder/employer has a direct stake in demonstrating that such frameworks are effective." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement or declaration of financial interests is present in the paper. The absence of such a statement means the reader cannot assess whether authors hold relevant patents or equity interests." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "Contamination is less applicable here because the paper uses LLMs for scene generation (a generative task based on user instructions), not for answering benchmark questions where training data overlap would invalidate results. The LLMs are not being evaluated on their knowledge of test cases." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not applicable for the same reason as training_cutoff_stated — the task is generative scenario creation, not benchmark knowledge retrieval. There is no meaningful train/test contamination concern for instruction-following on scenario augmentation." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not applicable — the paper does not evaluate pre-trained model knowledge. The nuPlan and interPlan benchmarks are used to measure driving planner performance on generated scenarios, not LLM knowledge." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": false, 246 "justification": "No pre-registration link is provided for the human expert evaluation study. The study is not registered on OSF, AsPredicted, or any other registry." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "No mention of IRB or ethics board approval is made, despite the paper involving human participants (9 expert judges) in an evaluation study. The paper does not state that ethics review was obtained or waived." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": false, 256 "justification": "The nine expert judges are described only as 'experts from the autonomous driving research community.' 
No demographics (experience level, years of experience, gender, geographic distribution, company affiliation) are reported." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": false, 261 "justification": "No inclusion or exclusion criteria for the expert judges are stated beyond being 'experts from the autonomous driving research community.' No screening process or formal definition of expertise is described." 262 }, 263 "randomization_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "Section IV states 'The order of match-ups were randomized,' indicating that the pairwise comparison order was randomized to prevent ordering effects in the Elo tournament." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": true, 271 "justification": "Section IV states 'the identities of the models were hidden from the judges during rating,' indicating that evaluators were blinded to which model produced each scenario during the pairwise comparison." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": false, 276 "justification": "No attrition information is reported. It is unclear whether all nine judges completed all assigned comparisons or whether any dropped out. The total of 5760 comparisons across nine judges is stated but no per-judge breakdown or dropout information is given." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper acknowledges that vQA 'involves processing a large number of tokens' and is 'very cost inefficient' compared to frontier models, but no actual cost figures (API spend, token counts, cost per scenario) are reported. The stated motivation for the work is reducing costs, yet costs are not quantified." 
284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The total computational budget for running all experiments (frontier model API calls, simulation runs, etc.) is not stated. No GPU hours, API spend estimates, or hardware specifications are provided." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "The proposed LLM-agent framework generates scenarios of quality comparable to human-generated interPlan scenarios, as judged by human domain experts.", 295 "evidence": "Table IV shows GPT-4o OTM achieves Elo 1039 (CI -9/+11) vs. interPlan's 1042 (CI -9/+11), both ranked 1st with overlapping confidence intervals. 5760 pairwise comparisons from 9 experts.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "The agentic framework (using function calling and QA loops) enables smaller utility models to achieve performance comparable to frontier models, closing the gap seen with simple OTM prompting.", 300 "evidence": "Figure 4 and Table IV: Gemini-1.5-Flash vQA achieves Elo 1025 (rank 1, same as GPT-4o OTM), while Gemini OTM is Elo 953 (rank 8). Function calling most improves displacement error for smaller models.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "The generated scenarios are equally challenging to state-of-the-art planners as the human-generated interPlan scenarios.", 305 "evidence": "Table V: GPT-4o OTM achieves mean driving score 49.6% vs. interPlan's 51.9%, while nuPlan Val14 baseline is 90.8%. All LLM-generated scenario variants produce similarly low planner scores.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "LCTGen has significantly larger displacement error than the proposed framework on comparable scenario types.", 310 "evidence": "Figure 3 shows LCTGen displacement error substantially higher than GPT-4o, Gemini, and Llama for the three scenario types where LCTGen successfully generated agents. 
LCTGen also failed entirely to generate traffic agents for the jaywalker and construction site types.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "Visual QA does not improve displacement error over text QA on average.", 315 "evidence": "Figure 4 shows vQA and tQA have similar displacement error distributions for Gemini; Section V.B states 'vQA does not improve displacement error over tQA on average.' However, vQA improves Elo ratings.", 316 "supported": "moderate" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval", 321 "case-study" 322 ], 323 "key_findings": "The AGENTS-LLM framework uses an LLM-based agentic design (Scenario Modifier Agent + optional QA loop) to automatically augment real-world traffic scenarios from natural language descriptions. Using GPT-4o, the framework generates scenarios rated virtually indistinguishable from human-generated scenarios by 9 expert judges in blind pairwise comparison (Elo 1039 vs. 1042). Advanced prompting strategies (function calling, visual QA) allow smaller utility models like Gemini-1.5-Flash to close the performance gap to frontier models. Generated scenarios are equally challenging to state-of-the-art autonomous driving planners (PDM-Closed) as manually created scenarios.", 324 "red_flags": [ 325 { 326 "flag": "Test set used for development and evaluation", 327 "detail": "The same 50 interPlan scenarios are explicitly used for development ('we use the interPlan scenarios as ground-truth for development') and for final evaluation. There is no held-out test set, so the reported results may reflect design choices optimized on the test data." 328 }, 329 { 330 "flag": "No model version snapshots", 331 "detail": "The paper uses 'GPT-4o', 'Gemini-1.5-Flash', and 'Llama3.1-70B' without API version or snapshot dates. GPT-4o has multiple versions with different behaviors; results may not be reproducible with current API versions." 
332 }, 333 { 334 "flag": "Code not released at submission", 335 "detail": "Despite listing a GitHub URL in the header, the conclusion states 'The code for this paper will be made available' — a future promise. The results cannot currently be reproduced." 336 }, 337 { 338 "flag": "Insufficient limitations discussion", 339 "detail": "The paper has no dedicated limitations section. Only one limitation is mentioned (commercial API dependence). Critical limitations such as the small 50-scenario test set, the single dataset/map format, the absence of a held-out test set, and potential evaluator bias are not discussed." 340 }, 341 { 342 "flag": "Expert recruitment not described", 343 "detail": "The 9 expert judges are described only as 'experts from the autonomous driving research community' with no description of recruitment method, criteria, or potential bias. Multiple authors are from Bosch, raising the possibility that some judges were colleagues or collaborators." 344 }, 345 { 346 "flag": "No inference cost quantification", 347 "detail": "The paper's stated motivation is reducing reliance on expensive frontier models, yet no actual API costs or token counts are reported. The claim that the agentic approach is more cost-effective cannot be verified without this data." 348 } 349 ], 350 "cited_papers": [ 351 { 352 "title": "Can Vehicle Motion Planning Generalize to Realistic Long-tail Scenarios?", 353 "authors": [ 354 "M. Hallgarten", 355 "J. Zapata", 356 "M. Stoll", 357 "K. Renz", 358 "A. Zell" 359 ], 360 "year": 2024, 361 "relevance": "The interPlan work this paper builds on — provides human-augmented challenging scenarios as ground truth for evaluation; directly relevant to agentic AI in autonomous driving evaluation." 362 }, 363 { 364 "title": "NuPlan: A closed-loop ML-based planning benchmark for autonomous vehicles", 365 "authors": [ 366 "H. Caesar", 367 "J. Kabzan", 368 "K. S. 
Tan" 369 ], 370 "year": 2021, 371 "relevance": "The simulation and evaluation benchmark used in this paper; key infrastructure for autonomous driving planner evaluation." 372 }, 373 { 374 "title": "Language Conditioned Traffic Generation", 375 "authors": [ 376 "S. Tan", 377 "B. Ivanovic", 378 "X. Weng", 379 "M. Pavone", 380 "P. Kraehenbuehl" 381 ], 382 "year": 2023, 383 "relevance": "LCTGen is the primary baseline comparison for the proposed framework, representing language-guided scenario generation." 384 }, 385 { 386 "title": "ChatScene: Knowledge-Enabled Safety-Critical Scenario Generation for Autonomous Vehicles", 387 "authors": [ 388 "J. Zhang", 389 "C. Xu", 390 "B. Li" 391 ], 392 "year": 2024, 393 "relevance": "Related LLM-based scenario generation approach using Scenic code for autonomous driving, a direct comparison point." 394 }, 395 { 396 "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference", 397 "authors": [ 398 "W.-L. Chiang", 399 "L. Zheng", 400 "Y. Sheng", 401 "A. N. Angelopoulos" 402 ], 403 "year": 2024, 404 "relevance": "Provides the Elo-based pairwise human evaluation methodology adopted in this paper for comparing scenario quality." 405 }, 406 { 407 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 408 "authors": [ 409 "S. Yao", 410 "J. Zhao", 411 "D. Yu", 412 "N. Du", 413 "I. Shafran", 414 "K. Narasimhan", 415 "Y. Cao" 416 ], 417 "year": 2022, 418 "relevance": "Foundational agentic LLM framework reference that underpins the agentic design pattern used in this work." 419 }, 420 { 421 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 422 "authors": [ 423 "Q. Wu", 424 "G. Bansal", 425 "J. Zhang" 426 ], 427 "year": 2024, 428 "relevance": "Multi-agent LLM framework directly relevant to the agentic design pattern used for scenario modification." 429 }, 430 { 431 "title": "Chain-of-thought prompting elicits reasoning in large language models", 432 "authors": [ 433 "J. 
Wei", 434 "X. Wang", 435 "D. Schuurmans" 436 ], 437 "year": 2022, 438 "relevance": "Foundational prompting technique referenced as the basis for the agentic framework's multi-step reasoning approach." 439 }, 440 { 441 "title": "Toolformer: Language models can teach themselves to use tools", 442 "authors": [ 443 "T. Schick", 444 "J. Dwivedi-Yu", 445 "R. Dessi" 446 ], 447 "year": 2023, 448 "relevance": "Tool use / function calling foundation, directly relevant to the function-calling variant of the framework." 449 }, 450 { 451 "title": "SLEDGE: Synthesizing Driving Environments with Generative Models and Rule-Based Traffic", 452 "authors": [ 453 "K. Chitta", 454 "D. Dauner", 455 "A. Geiger" 456 ], 457 "year": 2024, 458 "relevance": "Data-driven scenario generation baseline for autonomous driving, representing the competing approach to LLM-based methods." 459 }, 460 { 461 "title": "Text-to-Drive: Diverse Driving Behavior Synthesis via Large Language Models", 462 "authors": [ 463 "P. Nguyen", 464 "T.-H. Wang", 465 "Z.-W. Hong", 466 "S. Karaman", 467 "D. Rus" 468 ], 469 "year": 2024, 470 "relevance": "Related LLM-based driving behavior generation approach, directly comparable to this work's traffic scenario generation." 471 }, 472 { 473 "title": "RealGen: Retrieval Augmented Generation for Controllable Traffic Scenarios", 474 "authors": [ 475 "W. Ding", 476 "Y. Cao", 477 "D. Zhao", 478 "C. Xiao", 479 "M. Pavone" 480 ], 481 "year": 2024, 482 "relevance": "Related controllable traffic scenario generation approach using retrieval augmented generation, relevant to benchmarking." 483 } 484 ] 485 }