scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25724B)
      1 {
      2   "paper": {
      3     "title": "LD-Scene: LLM-Guided Diffusion for Controllable Generation of Adversarial Safety-Critical Driving Scenarios",
      4     "authors": [
      5       "Mingxing Peng",
      6       "Yuting Xie",
      7       "Xusen Guo",
      8       "Ruoyu Yao",
      9       "Hai Yang",
     10       "Jun Ma"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv (preprint submitted to Elsevier)",
     14     "arxiv_id": "2505.11247",
     15     "doi": "10.48550/arXiv.2505.11247"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper uses the publicly available nuScenes dataset (Caesar et al., 2020), a standard public benchmark. Section 4.1 states 'We conduct our experiments on the nuScenes dataset.'"
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Section 4.3 mentions 'PyTorch framework' and 'four GeForce RTX 4090 GPUs' but provides no requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The implementation details in Section 4.3 give some hyperparameters but are insufficient for reproduction without code."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Tables 1, 2, and 3 report only point estimates with no confidence intervals, error bars, or ± notation."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims LD-Scene 'outperforms baseline models' and shows 'significant advantages' (Section 4.4) but provides no statistical significance tests (p-values, t-tests, etc.) to support these comparative claims."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table 1 provides raw performance numbers for all methods (e.g., Adv-Ego Coll: LD-Scene 40.75% vs Safe-Sim 27.81%, Strive 22.69%), giving full baseline context to assess the magnitude of improvements."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the number of scenarios evaluated. The nuScenes validation split size is used as-is without discussion of whether it is sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No standard deviation, variance, or spread measures are reported across experimental runs. All results appear to be from single runs."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Four baselines are compared: AdvSim (Wang et al., 2021), Strive (Rempe et al., 2022), DiffScene (Xu et al., 2023), and Safe-Sim (Chang et al., 2024), as shown in Table 1."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Safe-Sim (2024) and DiffScene (2023) are recent baselines. Strive (2022) and AdvSim (2021) are slightly older but represent established approaches in this specific subfield. The selection appears reasonable."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 2 presents an ablation study on guidance components (Other-real, Adv-real, Adv guidance). Section 4.5.2 and Figure 5 ablate the debugger module across multiple LLMs with 500 queries."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple metrics are used across three categories: adversariality (Adv-Ego Coll, Adv Acc), behavior plausibility (offroad rates, collision rates), and efficiency (Sim Time). Table 1 reports 8 distinct metrics."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation is conducted. The paper claims scenarios are 'realistic' and 'user-friendly' but relies entirely on automated metrics. Human judgment of scenario realism and usefulness would strengthen the claims."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 4.1 states 'We train our models using the training split and evaluate them on the validation split of nuScenes,' following standard nuScenes prediction challenge guidelines."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by vehicle type (adversarial vs other), collision type (Adv-Ego, Adv-Other, Other-Ego, Other-Other), adversarial level (weak/medium/strong in Table 3), and behavior type (case studies in Fig. 7)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No systematic failure analysis is provided. The paper does not discuss scenarios where LD-Scene fails to generate adversarial situations or produces implausible results."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4.6.1 reports that the strong adversarial level (39.33%) does not outperform medium (40.75%) in collision rate. Section 4.7 and Fig. 8(b) show that increasing diffusion steps degrades both adversariality and realism."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims 'state-of-the-art performance' (supported by Table 1 showing best Adv-Ego Coll) and 'fine-grained control over adversarial behaviors' (supported by controllability studies in Sections 4.6.1 and 4.6.2)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper makes causal claims through ablation studies (Table 2) showing the contribution of each guidance component via controlled single-variable manipulation. The ablation design is adequate for these causal claims."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title and framing claim general 'controllable generation of adversarial safety-critical driving scenarios,' but results are limited to nuScenes with a single rule-based planner. No discussion of generalization to other datasets, cities, or planner types."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations for the results are discussed. For example, performance gains could partly stem from GPT-4o's code generation quality rather than the framework design, but this is not considered."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper's measurements (collision rate, offroad rate, acceleration, TTC) directly correspond to what is claimed (adversariality, behavior plausibility). The proxy-measurement gap is minimal — the metrics measure what the paper claims to evaluate."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Section 4.3 states 'both the code generator and debugger used in generating guidance utilize the GPT-4o model' without specifying a version, snapshot date, or API version. 'GPT-4o' is a product-family name that covers multiple dated snapshots, so it does not identify the exact model used."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Figure 2 provides detailed prompts: (a) system prompt with task/inputs/objectives/instructions, (b) code generation template, (c) reasoning prompt with step-by-step instructions, and (d) debugger prompt. These appear to be the actual prompts used."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 4.3 reports: learning rate 5×10⁻⁴, Adam optimizer, 200 training epochs, 20 diffusion steps, 10 test samples. Training on 4× RTX 4090 GPUs for 6 hours."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The LLM-based guidance generation module is described in detail in Section 3.3, including the CoT code generator workflow (3-step reasoning: adversarial level → loss weights → code), the code debugger with closed-loop unit testing, and the integration with the diffusion model."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "Section 4.1 states nuScenes has 1,000 scenes and mentions the train/val split, but does not describe any preprocessing, filtering, or how many scenarios were actually used for evaluation. No filtering criteria or data transformation steps are documented."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper has no limitations, threats-to-validity, or similar section. The conclusion (Section 5) makes only positive claims without acknowledging any limitations."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity are discussed anywhere in the paper."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show (e.g., generalization to other datasets, other planners, real-world deployment)."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "While nuScenes is public, the paper's own experimental outputs (generated scenarios, guidance functions, intermediate results) are not made available for independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4.1 describes the nuScenes dataset: 1,000 scenes, 20 seconds each at 2 Hz, 5.5 hours of urban driving from Boston and Singapore, with train/val split details."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. The data source is a standard public benchmark (nuScenes)."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper does not document how nuScenes data flows from raw format to model inputs, what filtering or selection was applied, or how many scenarios were ultimately used for evaluation."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source or acknowledgments section is present in the paper."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: HKUST Guangzhou, Sun Yat-sen University, and HKUST Hong Kong."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of a funding statement is a gap in transparency."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial disclosure statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper evaluates a custom system (LD-Scene) rather than a pre-trained model's capability on a benchmark. The diffusion model is trained by the authors with known data splits, and GPT-4o is used as a code generation tool, not evaluated for benchmark knowledge."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Same as above: the paper tests a tool/system rather than evaluating a pre-trained model's knowledge on a benchmark. Standard train/val splits are used for the custom diffusion model."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Same as above: contamination concerns about pre-trained models seeing benchmark data do not structurally apply to this evaluation setup."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Table 1 reports Sim Time (229.40s for LD-Scene). Figure 5(b) shows token consumption per LLM model, and Figure 5(c) shows total cost per LLM model for guidance generation."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Section 4.3 states 'trained on four GeForce RTX 4090 GPUs for six hours' and provides details on the number of diffusion steps and training epochs."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds anywhere in the paper."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs producing the main results is not stated. Section 4.3 mentions '10 test samples' per scenario but this is the inference sample count, not repeated experimental runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No hyperparameter search budget is reported. The paper presents final hyperparameters (learning rate, diffusion steps, etc.) without describing how they were selected."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The final configuration (20 diffusion steps, 10 samples, etc.) is presented without justification for how it was selected, beyond the parameter sensitivity analysis in Section 4.7 which appears post-hoc."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Section 4.3 notes 'our re-implementation of AdvSim' but does not acknowledge the bias of evaluating their own system against their own re-implementations of baselines."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "While Sim Time is reported in Table 1, the paper does not discuss the fairness of computational comparisons. LD-Scene uses GPT-4o API calls (additional cost) that baselines do not, and this is not factored into performance comparisons."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper does not discuss whether nuScenes adequately measures what is claimed. No analysis of whether automated metrics (collision rate, offroad rate) truly capture scenario realism and adversarial effectiveness."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "LD-Scene is evaluated as a complete bundled system. The comparison is between different complete systems (LD-Scene vs Safe-Sim vs Strive etc.), not between models within different scaffolds. The scaffold IS the thing being tested."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of temporal leakage. The nuScenes dataset was collected before GPT-4o's training, and GPT-4o may have encountered nuScenes-related code or analysis patterns during training. Not addressed."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the evaluation setup provides information not available in real usage scenarios."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "While standard nuScenes train/val splits are used, no discussion of potential non-independence between training and test scenarios (e.g., same locations, similar traffic patterns)."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No concrete leakage detection or prevention method is described or applied."
    362       }
    363     }
    364   },
    365   "scan_version": 2,
    366   "active_modules": ["experimental_rigor", "data_leakage"],
    367   "claims": [
    368     {
    369       "claim": "LD-Scene achieves state-of-the-art adversarial collision rate (40.75%) on nuScenes, substantially higher than all baselines.",
    370       "evidence": "Table 1 shows Adv-Ego Coll: LD-Scene 40.75% vs Safe-Sim 27.81%, AdvSim 24.72%, Strive 22.69%, DiffScene 15.06%.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "LD-Scene generates more realistic adversarial scenarios with lower off-road rates than baselines.",
    375       "evidence": "Table 1 shows Adv Offroad: LD-Scene 12.52% vs AdvSim 15.60%, Strive 18.94%, DiffScene 19.71%, Safe-Sim 21.79%.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The debugger module significantly improves code generation success rates across all LLMs tested.",
    380       "evidence": "Figure 5(a) shows GPT-4o success rate increases from 69.4% to 95.0% with the debugger. Evaluated on 500 automatically generated user queries (Section 4.5.2).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LD-Scene enables controllable adversarial level generation with progressively aggressive behaviors from weak to strong.",
    385       "evidence": "Table 3 shows TTC decreasing (2.06→1.98→1.91s) and acceleration increasing across weak/medium/strong levels. However, collision rate does not increase monotonically (30.63%, 40.75%, 39.33%).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "LD-Scene can generate diverse controllable adversarial behaviors (normal collision, high-speed overtaking, sharp turns) based on natural language queries.",
    390       "evidence": "Figure 7 shows three case studies with different query types and corresponding generated scenarios. Only qualitative examples are provided.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "methodology_tags": ["benchmark-eval", "case-study"],
    395   "key_findings": "LD-Scene integrates LLMs with Latent Diffusion Models for adversarial driving scenario generation, achieving 40.75% adversarial-ego collision rate on nuScenes (vs 27.81% for the best baseline Safe-Sim) while maintaining lower off-road rates. The LLM-based code debugger improves guidance function generation success from 69.4% to 95.0% for GPT-4o across 500 queries. The framework enables natural language control over adversarial intensity levels, though strong-level queries do not always produce higher collision rates than medium-level ones.",
    396   "red_flags": [
    397     {
    398       "flag": "No uncertainty quantification",
    399       "detail": "All results in Tables 1-3 are point estimates with no error bars, confidence intervals, standard deviations, or significance tests. Claims of outperformance are based on single-number comparisons."
    400     },
    401     {
    402       "flag": "No limitations section",
    403       "detail": "The paper contains no dedicated limitations, threats-to-validity, or scope-bounding discussion. The conclusion is entirely positive with no acknowledgment of weaknesses."
    404     },
    405     {
    406       "flag": "Re-implementation bias",
    407       "detail": "AdvSim is re-implemented by the authors (Section 4.3). The paper does not acknowledge the bias of evaluating their system against their own re-implementation of a baseline, which may systematically underperform."
    408     },
    409     {
    410       "flag": "Single planner evaluation",
    411       "detail": "All experiments use a single rule-based lane-graph planner. Results may not generalize to learning-based or other planners, limiting the practical applicability of the conclusions."
    412     },
    413     {
    414       "flag": "LLM version unspecified",
    415       "detail": "GPT-4o is used without specifying a version or snapshot date. GPT-4o behavior changes across versions, making reproduction uncertain."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "GPT-4 technical report",
    421       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    422       "year": 2023,
    423       "arxiv_id": "2303.08774",
    424       "relevance": "Foundation LLM capabilities paper, cited for natural language understanding and reasoning abilities used in the LD-Scene framework."
    425     },
    426     {
    427       "title": "DeepSeek-Coder: When the large language model meets programming–the rise of code intelligence",
    428       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    429       "year": 2024,
    430       "arxiv_id": "2401.14196",
    431       "relevance": "LLM code generation capabilities relevant to the survey's coverage of AI programming and code generation."
    432     },
    433     {
    434       "title": "Sparks of artificial general intelligence: Early experiments with gpt-4",
    435       "authors": ["Sébastien Bubeck"],
    436       "year": 2023,
    437       "arxiv_id": "2303.12712",
    438       "relevance": "Early GPT-4 capability assessment relevant to understanding LLM capabilities in reasoning and code generation."
    439     },
    440     {
    441       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    442       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    443       "year": 2022,
    444       "relevance": "Foundational CoT prompting technique used in LD-Scene's code generator for structured reasoning."
    445     },
    446     {
    447       "title": "Large language models are zero-shot reasoners",
    448       "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid"],
    449       "year": 2022,
    450       "relevance": "Zero-shot reasoning capabilities of LLMs, relevant to the survey's assessment of LLM reasoning and prompting methods."
    451     },
    452     {
    453       "title": "Language-guided traffic simulation via scene-level diffusion (CTG++)",
    454       "authors": ["Ziyuan Zhong", "Davis Rempe", "Yuxiao Chen"],
    455       "year": 2023,
    456       "relevance": "Closely related prior work (not one of the baselines compared in Table 1) on LLM-guided traffic simulation using language-based guidance for diffusion models."
    457     },
    458     {
    459       "title": "LLM4Drive: A survey of large language models for autonomous driving",
    460       "authors": ["Zhenjie Yang", "Xiaosong Jia", "Hongyang Li"],
    461       "year": 2023,
    462       "arxiv_id": "2311.01043",
    463       "relevance": "Survey of LLM applications in autonomous driving, relevant to the survey's scope on agentic AI applications."
    464     },
    465     {
    466       "title": "DiLu: A knowledge-driven approach to autonomous driving with large language models",
    467       "authors": ["Licheng Wen", "Daocheng Fu", "Xin Li"],
    468       "year": 2023,
    469       "arxiv_id": "2309.16292",
    470       "relevance": "LLM-based autonomous driving approach demonstrating knowledge-driven decision-making capabilities."
    471     },
    472     {
    473       "title": "ChatScene: Knowledge-enabled safety-critical scenario generation for autonomous vehicles",
    474       "authors": ["Jiawei Zhang", "Chejian Xu", "Bo Li"],
    475       "year": 2024,
    476       "relevance": "LLM-based agent for scenario generation using natural language descriptions converted to executable Scenic code."
    477     }
    478   ]
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs