scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32429B)
      1 {
      2   "paper": {
      3     "title": "TARGET: Traffic Rule-based Test Generation for Autonomous Driving via Validated LLM-Guided Knowledge Extraction",
      4     "authors": [
      5       "Yao Deng",
      6       "Zhi Tu",
      7       "Jiaohong Yao",
      8       "Mengshi Zhang",
      9       "Tianyi Zhang",
     10       "Xi Zheng"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE Transactions on Software Engineering",
     14     "arxiv_id": "2305.06018",
     15     "doi": "10.1109/TSE.2025.3569086"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "case-study"],
     20   "key_findings": "TARGET automatically generates 284 executable test scenarios from 54 traffic rules and uncovers 610 erroneous behaviors across 7 ADSs on 3 simulator platforms. The three-step LLM parsing pipeline (knowledge extraction, validation, syntax alignment) with GPT-4 achieves 100% rule-level accuracy on 90/98 benchmarked rules, outperforming GPT-4 with knowledge extraction alone (71/98). Human evaluation with Fleiss' Kappa 0.68 confirms generated scenarios substantially match traffic rule descriptions. Two issues were confirmed by ADS developers.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section I states 'The code is open-sourced at https://zenodo.org/records/14346539' with a working Zenodo archive URL."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The traffic rule benchmark is derived from the publicly available Texas Driver Handbook. Ground truth scenario representations and supplementary materials are provided via a supplementary material link (https://shorturl.at/gFPX3). The Zenodo archive also contains project data."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions CARLA, LGSVL, MetaDrive simulators and RTX4090 for Llama experiments, but provides no requirements.txt, Dockerfile, conda environment file, or detailed dependency listing with library versions sufficient to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper describes the methodology in detail (Algorithm 1, prompt templates, DSL grammar) but provides no step-by-step reproduction instructions, README with commands, or scripts to replicate the main experiments within the paper itself."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results are reported as point estimates: counts (610 errors, 90/98 rules) and percentages (Table VII component accuracies) without confidence intervals, error bars, or uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Comparisons between LLM parsers (GPT-4 vs GPT-3.5 vs Llama) and ablation variants are made by comparing raw accuracy numbers without any statistical significance tests (no p-values, t-tests, or other tests)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Table VII provides component-level accuracies for all parsers side by side, giving baseline context for improvements. Section V-A reports that rules at 100% accuracy improved from 71 (GPT-4-KE) to 90 (GPT-4-KE-KV-SA). Raw accuracy differences with baselines are provided throughout."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for why 98 traffic rules from one handbook, 54 rules for scenario generation, or the human study sample sizes (36, 15, 27, 16, 10 students) are sufficient. No power analysis discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "All LLM parsing results appear to be single-run. No standard deviation, variance, or spread measures across multiple runs are reported, despite LLM outputs being stochastic."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section V-A compares with LawBreaker's manual rule parsing and evaluates multiple LLMs (GPT-3, GPT-3.5, GPT-4, Llama3.1-8b, Llama3.1-70b). An ablation study compares pipeline variants (KE, KE-KV, KE-KV-SA, KE without few-shot)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "LawBreaker (ICSE 2022) is the most directly related prior work. GPT-4 and Llama 3.1 (2024) are contemporary LLMs. The baselines represent the relevant state of the art for this task."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Section V-A presents a systematic ablation: GPT-4-KE (extraction only), GPT-4-KE(-FL) (without few-shot learning), GPT-4-KE-KV (with validation), GPT-4-KE-KV-SA (full pipeline). Each pipeline component's contribution is measured."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are used: rule-level accuracy, component-level accuracy (Formula 1-2), compile error rate, runtime error rate, normal execution rate, human evaluation (5-point scale), weighted Fleiss' Kappa, and rule violation/collision/timeout counts."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section IV-C2 describes two human evaluation studies: (1) software engineering students across 5 classes (36+15+27+16+10 participants) and (2) Prolific workers with driving experience (20 per survey × 5 surveys). Participants rated scenario-rule consistency on a 5-point scale."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The few-shot learning example is drawn from the same 98-rule benchmark used for evaluation. No explicit separation of the few-shot example from the test set is described. All 98 rules are evaluated ('For each traffic rule in the benchmark, we applied the rule parser')."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table VII provides per-subcomponent accuracy breakdowns (Weather, Time, Road type, Road marker, Traffic sign, Type, Behavior, Position reference, Position relation, Longitudinal oracle, Lateral oracle). Table VIII breaks down by ADS and violation type."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section V-A analyzes wrong parsing results (Road type inference failures, confusion between current and expected behaviors, wrong reference frames). Section V-B discusses mismatched scenarios (weather rendering, road marker annotation errors). Section V-C provides root cause analysis of ADS failures."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table I reports the pilot study where GPT-4 failed to directly generate Scenic (0/13 executable) and CARLA Scenario Runner scripts (0/8 executable). Section V-B reports 7% compile errors and 3% runtime errors. 5 of 38 LawBreaker rules could not be parsed due to DSL limitations."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims are supported: 610 erroneous behaviors (Table VIII sums to 610), 284 scenarios from 54 rules across 7 ADSs (Section IV-B), two issues confirmed by developers (Section V-C). Rule violation and collision detection claims match the results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper's causal claims about pipeline component effectiveness are supported by controlled ablation (GPT-4-KE → KE-KV → KE-KV-SA, Section V-A). Each ablation adds one component while holding others constant, which is adequate for the causal claim that each component improves accuracy."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Traffic Rule-based Test Generation for Autonomous Driving' is broad, but all traffic rules come from the Texas Driver Handbook only. The abstract claims 'automatically generates test scenarios from traffic rules' without bounding to Texas-specific rules. Section VII acknowledges this limitation but the title/abstract do not."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section VII discusses multiple threats: internal validity (whether GPT-4 alone is sufficient, addressed via ablation), external validity (generalizability to other regions/simulators), and construct validity (appropriateness of human evaluation metrics, Fleiss' Kappa adaptation). Section V-B identifies simulator-specific explanations for mismatches."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper's claims align with its measurements: parsing accuracy measures parsing quality directly, execution rate measures executability directly, and human evaluation explicitly measures scenario-rule consistency. The paper does not frame narrow metrics as broader capabilities."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper refers to 'GPT-4', 'GPT-3.5', and 'GPT-3' without specifying version snapshots (e.g., gpt-4-0613). Llama 3.1 versions are specified by parameter count (8B, 70B) but without exact model IDs or snapshot dates."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Prompt templates are provided in Tables III, IV, and V with structural detail. The DSL grammar (Figure 3) and element lists that fill the templates are also in the paper. Full prompt details are available in supplementary material (referenced with URL)."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No LLM API hyperparameters are reported: temperature, top-p, max tokens, or other sampling parameters for GPT-4 or other models are not mentioned anywhere in the paper."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The three-step pipeline (knowledge extraction → knowledge validation → syntax alignment) is described in detail in Section III-C with workflow diagram (Figure 5), prompt templates (Tables III-V), and the connecting logic between stages."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section IV-B1 documents traffic rule filtering: 14 chapters → Chapters 4-9, then excluded rules with figures (18), non-driving scenarios (16), non-vehicular behaviors (8), yielding 98 rules. Section IV-B2 describes map selection and customization for simulators."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section VII 'Threats to Validity' provides a dedicated, multi-paragraph discussion covering external, internal, and construct validity threats."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VII discusses study-specific threats: using only Texas rules, specific ADSs tested, GPT-4 as the primary LLM, Fleiss' Kappa adaptation for >20 raters (noting higher difficulty of agreement vs standard 2-rater scenario), and DSL limitations for consecutive behaviors."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section VII explicitly states 'we applied the most relevant rules from the Texas driver handbook' and lists specific untested scenarios (consecutive driving behaviors, multi-modal traffic rules with figures). Plans to expand to other regions are noted as future work."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "While code is on Zenodo and benchmark rules are in supplementary material, raw experimental data (LLM outputs per rule, scenario execution logs, human study response data, scenario recordings) are not made available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section IV-B1 describes traffic rule benchmark creation from the Texas Driver Handbook with explicit inclusion/exclusion criteria. Section IV-B2 describes simulator and map selection. Section IV-C2 describes human study data collection procedure."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section IV-C2 describes recruitment: 'software engineering students across five practical testing classes at a public university' (with per-class counts) and Prolific platform workers with requirements of 'driving license and driving experience' (20 per survey)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented with counts at each stage: 98 rules → 54 supported in CARLA / 40 in LGSVL → 217 CARLA scenarios / 124 LGSVL / 27 MetaDrive → manual filtering for RQ3 yielding 169/88/27 valid scenarios. Ground truth creation process and consensus procedure are described."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgments, grants, or sponsorship information is found anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Macquarie University (Deng, Yao, Zheng), Purdue University (Tu, T. Zhang), and TensorBlock (M. Zhang). Xi Zheng is noted as corresponding author."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding information is disclosed, so funder independence cannot be assessed. One author (Mengshi Zhang) is affiliated with TensorBlock, a commercial entity, but no funding relationship is described."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper evaluates GPT-4's ability to parse traffic rules but does not state GPT-4's training data cutoff date. This is relevant because the Texas Driver Handbook (2022) is publicly available and likely in GPT-4's training data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether GPT-4's training data included the Texas Driver Handbook or similar traffic rules. Since the handbook is a widely available public document, overlap is highly likely but unaddressed."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The Texas Driver Handbook was published in 2022 and is publicly available online. GPT-4 was likely trained on data that includes this document, but no contamination risk analysis is provided."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No pre-registration of the human evaluation study is mentioned. No link to OSF, AsPredicted, or any registry."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "The paper states 'Informed consent was obtained from all participants' (Section IV-C2) but does not mention IRB or ethics board approval."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "Participants are described only as 'software engineering students' and Prolific workers with 'driving license and driving experience.' No detailed demographics (experience level, age, gender, programming experience, driving frequency) are reported."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "Inclusion criteria are stated: students enrolled in software engineering testing classes at a public university, and Prolific workers required to have 'a driving license and driving experience' (Section IV-C2)."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a survey-based human evaluation, not an experimental study with treatment and control conditions. Participants all evaluated the same scenarios within their assigned survey."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a survey-based evaluation study, not an experimental study where blinding would be applicable. Participants evaluated scenario-rule consistency without treatment conditions."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Per-class participant numbers are given (36, 15, 27, 16, 10) and attributed to 'class sizes and willingness to participate', but no information is provided on dropouts, incomplete surveys, or how many participants started vs finished."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Section VIII-C reports 'the average cost to parse a single rule is about $0.05' and references OpenAI API pricing at '$10 per million tokens.' Per-unit inference cost is clearly stated."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is mentioned (RTX4090 for Llama) and per-unit cost ($0.05/rule) is given, but total computational budget (total GPU hours, total API spend, simulation compute time) for the full experimental evaluation is not stated."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or stochastic variation analysis. LLM parsing results appear to be single-run despite LLM outputs being non-deterministic."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper does not state how many times LLM parsing was run per traffic rule. No mention of averaging over multiple runs or single-run reporting."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search is described. The prompt templates and pipeline structure appear fixed without discussion of how they were developed or tuned."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The few-shot example used in the prompt is not justified — no discussion of how it was selected or whether different examples yield different results. The pipeline configuration appears fixed without explaining the selection process."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Multiple comparisons are made across 8 parsers and 11 subcomponents (Table VII) without any statistical tests at all, let alone multiple comparison corrections."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own system (TARGET) against their own implementations and ground truth without acknowledging potential self-evaluation bias. Ground truth was created by the authors themselves."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "GPT-4 is compared with GPT-3, GPT-3.5, and Llama models that have vastly different compute costs, but performance is not analyzed as a function of compute budget."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether parsing accuracy on 98 rules from the Texas Driver Handbook is a valid proxy for general traffic rule parsing capability. The construct validity of the benchmark itself is not examined."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "When comparing different LLMs (GPT-4, GPT-3.5, GPT-3, Llama variants), the same pipeline scaffold (KE-KV-SA or KE) is used across all models, keeping the scaffold constant. The ablation study explicitly tests the scaffold components."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The Texas Driver Handbook (2022) predates GPT-4's training cutoff. GPT-4 may have seen both the traffic rules and similar scenario representations during training. This temporal leakage is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the few-shot example in the prompt leaks patterns about the expected output format beyond what the DSL specification provides, or whether the element list constrains outputs in ways that inflate accuracy."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the 98 traffic rules are independent of each other or of GPT-4's training data. Many rules from the same handbook may share linguistic patterns that inflate apparent parsing ability."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference tests, or temporal splits are applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "TARGET's rule parser (GPT-4-KE-KV-SA) achieves 100% accuracy on 90 out of 98 traffic rules",
    372       "evidence": "Section V-A reports rule-level accuracy results: GPT-4-KE achieves 71, GPT-4-KE-KV achieves 73, GPT-4-KE-KV-SA achieves 90 rules at 100% accuracy. Table VII provides component-level breakdown.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "TARGET generates executable test scenarios with 90% normal execution rate",
    377       "evidence": "Section V-B1 reports '90% executed normally, while 7% had compile errors, and 3% had runtime errors' for generated scenarios. Most errors attributed to CARLA simulator bugs.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "TARGET uncovered 610 erroneous behaviors including rule violations, collisions, and timeouts across 7 ADSs",
    382       "evidence": "Table VIII provides per-ADS breakdown: Auto (75+10+38), MMFN (72+43+17), LAV (46+2+55), Autoware (68+88+21), Apollo 7.0 (7+8+4), IDM (18+15+0), PPO (14+9+0). Sum is 610.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "The three-step pipeline (KE → KV → SA) significantly improves parsing over extraction alone",
    387       "evidence": "Section V-A ablation: GPT-4-KE (71/98 at 100%), GPT-4-KE-KV (73/98), GPT-4-KE-KV-SA (90/98). Component-level improvements shown in Table VII, especially for Position reference and Position relation.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Generated scenarios substantially match traffic rule descriptions per human evaluation",
    392       "evidence": "Section V-B2, Figures 7-8: 50/75 scenarios were voted at least partial match by students, and 49/75 by Prolific workers, without 'Most not match' or 'Totally not match' votes. Mean Fleiss' Kappa of 0.68 (students) and 0.67 (workers).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Two detected issues were confirmed by ADS developers",
    397       "evidence": "Section V-C: LAV's conservative driving behavior 'was reported to the authors and was confirmed.' MMFN's localization problem 'has been confirmed by the authors of MMFN.' Autoware collision issue matched a reported GitHub bug.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No statistical significance tests",
    404       "detail": "All comparisons between LLM parsers and pipeline variants are raw accuracy numbers without any statistical tests. With stochastic LLM outputs and modest sample sizes (98 rules), observed differences could reflect noise rather than genuine improvements."
    405     },
    406     {
    407       "flag": "Single-run LLM evaluation",
    408       "detail": "LLM outputs are non-deterministic, but no results are reported across multiple runs. The stochastic nature of GPT-4 means the rule-parsing accuracy numbers could vary significantly across runs, yet no variance is reported."
    409     },
    410     {
    411       "flag": "Contamination risk with public benchmark",
    412       "detail": "The Texas Driver Handbook is a widely available public document likely present in GPT-4's training data. GPT-4's strong performance on parsing these rules may partly reflect memorization rather than genuine understanding."
    413     },
    414     {
    415       "flag": "Self-created ground truth",
    416       "detail": "Ground truth scenario representations were created by the paper's authors, who also designed the DSL and evaluation criteria. While they describe a consensus process ('All authors made these independently, reviewed each other's work'), this creates potential for bias in both benchmark and evaluation."
    417     },
    418     {
    419       "flag": "No IRB approval for human study",
    420       "detail": "Human evaluation studies involving 100+ university students and paid Prolific workers mention informed consent but not IRB or ethics board approval."
    421     },
    422     {
    423       "flag": "Uneven human study sample sizes",
    424       "detail": "Student participant numbers vary widely across surveys (36, 15, 27, 16, 10) due to class sizes, potentially introducing uneven statistical power across the 5 survey groups."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Lawbreaker: An approach for specifying traffic laws and fuzzing autonomous vehicles",
    430       "authors": ["Y. Sun", "C. M. Poskitt"],
    431       "year": 2022,
    432       "relevance": "Closest prior work using traffic laws for ADS test generation via manual STL encoding and fuzzing."
    433     },
    434     {
    435       "title": "Scenic: A language for scenario specification and scene generation",
    436       "authors": ["D. J. Fremont", "T. Dreossi"],
    437       "year": 2019,
    438       "doi": "10.1145/3314221.3314633",
    439       "relevance": "Prominent DSL for driving scenario specification that TARGET's simpler DSL aims to replace for LLM-based generation."
    440     },
    441     {
    442       "title": "Generating effective test cases for self-driving cars from police reports",
    443       "authors": ["A. Gambi", "T. Huynh", "G. Fraser"],
    444       "year": 2019,
    445       "relevance": "AC3R uses NLP to construct ADS test scenarios from crash reports — related NLP-to-scenario pipeline."
    446     },
    447     {
    448       "title": "Specification-based autonomous driving system testing",
    449       "authors": ["Y. Zhou", "Y. Sun"],
    450       "year": 2023,
    451       "relevance": "AVUnit framework for systematic ADS testing using specification languages (SCENEST + AVSpec)."
    452     },
    453     {
    454       "title": "Mosat: finding safety violations of autonomous driving systems using multi-objective genetic algorithm",
    455       "authors": ["H. Tian", "Y. Jiang"],
    456       "year": 2022,
    457       "relevance": "Search-based ADS test generation using multi-objective optimization that TARGET-generated scenarios could seed."
    458     },
    459     {
    460       "title": "SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models",
    461       "authors": ["P. Manakul", "A. Liusie", "M. J. Gales"],
    462       "year": 2023,
    463       "arxiv_id": "2303.08896",
    464       "relevance": "LLM self-checking for hallucination, directly relevant to TARGET's knowledge validation step."
    465     },
    466     {
    467       "title": "Lost in the middle: How language models use long contexts",
    468       "authors": ["N. F. Liu", "K. Lin", "J. Hewitt"],
    469       "year": 2024,
    470       "relevance": "Motivates TARGET's modular prompting design to reduce input context length and mitigate LLM hallucination."
    471     },
    472     {
    473       "title": "Hallucination is inevitable: An innate limitation of large language models",
    474       "authors": ["Z. Xu", "S. Jain", "M. Kankanhalli"],
    475       "year": 2024,
    476       "arxiv_id": "2401.11817",
    477       "relevance": "Theoretical analysis of LLM hallucination that motivates TARGET's validation pipeline design."
    478     },
    479     {
    480       "title": "RMT: Rule-based metamorphic testing for autonomous driving models",
    481       "authors": ["Y. Deng", "X. Zheng", "T. Zhang", "G. Lou", "M. Kim"],
    482       "year": 2020,
    483       "arxiv_id": "2012.10672",
    484       "relevance": "Prior work by the same authors using traditional NLP for extracting information from simplified traffic rules for ADS testing."
    485     },
    486     {
    487       "title": "SCTrans: Constructing a large public scenario dataset for simulation testing of autonomous driving systems",
    488       "authors": ["J. Dai", "B. Gao"],
    489       "year": 2024,
    490       "relevance": "Demonstrates effectiveness of using TARGET-generated scenarios as seeds for fuzzing-based ADS test generation."
    491     },
    492     {
    493       "title": "AV-Fuzzer: Finding safety violations in autonomous driving systems",
    494       "authors": ["G. Li", "Y. Li"],
    495       "year": 2020,
    496       "relevance": "Fuzzing-based approach for finding ADS safety violations, complementary to TARGET's rule-based scenario generation."
    497     },
    498     {
    499       "title": "A systematic study and comprehensive evaluation of ChatGPT on benchmark datasets",
    500       "authors": ["M. T. R. Laskar", "M. S. Bari"],
    501       "year": 2023,
    502       "arxiv_id": "2305.18486",
    503       "relevance": "Evaluates LLM capabilities on NLP tasks, supporting TARGET's use of GPT-4 for knowledge extraction."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "ADS testers could use TARGET to automate scenario generation from traffic rules, though it requires simulator infrastructure (CARLA, LGSVL) and LLM API access."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "Demonstrates that a simpler DSL intermediary outperforms direct LLM code generation for ADS scenarios, a useful but not paradigm-shifting insight."
    514     },
    515     "fear_safety": {
    516       "score": 1,
    517       "justification": "Indirectly related to autonomous vehicle safety by finding ADS bugs, but does not raise novel AI risk or security concerns."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or conflict; a straightforward systems paper proposing and evaluating a test generation framework."
    522     },
    523     "demo_ability": {
    524       "score": 2,
    525       "justification": "Code is released on Zenodo, but requires CARLA/LGSVL simulator setup and GPT-4 API access to run."
    526     },
    527     "brand_recognition": {
    528       "score": 1,
    529       "justification": "Uses GPT-4 and is published in IEEE TSE, but the lab (Macquarie University/Purdue) is not a high-profile AI brand."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs