scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33063B)
      1 {
      2   "paper": {
      3     "title": "Driving Style Alignment for LLM-powered Driver Agent",
      4     "authors": [
      5       "Ruoxuan Yang",
      6       "Xinyue Zhang",
      7       "Anais Fernandez-Laaksonen",
      8       "Xin Ding",
      9       "Jiangtao Gong"
     10     ],
     11     "year": 2024,
     12     "venue": "IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)",
     13     "arxiv_id": "2403.11368",
     14     "doi": "10.1109/IROS58592.2024.10802629"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval",
     22     "qualitative"
     23   ],
     24   "key_findings": "A multi-alignment framework combining human demonstrations and Coach Agent feedback can align LLM-powered driver agents (GPT-4) with distinct human driving styles (cautious vs. risky) in CARLA simulation. The full multi-alignment method outperforms demonstrations-only or feedback-only, producing the most differentiated collision rates, speeds, and throttle/brake behavior. Human evaluators (n=259) perceive clear riskiness distinctions between aligned agents, and interestingly associate higher riskiness with greater human-likeness (r=0.17) while inversely correlating riskiness with perceived intelligence (r=-0.59).",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Two GitHub repositories are provided: https://github.com/AIR-DISCOVER/Multi-alignment-Drivng-Agent (framework) and https://github.com/AIR-DISCOVER/Driving-Thinking-Dataset (dataset), listed in footnotes on page 1."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The Driving-Thinking-Dataset is provided via a GitHub link (footnote 2). This contains the natural language driving behavior demonstrations."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper mentions CARLA 0.9.14, Python 3.7, Unreal Engine 4, and a ThundeRobot Zero desktop, but no requirements.txt, Dockerfile, or library dependency list is provided in the paper itself. Not enough detail to recreate the full environment."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub links are provided but the paper itself contains no README-level reproduction guide."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Simulation results (Fig 3) report point estimates for collision rates, speeds, throttle/brake percentages with no confidence intervals or error bars. Human evaluation reports significance stars but no CIs on the metrics."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The human evaluation (Fig 4a) reports statistical significance between ranking conditions using p-value thresholds (**** p<0.0001, ** p<0.01, ns p≥0.05). Pearson correlation significance is also reported (Fig 4b)."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Raw metric values are reported per condition (collision rates, speeds in km/h, throttle/brake percentages), providing magnitude context. Pearson correlation coefficients (r=-0.59, r=0.17, r=0.10) are reported for human evaluation scores, giving effect size information."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No power analysis or other justification is given for the sample sizes: 24 drivers in the naturalistic experiment, ~6.7 minutes of simulation per condition, and 270 survey participants. The paper does not discuss whether these are adequate."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No standard deviations, interquartile ranges, or variance measures reported for simulation metrics (collision rates, speed, throttle, brake). Only point estimates are shown in Fig 3."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The NOT-ALIGNED condition serves as a baseline, and the ablation design compares DEMONSTRATIONS-only, FEEDBACK-only, and full MULTI-ALIGNMENT methods against each other (Section IV-A, Fig 2)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No external baselines from prior work on driving style alignment are compared. All comparisons are between variants of the proposed framework. The paper references other LLM driving agent work but does not benchmark against them."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The experiment is structured as an ablation: DEMONSTRATIONS (only demonstrations to Driver Agent), FEEDBACK (only Coach Agent feedback), and MULTI-ALIGNMENT (both). This isolates the contribution of each component (Section IV-A)."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple metrics are used: collision rate per meter, average speed, throttle percentage, brake percentage (simulation), plus riskiness ranking, intelligence score, human-likeness score (human evaluation)."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "A human evaluation with 259 valid participants assessed Driver Agent performance through video clips, collecting riskiness rankings along with intelligence and human-likeness scores (Section IV-C)."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "There is no explicit held-out test set. Demonstrations come from real-world driving, and evaluation is in CARLA simulation with randomly generated endpoints. While the domains differ, there is no explicit discussion of train/test separation or potential overlap between demonstration scenarios and evaluation scenarios."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down by both driving style (CAUTIOUS, RISKY, NOT-ALIGNED) and alignment method (DEMONSTRATIONS, FEEDBACK, MULTI-ALIGNMENT) in Fig 3 and Fig 4."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "No dedicated failure analysis. The paper mentions collision rates but does not analyze specific failure scenarios, collision causes, or when the agent makes poor decisions."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The DEMONSTRATIONS method is reported as the least effective for alignment in both cautious and risky conditions. The NOT-ALIGNED condition shows DN has no significant difference from DR or FR, described as 'all look very risky.' One collision rate cell is missing (MN), indicating an incomplete design."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract claims the framework can align driver agents with human driving styles, validated through CARLA simulation and human evaluation. Both the simulation metrics (Fig 3) and human evaluation rankings (Fig 4a) support these claims."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper claims 'MULTI-ALIGNMENT was the most effective method.' The ablation design with controlled manipulation of alignment method and driving style across conditions supports causal inference. Each alignment component is systematically varied while others are held constant."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title 'Driving Style Alignment for LLM-powered Driver Agent' and conclusions about 'paving the way for more intuitive and effective human-agent alignment across a broad spectrum of applications beyond autonomous driving' are far broader than what was tested: one CARLA map (Town10), one vehicle (Audi TT), one LLM (GPT-4), two driving styles, 60 NPCs."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No discussion of alternative explanations for the observed differences (e.g., whether results are driven by prompt sensitivity, specific demonstration selection, or CARLA-specific behavior rather than genuine style alignment)."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper measures collision rates, speed, throttle/brake percentages and human riskiness rankings as proxies for 'driving style alignment,' but does not discuss the gap between these simulation metrics and real-world driving style or acknowledge what these proxies fail to capture."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper says 'OpenAI's GPT-4' with a footnote to https://openai.com/gpt-4. No specific version (e.g., gpt-4-0613) or API snapshot date is provided."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "The paper describes the prompting approach (CoT reasoning, 'Think Step by Step,' demonstrations as few-shot prompts) and provides one example reasoning output, but the actual prompt text sent to GPT-4 for the Driver Agent and Coach Agent is not provided."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No GPT-4 API parameters (temperature, top-p, max tokens) are reported. The CARLA time-step (0.0008-0.0015 seconds) is mentioned but LLM hyperparameters are absent."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The agentic scaffolding is described in detail: Driver Agent with iterable fixed-capacity short-term memory, perception-situation-reasoning-action workflow, Coach Agent with Guidelines module that evaluates and generates guidelines, memory unit compilation (Section II)."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section III describes how raw interview data was organized: driving style differentiation via MDSI questionnaire + CAN-Bus data, identification of 3 risky and 2 cautious drivers, selection of representative decision-making processes, and formatting into Situation-Reasoning-Action demonstrations."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No dedicated limitations section. The conclusions (Section V) mention 'opening new avenues for research' but do not discuss specific limitations of the current work."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No threats to validity are discussed. Issues such as the small number of demonstration drivers (3+2), single CARLA map, GPT-4 dependency, sim-to-real gap, and potential survey response biases are not addressed."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show, what settings were NOT tested, or what claims are NOT being made."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The Driving-Thinking-Dataset GitHub repository is provided (footnote 2), which appears to contain the organized driving behavior demonstrations. However, it is unclear whether raw interview recordings/transcripts or raw CAN-Bus data are included."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section III-A describes the data collection in detail: 24 drivers, urban road driving task covering 13 driving conditions (5.7 km), roof-mounted 360° camera, in-car motion camera, eye tracker, real-time CAN-Bus data, and post-driving interviews lasting 1.5-2 hours."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Driving experiment: '24 drivers were invited,' including different genders, ages, professional and novice drivers (Section III-A). Survey: 'over 200 participants through a third-party recruitment channel provided by the survey platform' with ~$2.08 compensation, plus 60 from social media (Section IV-C-2)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The pipeline is documented: driving experiment → video recording + CAN-Bus → post-driving interview → MDSI questionnaire → driving style classification → selection of representative processes → formatting into Situation-Reasoning-Action demonstrations (Section III). Survey pipeline: distribution → 3 days → 270 responses → screening (minimum time, trap questions) → 259 valid (Section IV-C-4)."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding information or acknowledgments section is present in the paper. Authors are from Tsinghua University's Institute for AI Industry Research, but no grants or sponsors are mentioned."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are clearly stated: all authors are with the Institute for AI Industry Research, Tsinghua University, Beijing, China. They use OpenAI's GPT-4 but are not affiliated with OpenAI."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding is disclosed, so independence cannot be assessed."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The paper uses GPT-4 as a decision-making component within a driving simulation framework, not to evaluate GPT-4's knowledge on a benchmark. The 'test data' is generated in real-time by CARLA simulation, so training data contamination is not a relevant concern."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Same reasoning: the evaluation involves real-time driving decisions in CARLA simulation, not model performance on a pre-existing benchmark that could overlap with training data."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark — it uses GPT-4 as a component in a framework evaluated via simulation behavior metrics."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No mention of pre-registration for either the naturalistic driving experiment (n=24) or the human evaluation survey (n=270)."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No mention of IRB or ethics board approval, despite conducting a naturalistic driving experiment with 24 human drivers and a survey with 270 participants."
    264       },
    265       "demographics_reported": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "Survey demographics: 141 male (52.22%), 129 female (47.78%), ages 19-54, all verified to hold a driving license, driving style scores computed from MDSI (Section IV-C-2,4). Driving experiment: mentions 'different genders and age groups' and professional/novice mix but without specific breakdowns."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Survey: participants must possess a driving license (verified in questionnaire), minimum answering time enforced, trap questions included for screening (Section IV-C-1,2). Driving experiment: included professional and novice drivers. Style selection criteria: MDSI + CAN-Bus data thresholds (Section III-B)."
    274       },
    275       "randomization_described": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Within-subjects design with video clips presented in random order within each group (Section IV-C-1): 'Each group of video clips will appear in a random order.' The second questionnaire also uses random ordering of all eight clips."
    279       },
    280       "blinding_described": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "Participants viewed video clips without being told which condition each clip represented, which implies single-blinding. However, blinding is not explicitly described or discussed as a methodological choice."
    284       },
    285       "attrition_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Survey attrition is reported: 270 total participants → 259 valid responses after screening (198 for the first questionnaire only, 59 for the second only, and 2 completing both). Screening criteria (minimum time, trap questions) are described (Section IV-C-4)."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No GPT-4 API costs, tokens consumed, or per-query latency are reported. The paper mentions CARLA's simulation time-step was slowed to 0.0008-0.0015 seconds due to GPT response time but does not quantify the API cost."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The paper states 'approximately 50.3 hours of simulation experiments' with an average of 6.7 minutes per condition and ~1.5 km traveled per condition. Hardware is identified as a ThundeRobot Zero desktop (Section IV-B-1,3)."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No mention of multiple random seeds or seed sensitivity analysis. While CARLA generates random endpoints, there is no reporting of variance across simulation runs with different seeds."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The total simulation time (50.3 hours) and average time per condition (~6.7 min) are stated, but the exact number of independent runs per condition is not explicitly reported."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search is described. The CARLA time-step range (0.0008-0.0015s), number of NPCs (60), and other settings appear chosen without documented justification or search."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Configuration choices (short-term memory capacity, time-step range, NPC count, map choice) are not justified. No explanation for why these specific values were selected."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The human evaluation (Fig 4a) performs many pairwise comparisons across four groups with significance tests. No mention of Bonferroni, Holm, or other multiple comparison corrections."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "All comparisons are between variants of the authors' own framework. No acknowledgment that authors evaluating their own system may introduce bias."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": false,
    336         "answer": false,
    337         "justification": "All conditions use the same GPT-4 model with equivalent compute per query. Compute differences between conditions are negligible since the experimental variable is the alignment method, not compute budget."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether CARLA simulation metrics (collision rate, speed, throttle/brake) actually measure 'driving style alignment' or whether they are valid proxies for real-world driving style correspondence."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The same GPT-4 model and base scaffolding (Driver Agent architecture) are used across all conditions. The experimental variable is the alignment method (demonstrations/feedback), not the scaffold itself, so the confound is controlled by design."
    348       }
    349     }
    350   },
    351   "claims": [
    352     {
    353       "claim": "Multi-alignment is the most effective method for creating driver agents with distinct driving styles, showing the most significant differences in collision rates, throttle, brake, and speed between cautious and risky styles.",
    354       "evidence": "Simulation results (Fig 3, Section IV-B-4): MULTI-ALIGNMENT showed the lowest collision rate for CAUTIOUS and highest for RISKY. Average speed, throttle, and brake showed the largest style differentiation under MULTI-ALIGNMENT. Human evaluation (Fig 4a): riskiness ranking DC > FC > MC (all p<0.0001) in the cautious group.",
    355       "supported": "moderate"
    356     },
    357     {
    358       "claim": "Driver agents can exhibit corresponding driving styles by aligning with different driving style demonstrations.",
    359       "evidence": "Simulation metrics (Fig 3b): RISKY-aligned agents had highest average speed, highest throttle, lowest brake; CAUTIOUS-aligned agents had the opposite pattern across all alignment methods. Collision rates (Fig 3a) also differ by style.",
    360       "supported": "moderate"
    361     },
    362     {
    363       "claim": "Humans tend to associate higher riskiness with lower intelligence and greater human-likeness.",
    364       "evidence": "Pearson correlation analysis (Fig 4b): Riskiness-Intelligence r=-0.59 (p<0.0001), Riskiness-Human-likeness r=0.17 (p<0.001), Intelligence-Human-likeness r=0.10 (p<0.05). One participant: 'It (MR) is really like an experienced driver who is showing off his driving skills.'",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Demonstrations alone are the least effective alignment method for both cautious and risky driving styles.",
    369       "evidence": "Human evaluation (Fig 4a): In the cautious group, riskiness ranking DC > FC > MC (all p<0.0001), meaning DC was perceived as most risky (least cautious-aligned). In the risky group, DEMONSTRATIONS also showed poorest alignment effect. Simulation results support this for cautious but are less clear for risky.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The dataset likely represents the first effort to meticulously dissect human driving behaviors and articulate the driving decision-making process in a natural language format.",
    374       "evidence": "Section I states this claim. The paper describes a novel data collection pipeline (Section III) with 24 drivers, driving experiments, and post-driving interviews structured into Situation-Reasoning-Action format.",
    375       "supported": "weak"
    376     }
    377   ],
    378   "red_flags": [
    379     {
    380       "flag": "No error bars or uncertainty quantification on simulation metrics",
    381       "detail": "Fig 3 reports point estimates for collision rates, speeds, and throttle/brake percentages with no error bars, confidence intervals, or variance measures. Given that simulation outcomes depend on random endpoint generation and NPC behavior, single-point results are unreliable."
    382     },
    383     {
    384       "flag": "Very short simulation time per condition",
    385       "detail": "Average of ~6.7 minutes simulator time and ~1.5 km traveled per condition is extremely short. Collision rates computed from such brief runs may be highly variable and unrepresentative."
    386     },
    387     {
    388       "flag": "No IRB/ethics approval for human subjects research",
    389       "detail": "The paper involves 24 drivers in a naturalistic driving experiment and 270 online survey participants, but no IRB or ethics board approval is mentioned."
    390     },
    391     {
    392       "flag": "No limitations section",
    393       "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries despite numerous potential concerns (sim-to-real gap, single CARLA map, GPT-4 dependency, small demonstration driver pool)."
    394     },
    395     {
    396       "flag": "Missing multiple comparison correction",
    397       "detail": "The human evaluation performs many pairwise significance tests across four groups without any family-wise error rate correction."
    398     },
    399     {
    400       "flag": "Overclaiming beyond tested scope",
    401       "detail": "The conclusions claim insights 'across a broad spectrum of applications beyond autonomous driving' based on results from one CARLA map, one vehicle type, one LLM (GPT-4), and two driving styles."
    402     },
    403     {
    404       "flag": "Incomplete experimental design",
    405       "detail": "The paper describes an 'approximate 3×3 within-subject design' but the MULTI-ALIGNMENT NOT-ALIGNED (MN) cell is missing since multi-alignment requires demonstrations for both agents. This makes some comparisons across alignment methods imbalanced."
    406     },
    407     {
    408       "flag": "Tiny demonstration driver pool",
    409       "detail": "Only 3 risky and 2 cautious drivers (from 24 total) were selected to create the alignment demonstrations. The generalizability of these 5 individuals' driving behaviors to broader driving style categories is questionable."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "LLM-Planner: Few-Shot Grounded Planning for Embodied Agents with Large Language Models",
    415       "authors": ["C. H. Song", "J. Wu", "C. Washington", "B. M. Sadler", "W.-L. Chao", "Y. Su"],
    416       "year": 2023,
    417       "relevance": "Demonstrates LLM few-shot planning capabilities for embodied agents, directly relevant to agentic AI workflows."
    418     },
    419     {
    420       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    421       "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "F. Xia", "E. Chi", "Q. V. Le", "D. Zhou"],
    422       "year": 2022,
    423       "relevance": "Foundational work on chain-of-thought prompting used in the Driver Agent's reasoning strategy."
    424     },
    425     {
    426       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    427       "authors": ["S. Yao", "D. Yu", "J. Zhao", "I. Shafran", "T. L. Griffiths", "Y. Cao", "K. Narasimhan"],
    428       "year": 2023,
    429       "arxiv_id": "2305.10601",
    430       "relevance": "Advanced LLM reasoning framework relevant to agentic decision-making capabilities."
    431     },
    432     {
    433       "title": "Driving with LLMs: Fusing Object-Level Vector Modality for Explainable Autonomous Driving",
    434       "authors": ["L. Chen", "O. Sinavski", "J. Hünermann", "A. Karnsund", "A. J. Willmott", "D. Birch", "D. Maund", "J. Shotton"],
    435       "year": 2023,
    436       "arxiv_id": "2310.01957",
    437       "relevance": "LLM-powered autonomous driving with fine-tuning for explainability, directly on topic for LLM agent capability."
    438     },
    439     {
    440       "title": "DriveGPT4: Interpretable End-to-End Autonomous Driving via Large Language Model",
    441       "authors": ["Z. Xu", "Y. Zhang", "E. Xie", "Z. Zhao", "Y. Guo", "K. K. Wong", "Z. Li", "H. Zhao"],
    442       "year": 2023,
    443       "arxiv_id": "2310.01412",
    444       "relevance": "End-to-end LLM-based autonomous driving system, relevant to LLM agent capabilities in physical domains."
    445     },
    446     {
    447       "title": "DiLu: A Knowledge-Driven Approach to Autonomous Driving with Large Language Models",
    448       "authors": ["L. Wen", "D. Fu", "X. Li", "X. Cai", "T. Ma", "P. Cai", "M. Dou", "B. Shi", "L. He", "Y. Qiao"],
    449       "year": 2023,
    450       "arxiv_id": "2309.16292",
    451       "relevance": "Knowledge-driven LLM driving agent with memory and reflection, relevant to agentic LLM workflows."
    452     },
    453     {
    454       "title": "DriveMLM: Aligning Multi-Modal Large Language Models with Behavioral Planning States for Autonomous Driving",
    455       "authors": ["W. Wang", "J. Xie", "C. Hu", "H. Zou", "J. Fan", "W. Tong", "Y. Wen", "S. Wu", "H. Deng", "Z. Li", "H. Tian", "L. Lu", "X. Zhu", "X. Wang", "Y. Qiao", "J. Dai"],
    456       "year": 2023,
    457       "relevance": "Multi-modal LLM alignment for autonomous driving behavior, directly relevant to LLM alignment methodology."
    458     },
    459     {
    460       "title": "Training language models to follow instructions with human feedback",
    461       "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida", "C. L. Wainwright", "P. Mishkin", "C. Zhang", "S. Agarwal", "K. Slama", "A. Ray"],
    462       "year": 2022,
    463       "arxiv_id": "2203.02155",
    464       "relevance": "Foundational RLHF paper for aligning language models with human preferences, core to the alignment methodology topic."
    465     },
    466     {
    467       "title": "ExpeL: LLM Agents Are Experiential Learners",
    468       "authors": ["A. Zhao", "D. Huang", "Q. Xu", "M. Lin", "Y.-J. Liu", "G. Huang"],
    469       "year": 2023,
    470       "arxiv_id": "2308.10144",
    471       "relevance": "LLM agents learning from experience, relevant to agentic AI learning and self-improvement workflows."
    472     },
    473     {
    474       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    475       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    476       "year": 2024,
    477       "relevance": "Verbal reinforcement learning for LLM agents with reflection and self-improvement, directly relevant to agentic AI."
    478     },
    479     {
    480       "title": "SurrealDriver: Designing Generative Driver Agent Simulation Framework in Urban Contexts Based on Large Language Model",
    481       "authors": ["Y. Jin", "X. Shen", "H. Peng", "X. Liu", "J. Qin", "J. Li", "J. Xie", "P. Gao", "G. Zhou", "J. Gong"],
    482       "year": 2023,
    483       "arxiv_id": "2309.13193",
    484       "relevance": "LLM-based generative driver agent simulation in urban settings, closely related framework for LLM agent capability evaluation."
    485     },
    486     {
    487       "title": "Language Models are Few-Shot Learners",
    488       "authors": ["T. Brown", "B. Mann", "N. Ryder", "M. Subbiah", "J. D. Kaplan", "P. Dhariwal"],
    489       "year": 2020,
    490       "relevance": "GPT-3 few-shot learning paper, foundational to the few-shot demonstration approach used in this work."
    491     },
    492     {
    493       "title": "Drive Like a Human: Rethinking Autonomous Driving with Large Language Models",
    494       "authors": ["D. Fu", "X. Li", "L. Wen", "M. Dou", "P. Cai", "B. Shi", "Y. Qiao"],
    495       "year": 2024,
    496       "relevance": "Human-like LLM-based autonomous driving with expert feedback integration, directly relevant to LLM alignment for driving."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 1,
    502       "justification": "Framework requires human data collection, CARLA setup, and GPT-4 API — not immediately usable by practitioners, though the concept of style-aligned driving agents has practical appeal."
    503     },
    504     "surprise_contrarian": {
    505       "score": 1,
    506       "justification": "The finding that humans perceive riskier driving as more human-like is a mildly surprising psychological insight, but the alignment framework itself follows expected approaches."
    507     },
    508     "fear_safety": {
    509       "score": 1,
    510       "justification": "Touches on autonomous driving safety implicitly, but does not raise novel AI risk or security concerns."
    511     },
    512     "drama_conflict": {
    513       "score": 0,
    514       "justification": "No controversy, no conflict with established results or institutions."
    515     },
    516     "demo_ability": {
    517       "score": 1,
    518       "justification": "Code and dataset repos are provided on GitHub, but running the demo requires CARLA installation, GPT-4 API access, and substantial setup."
    519     },
    520     "brand_recognition": {
    521       "score": 1,
    522       "justification": "Uses GPT-4 (well-known product) and is from Tsinghua University (prestigious but not an AI lab with social media following)."
    523     }
    524   }
    525 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs