scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36962B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Driving Style Alignment for LLM-powered Driver Agent",
      6     "authors": [
      7       "Ruoxuan Yang",
      8       "Xinyu Zhang",
      9       "Anais Fernandez-Laaksonen",
     10       "Xin Ding",
     11       "Jiangtao Gong"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE/RSJ International Conference on Intelligent Robots and Systems",
     15     "arxiv_id": "2403.11368",
     16     "doi": "10.1109/IROS58592.2024.10802629"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims the framework can align driver agents with human driving styles, validated through CARLA simulation and human evaluation. Both the simulation metrics (Fig 3) and human evaluation rankings (Fig 4a) support these claims.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper claims 'MULTI-ALIGNMENT was the most effective method.' The ablation design with controlled manipulation of alignment method and driving style across conditions supports causal inference. Each alignment component is systematically varied while others are held constant.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Driving Style Alignment for LLM-powered Driver Agent' and conclusions about 'paving the way for more intuitive and effective human-agent alignment across a broad spectrum of applications beyond autonomous driving' are far broader than what was tested: one CARLA map (Town10), one vehicle (Audi TT), one LLM (GPT-4), two driving styles, 60 NPCs.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations for the observed differences (e.g., whether results are driven by prompt sensitivity, specific demonstration selection, or CARLA-specific behavior rather than genuine style alignment).",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures collision rates, speed, throttle/brake percentages and human riskiness rankings as proxies for 'driving style alignment,' but does not discuss the gap between these simulation metrics and real-world driving style or acknowledge what these proxies fail to capture.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations section. The conclusions (Section V) mention 'opening new avenues for research' but do not discuss specific limitations of the current work.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. Issues such as the small number of demonstration drivers (3+2), single CARLA map, GPT-4 dependency, sim-to-real gap, and potential survey response biases are not addressed.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show, what settings were NOT tested, or what claims are NOT being made.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding information or acknowledgments section is present in the paper. Authors are from Tsinghua University's Institute for AI Industry Research, but no grants or sponsors are mentioned.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: all authors are with the Institute for AI Industry Research, Tsinghua University, Beijing, China. They use OpenAI's GPT-4 but are not affiliated with OpenAI.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms operationally defined: 'driving style' measured via MDSI questionnaire and CAN-Bus metrics (speed, throttle, brake); 'alignment' as framework matching agent behavior to human demonstrations; 'LLM-powered agent' explained in Section II.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three contributions explicitly stated: (1) multi-alignment framework for driving style alignment, (2) natural language dataset of human driving behaviors, (3) validation through simulation and human evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Introduction engages with prior work on LLM agents, autonomous driving, and alignment methods (RLHF, expert feedback). Positions this paper as addressing a gap: no prior work on LLM agent alignment to human driving styles.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Two GitHub repositories are provided: https://github.com/AIR-DISCOVER/Multi-alignment-Drivng-Agent (framework) and https://github.com/AIR-DISCOVER/Driving-Thinking-Dataset (dataset), listed in footnotes on page 1.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The Driving-Thinking-Dataset is provided via a GitHub link (footnote 2). This contains the natural language driving behavior demonstrations.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions CARLA 0.9.14, Python 3.7, Unreal Engine 4, and a ThundeRobot Zero desktop, but no requirements.txt, Dockerfile, or library dependency list is provided in the paper itself. Not enough detail to recreate the full environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub links are provided but the paper itself contains no README-level reproduction guide.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Simulation results (Fig 3) report point estimates for collision rates, speeds, throttle/brake percentages with no confidence intervals or error bars. Human evaluation reports significance stars but no CIs on the metrics.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "The human evaluation (Fig 4a) reports statistical significance between ranking conditions using p-value thresholds (**** p<0.0001, ** p<0.01, ns p≥0.05). Pearson correlation significance is also reported (Fig 4b).",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Raw metric values are reported per condition (collision rates, speeds in km/h, throttle/brake percentages), providing magnitude context. Pearson correlation coefficients (r=-0.59, r=0.17, r=0.10) are reported for human evaluation scores, giving effect size information.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No power analysis or justification for the sample sizes: 24 drivers in the naturalistic experiment, ~6.7 minutes of simulation per condition, or 270 survey participants. No discussion of whether these are adequate.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviations, interquartile ranges, or variance measures reported for simulation metrics (collision rates, speed, throttle, brake). Only point estimates are shown in Fig 3.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The NOT-ALIGNED condition serves as a baseline, and the ablation design compares DEMONSTRATIONS-only, FEEDBACK-only, and full MULTI-ALIGNMENT methods against each other (Section IV-A, Fig 2).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "No external baselines from prior work on driving style alignment are compared. All comparisons are between variants of the proposed framework. The paper references other LLM driving agent work but does not benchmark against them.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The experiment is structured as an ablation: DEMONSTRATIONS (only demonstrations to Driver Agent), FEEDBACK (only Coach Agent feedback), and MULTI-ALIGNMENT (both). This isolates the contribution of each component (Section IV-A).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: collision rate per meter, average speed, throttle percentage, brake percentage (simulation), plus riskiness ranking, intelligence score, human-likeness score (human evaluation).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "In the human evaluation, 259 valid participants assessed Driver Agent performance from video clips, ranking riskiness and rating intelligence and human-likeness (Section IV-C).",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "There is no explicit held-out test set. Demonstrations come from real-world driving, and evaluation is in CARLA simulation with randomly generated endpoints. While the domains differ, there is no explicit discussion of train/test separation or potential overlap between demonstration scenarios and evaluation scenarios.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by both driving style (CAUTIOUS, RISKY, NOT-ALIGNED) and alignment method (DEMONSTRATIONS, FEEDBACK, MULTI-ALIGNMENT) in Fig 3 and Fig 4.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No dedicated failure analysis. The paper mentions collision rates but does not analyze specific failure scenarios, collision causes, or when the agent makes poor decisions.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The DEMONSTRATIONS method is reported as the least effective for alignment in both cautious and risky conditions. The NOT-ALIGNED condition shows DN has no significant difference from DR or FR, described as 'all look very risky.' One collision rate cell is missing (MN), indicating an incomplete design.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper says 'OpenAI's GPT-4' with a footnote to https://openai.com/gpt-4. No specific version (e.g., gpt-4-0613) or API snapshot date is provided.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper describes the prompting approach (CoT reasoning, 'Think Step by Step,' demonstrations as few-shot prompts) and provides one example reasoning output, but the actual prompt text sent to GPT-4 for the Driver Agent and Coach Agent is not provided.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No GPT-4 API parameters (temperature, top-p, max tokens) are reported. The CARLA time-step (0.0008-0.0015 seconds) is mentioned but LLM hyperparameters are absent.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The agentic scaffolding is described in detail: Driver Agent with iterable fixed-capacity short-term memory, perception-situation-reasoning-action workflow, Coach Agent with Guidelines module that evaluates and generates guidelines, memory unit compilation (Section II).",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section III describes how raw interview data was organized: driving style differentiation via MDSI questionnaire + CAN-Bus data, identification of 3 risky and 2 cautious drivers, selection of representative decision-making processes, and formatting into Situation-Reasoning-Action demonstrations.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The Driving-Thinking-Dataset GitHub repository is provided (footnote 2), which appears to contain the organized driving behavior demonstrations. However, it is unclear whether raw interview recordings/transcripts or raw CAN-Bus data are included.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section III-A describes the data collection in detail: 24 drivers, urban road driving task covering 13 driving conditions (5.7 km), roof-mounted 360° camera, in-car motion camera, eye tracker, real-time CAN-Bus data, and post-driving interviews lasting 1.5-2 hours.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Driving experiment: '24 drivers were invited,' including different genders, ages, professional and novice drivers (Section III-A). Survey: 'over 200 participants through a third-party recruitment channel provided by the survey platform' with ~$2.08 compensation, plus 60 from social media (Section IV-C-2).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented: driving experiment → video recording + CAN-Bus → post-driving interview → MDSI questionnaire → driving style classification → selection of representative processes → formatting into Situation-Reasoning-Action demonstrations (Section III). Survey pipeline: distribution → 3 days → 270 responses → screening (minimum time, trap questions) → 259 valid (Section IV-C-4).",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper uses GPT-4 as a decision-making component within a driving simulation framework, not to evaluate GPT-4's knowledge on a benchmark. The 'test data' is generated in real-time by CARLA simulation, so training data contamination is not a relevant concern.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Same reasoning: the evaluation involves real-time driving decisions in CARLA simulation, not model performance on a pre-existing benchmark that could overlap with training data.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark — it uses GPT-4 as a component in a framework evaluated via simulation behavior metrics.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No mention of pre-registration for either the naturalistic driving experiment (n=24) or the human evaluation survey (n=270).",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No mention of IRB or ethics board approval, despite conducting a naturalistic driving experiment with 24 human drivers and a survey with 270 participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Survey demographics: 141 male (52.22%), 129 female (47.78%), ages 19-54, all verified to hold a driving license, driving style scores computed from MDSI (Section IV-C-2,4). Driving experiment: mentions 'different genders and age groups' and professional/novice mix but without specific breakdowns.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Survey: participants must possess a driving license (verified in questionnaire), minimum answering time enforced, trap questions included for screening (Section IV-C-1,2). Driving experiment: included professional and novice drivers. Style selection criteria: MDSI + CAN-Bus data thresholds (Section III-B).",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "Within-subjects design with video clips presented in random order within each group (Section IV-C-1): 'Each group of video clips will appear in a random order.' The second questionnaire also uses random ordering of all eight clips.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "Participants viewed video clips without being told which condition each clip represented, which implies single-blinding. However, blinding is not explicitly described or discussed as a methodological choice.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": true,
    352           "justification": "Survey attrition is reported: 270 total participants → 259 valid responses after screening (198 for first questionnaire, 59 for second, with 2 completing both). Screening criteria (minimum time, trap questions) are described (Section IV-C-4).",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No GPT-4 API costs, tokens consumed, or per-query latency are reported. The paper mentions CARLA's simulation time-step was slowed to 0.0008-0.0015 seconds due to GPT response time but does not quantify the API cost.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "The paper states 'approximately 50.3 hours of simulation experiments' with an average of 6.7 minutes per condition and ~1.5 km traveled per condition. Hardware is identified as a ThundeRobot Zero desktop (Section IV-B-1,3).",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of multiple random seeds or seed sensitivity analysis. While CARLA generates random endpoints, there is no reporting of variance across simulation runs with different seeds.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The total simulation time (50.3 hours) and average time per condition (~6.7 min) are stated, but the exact number of independent runs per condition is not explicitly reported.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. The CARLA time-step range (0.0008-0.0015s), number of NPCs (60), and other settings appear chosen without documented justification or search.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Configuration choices (short-term memory capacity, time-step range, NPC count, map choice) are not justified. No explanation for why these specific values were selected.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The human evaluation (Fig 4a) performs many pairwise comparisons across four groups with significance tests. No mention of Bonferroni, Holm, or other multiple comparison corrections.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "All comparisons are between variants of the authors' own framework. No acknowledgment that authors evaluating their own system may introduce bias.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": false,
    409           "answer": false,
    410           "justification": "All conditions use the same GPT-4 model with equivalent compute per query. Compute differences between conditions are negligible since the experimental variable is the alignment method, not compute budget.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether CARLA simulation metrics (collision rate, speed, throttle/brake) actually measure 'driving style alignment' or whether they are valid proxies for real-world driving style correspondence.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "The same GPT-4 model and base scaffolding (Driver Agent architecture) are used across all conditions. The experimental variable is the alignment method (demonstrations/feedback), not the scaffold itself, so the confound is controlled by design.",
    423           "source": "opus"
    424         }
    425       }
    426     }
    427   },
    428   "claims": [
    429     {
    430       "claim": "Multi-alignment framework (combining human demonstrations with Coach Agent feedback) is significantly more effective than demonstrations or feedback alone at aligning driver agents with human driving styles",
    431       "evidence": "3×3 experimental design with ablation of alignment methods. Multi-alignment showed lowest collision rate for cautious (7.29e-04 vs 1.31e-03 for D, 1.53e-03 for F). Human evaluation (270 respondents, 259 valid) ranked multi-aligned cautious agents significantly lower in riskiness (DC>FC>MC, ****p<0.0001). Multi-alignment showed largest differentiation between driving styles.",
    432       "supported": "strong"
    433     },
    434     {
    435       "claim": "Natural language descriptions of human driving behaviors extracted via post-experiment interviews can effectively serve as in-context learning demonstrations for LLMs to adopt specific driving styles",
    436       "evidence": "Agents with driving style demonstrations exhibited measurable behavioral differences: risky agents 19.65 km/h speed, 43.8% throttle vs cautious 16.39 km/h, 41.1%. Human evaluators perceived style differences (Fig 4a: p<0.0001 for DC vs DR in demonstrations group). Behavioral metrics aligned with style intent.",
    437       "supported": "moderate"
    438     },
    439     {
    440       "claim": "Two distinct driving styles (risky and cautious) can be reliably distinguished by human evaluators from short video clips of agent driving behavior",
    441       "evidence": "270 human participants rated video clips showing agents with different alignments. Riskiness rankings differed significantly across conditions (****p<0.0001 for demonstrations and feedback groups). Participant comments cited observable differences ('runs stably without veering', 'waits for pedestrian').",
    442       "supported": "strong"
    443     },
    444     {
    445       "claim": "Risky driving style alignment produces measurably higher collision rates and faster control inputs compared to cautious style alignment",
    446       "evidence": "Simulation results show risky agents: 3.52e-03-4.78e-03 collisions/meter vs cautious 7.29e-04-1.53e-03 (roughly 2.3-6.6× higher across the reported ranges). Speed 19.65 km/h vs 16.39 km/h; throttle 43.8% vs 41.1%; brake 8.6% vs 10.8%. Consistent across alignment methods.",
    447       "supported": "strong"
    448     },
    449     {
    450       "claim": "Subjective MDSI questionnaire responses can be validated against objective CAN-Bus driving metrics to identify consistent driving styles in naturalistic driving data",
    451       "evidence": "5 of 24 drivers showed alignment: 3 high-speed drivers (7.41-7.73 m/s, 24-30% throttle) matched 'risky' self-report; 2 low-speed drivers (5.15-5.28 m/s, 21% throttle) matched 'cautious.' Others showed no clear trend in objective metrics.",
    452       "supported": "moderate"
    453     },
    454     {
    455       "claim": "Humans perceive agents with higher driving riskiness as less intelligent but paradoxically more human-like than cautious agents, despite cautious driving being objectively safer",
    456       "evidence": "Correlation analysis: riskiness vs intelligence r=-0.59 (****p<0.0001); riskiness vs human-likeness r=0.17 (*p<0.05). Participant quote: 'It (MR) is really like an experienced driver who is showing off his driving skills.' Suggests psychological complexity in human-agent alignment perception.",
    457       "supported": "moderate"
    458     }
    459   ],
    460   "methodology_tags": [
    461     "benchmark-eval",
    462     "observational"
    463   ],
    464   "key_findings": "Multi-alignment combining human driving demonstrations with LLM-generated feedback significantly outperforms demonstrations or feedback alone in aligning driver agent behavior to human driving styles. Agents aligned to cautious driving achieved roughly 2-7× lower collision rates (0.73-1.53 per 1000 meters) compared to risky-aligned agents (3.52-4.78 per 1000 meters), with corresponding reductions in speed and acceleration. Human evaluators (270 respondents, 259 valid) reliably distinguished driving styles in agent behavior videos, with multi-aligned agents showing the strongest differentiation between styles. Notably, humans paradoxically associated risky driving with lower intelligence yet greater perceived human-likeness, suggesting alignment involves psychological factors beyond behavioral realism.",
    465   "red_flags": [
    466     {
    467       "flag": "No limitations section",
    468       "detail": "Paper lacks a dedicated discussion of generalizability constraints. It does not acknowledge the limits of simulation-only validation, the restriction to two driving styles, the small driver sample (n=24), or the dependence on GPT-4."
    469     },
    470     {
    471       "flag": "Small and geographically homogeneous driver sample",
    472       "detail": "Only 24 drivers for naturalistic data collection, all from an urban environment (likely a single city in China). Generalization to other regions, driving cultures, or road types is unknown."
    473     },
    474     {
    475       "flag": "GPT-4 version not specified",
    476       "detail": "Paper states 'OpenAI's GPT-4' without checkpoint date or version identifier. Insufficient for reproducibility across GPT-4 releases."
    477     },
    478     {
    479       "flag": "Missing ethics approval disclosure",
    480       "detail": "No IRB approval, ethics committee review, or informed consent is mentioned for the 270-participant human evaluation and the 24-driver naturalistic study. Raises ethical and methodological concerns."
    481     },
    482     {
    483       "flag": "No uncertainty quantification in simulation results",
    484       "detail": "CARLA metrics reported as point estimates (e.g., '1.46e-03 collisions per meter') without error bars, confidence intervals, or variance across simulation runs. Cannot assess robustness."
    485     },
    486     {
    487       "flag": "LLM hyperparameters not disclosed",
    488       "detail": "GPT-4 sampling parameters (temperature, top-p, presence_penalty) not reported. Affects reproducibility and understanding of model behavior variability."
    489     },
    490     {
    491       "flag": "Actual prompts not provided",
    492       "detail": "Only prompt structure described (chain-of-thought, memory format). Full prompts not shown. Readers cannot replicate exact prompt wording used with GPT-4."
    493     },
    494     {
    495       "flag": "Human evaluation lacks blinding",
    496       "detail": "Participants evaluated videos without blinding to conditions. Obvious behavioral differences (collision frequency, speed) could bias ratings toward reflecting surface differences rather than true style alignment quality."
    497     },
    498     {
    499       "flag": "Demonstrations-only underperformance unexplained",
    500       "detail": "Demonstrations method consistently underperforms both Feedback and Multi-alignment across both driving styles. Paper provides no analysis of why this counterintuitive result occurred."
    501     },
    502     {
    503       "flag": "Raw dataset completeness unclear",
    504       "detail": "GitHub dataset link provided but paper does not specify whether raw materials (panoramic/motion video, eye tracking logs, CAN-Bus data, interview recordings/transcripts) or only processed Situation-Reasoning-Action examples are released."
    505     }
    506   ],
    507   "cited_papers": [
    508     {
    509       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    510       "relevance": "Foundational CoT technique directly employed in Driver Agent step-by-step reasoning architecture."
    511     },
    512     {
    513       "title": "Self-consistency improves chain of thought reasoning in language models",
    514       "relevance": "CoT extension relevant to improving LLM reasoning quality for agent decision-making."
    515     },
    516     {
    517       "title": "Training language models to follow instructions with human feedback",
    518       "relevance": "RLHF alignment method cited as prior approach to LLM alignment; motivates feedback-based coaching approach."
    519     },
    520     {
    521       "title": "Reflexion: Language agents with verbal reinforcement learning",
    522       "relevance": "Self-reflection mechanism for agents using language-based feedback; directly related to Coach Agent feedback generation."
    523     },
    524     {
    525       "title": "Driving with llms: Fusing object-level vector modality for explainable autonomous driving",
    526       "relevance": "Prior work on LLM-based driving agents; establishes baseline for autonomous driving with language models."
    527     },
    528     {
    529       "title": "DriveGPT4: Interpretable end-to-end autonomous driving via large language model",
    530       "relevance": "Recent GPT-4-based autonomous driving agent; comparable LLM-driven architecture and motivation."
    531     },
    532     {
    533       "title": "Drive like a human: Rethinking autonomous driving with large language models",
    534       "relevance": "Directly addresses human-like driving with LLMs; motivates alignment of agent behavior with human driving characteristics."
    535     },
    536     {
    537       "title": "The multidimensional driving style inventory—scale construct and validation",
    538       "relevance": "MDSI questionnaire used for driving style classification (risky, cautious, patient, careful); foundational measurement instrument for driving style taxonomy."
    539     }
    540   ],
    541   "engagement_factors": {
    542     "practical_relevance": {
    543       "score": 1,
    544       "justification": "Framework requires human data collection, CARLA setup, and GPT-4 API — not immediately usable by practitioners, though the concept of style-aligned driving agents has practical appeal."
    545     },
    546     "surprise_contrarian": {
    547       "score": 1,
    548       "justification": "The finding that humans perceive riskier driving as more human-like is a mildly surprising psychological insight, but the alignment framework itself follows expected approaches."
    549     },
    550     "fear_safety": {
    551       "score": 1,
    552       "justification": "Touches on autonomous driving safety implicitly, but does not raise novel AI risk or security concerns."
    553     },
    554     "drama_conflict": {
    555       "score": 0,
    556       "justification": "No controversy, no conflict with established results or institutions."
    557     },
    558     "demo_ability": {
    559       "score": 1,
    560       "justification": "Code and dataset repos are provided on GitHub, but running the demo requires CARLA installation, GPT-4 API access, and substantial setup."
    561     },
    562     "brand_recognition": {
    563       "score": 1,
    564       "justification": "Uses GPT-4 (well-known product) and is from Tsinghua University (prestigious but not an AI lab with social media following)."
    565     }
    566   },
    567   "hn_data": {
    568     "threads": [
    569       {
    570         "hn_id": "45923139",
    571         "title": "Chinese co's roadmap for aneutronic fusion",
    572         "points": 11,
    573         "comments": 3,
    574         "url": "https://news.ycombinator.com/item?id=45923139"
    575       },
    576       {
    577         "hn_id": "35314773",
    578         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    579         "points": 4,
    580         "comments": 1,
    581         "url": "https://news.ycombinator.com/item?id=35314773"
    582       },
    583       {
    584         "hn_id": "41365788",
    585         "title": "Quantum error correction below the surface code threshold",
    586         "points": 3,
    587         "comments": 2,
    588         "url": "https://news.ycombinator.com/item?id=41365788"
    589       },
    590       {
    591         "hn_id": "42375612",
    592         "title": "Quantum error correction below the surface code threshold",
    593         "points": 3,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=42375612"
    596       },
    597       {
    598         "hn_id": "35298128",
    599         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    600         "points": 3,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=35298128"
    603       },
    604       {
    605         "hn_id": "43563070",
    606         "title": "Cordic Is All You Need",
    607         "points": 2,
    608         "comments": 0,
    609         "url": "https://news.ycombinator.com/item?id=43563070"
    610       },
    611       {
    612         "hn_id": "41371342",
    613         "title": "Google proves Fault-Tolerant Quantum Computing is possible",
    614         "points": 2,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=41371342"
    617       },
    618       {
    619         "hn_id": "35397720",
    620         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    621         "points": 2,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=35397720"
    624       },
    625       {
    626         "hn_id": "22791011",
    627         "title": "A physicist view of the airborne infection",
    628         "points": 2,
    629         "comments": 0,
    630         "url": "https://news.ycombinator.com/item?id=22791011"
    631       },
    632       {
    633         "hn_id": "47221336",
    634         "title": "Show HN: Benchmarking the Keep memory system with LoCoMo",
    635         "points": 1,
    636         "comments": 0,
    637         "url": "https://news.ycombinator.com/item?id=47221336"
    638       }
    639     ],
    640     "top_points": 11,
    641     "total_points": 33,
    642     "total_comments": 6
    643   }
    644 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs