scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (32149B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Driving Style Alignment for LLM-powered Driver Agent",
      6     "authors": [
      7       "Ruoxuan Yang",
      8       "Xinyu Zhang",
      9       "Anais Fernandez-Laaksonen",
     10       "Xin Ding",
     11       "Jiangtao Gong"
     12     ],
     13     "year": 2024,
     14     "venue": "IEEE/RSJ International Conference on Intelligent Robots and Systems",
     15     "arxiv_id": "2403.11368",
     16     "doi": "10.1109/IROS58592.2024.10802629"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims that agents align with driving styles, dataset created, and validation performed are all supported. Simulation results (Fig 3) show style-specific behavior differentiation; human eval (n=259) confirms perceptibility.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claim that multi-alignment causes style alignment is tested via 3×3 ablation design (demonstrations-only vs feedback-only vs both). Ablation shows multi-alignment is most effective. Limitation: simulation-only, no real-world causality tested.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Paper tests only 2 driving styles in 1 simulator environment (CARLA Town10), but title and abstract promise general 'driving style alignment.' Conclusion claims 'paves the way...across a broad spectrum of applications' beyond scope tested.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Paper shows multi-alignment works empirically but provides limited mechanistic explanation. The finding that humans associate higher riskiness with human-likeness is acknowledged as 'interesting psychological insight' but not deeply explored as alternative interpretation.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Clearly distinguishes measured outcomes (collision rate, throttle %, speed) from conceptual claims (driving style). Human evaluation outcomes (riskiness ratings, intelligence, human-likeness) appropriately mapped to perceived style perception.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section. Conclusion contains brief discussion of implications and psychological insights but not formal limitations discussion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Paper does not explicitly discuss major threats: simulation-only validation, only 2 of 4 identified driving styles, small data collection (24 drivers), short video clips (30s) in human eval, single simulator environment.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit boundaries stated on scope. Paper does not acknowledge limitations to CARLA simulator, 2 styles only, or single urban environment. Claims generalize beyond tested conditions without caveats.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding statement or acknowledgments section listing funding sources. Work from Tsinghua but no disclosure of whether it was funded internally or externally.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors listed with affiliation: Institute for AI Industry Research, Tsinghua University. No undisclosed affiliations with autonomous driving companies.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Funding source not disclosed, so cannot assess independence. If Tsinghua funded work promoting their own framework, potential conflict exists but cannot verify.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement. No disclosure of patents, equity stakes, or consulting relationships related to autonomous driving or LLM companies.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms defined: 'driving style' via MDSI questionnaire + objective CAN-Bus metrics (speed, throttle); 'alignment' via demonstrations + coach feedback; 'multi-alignment framework' clearly explained with Driver/Coach agents.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions stated: (1) multi-alignment framework, (2) natural language dataset, (3) validation via simulation + human eval. Reader understands what paper adds to the field.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Introduction engages with prior work on LLM reasoning for autonomous driving, limitations of existing alignment methods (fine-tuning, expert feedback), and existing dataset modalities. Shows how this work addresses a gap in style-alignment and natural language data.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Paper states 'The implementation of the framework...can be found at the link' with GitHub URL (github.com/AIR-DISCOVER/Multi-alignment-Drivng-Agent). Code is publicly released.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Driving-Thinking-Dataset released on GitHub (github.com/AIR-DISCOVER/Driving-Thinking-Dataset) with 24 drivers' think-aloud transcripts in natural language format.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Partial specification: Python 3.7, CARLA 0.9.14, Unreal Engine 4 provided. But missing key dependencies (numpy, pandas, requests for API calls, etc.). Specification insufficient for full reproduction.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Paper provides no step-by-step reproduction instructions. References GitHub but doesn't show what instructions are there. Reader cannot reproduce from paper alone.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Fig 3b shows only mean values for speed, throttle, brake with no error bars. Simulation metrics reported without confidence intervals. No spread/variance visualization.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Fig 4a shows p-values with stars (p<0.0001 ****, 0.0001-0.001 ***, etc.). Comparative claims in results section backed by statistical tests, though test type not specified.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Only p-values reported in Fig 4a. No Cohen's d, eta-squared, or other effect sizes for collision rates, speed, or human evaluation metrics. Effect magnitude unclear.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for 24 drivers in data collection, 259 human participants, or 50.3 hours of simulation. No power analysis provided. Sample sizes appear chosen for convenience.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Simulation results (Fig 3b) report only means with no error bars or standard deviations. Human eval reports point estimates without spreads. Variance across runs not shown.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "NOT-ALIGNED condition serves as baseline for comparison. Shows what happens without demonstrations or coach feedback.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Only internal baseline (no alignment) tested. No comparison to other alignment methods from literature (fine-tuning, RLHF, in-context learning). Weak baselines limit evidence for novelty.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "3×3 design tests demonstrations-only vs feedback-only vs multi-alignment. Shows multi-alignment most effective and both components contribute, suggesting necessity of both.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Simulation metrics: collision rate, average speed, throttle %, brake %. Human eval metrics: riskiness ranking, intelligence score, human-likeness score. Seven dimensions of evaluation.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "259 participants evaluated 30-second video clips of agent driving behavior. Ranked riskiness and scored intelligence/human-likeness. Evaluates system outputs (driving videos), not just dataset.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Simulation generalization tested on unseen scenarios with randomly generated endpoints (not pre-set). Single environment (CARLA Town10) but driving paths varied.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results broken down by driving style (CAUTIOUS vs RISKY vs NOT-ALIGNED), alignment method (D vs F vs M), and human evaluation by participant driving style. Category-level analysis provided.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Limited discussion of failure modes. Paper notes that demonstrations alone were 'least effective' but does not show specific scenarios where method fails or provide failure case analysis.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "DEMONSTRATIONS-only showed 'least effectiveness' compared to other methods, which is a partial negative result. But no completely failed conditions or null findings reported.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Only 'OpenAI's GPT-4 APIs' mentioned without specifying which GPT-4 version (gpt-4, gpt-4-turbo, gpt-4-32k), model date, or snapshot. CARLA 0.9.14 is specific but LLM is not.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Example prompts shown: 'Think Step by Step' and example reasoning ('Given the rather faster speed...'). Full system prompts for Driver Agent and Coach Agent not provided in paper.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, max_tokens, or other GPT-4 hyperparameters reported. CARLA time-step specified (0.0008-0.0015s) but LLM inference hyperparameters missing.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Agentic scaffolding well described: Driver Agent workflow (perception→situation→reasoning→action), Coach Agent evaluation logic, Guidelines module, short-term memory management. Components and interactions clear.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Detailed pipeline: naturalistic driving (24 drivers, 5.7 km, 13 conditions), post-experiment interviews (1.5-2 hrs, video reconstruction), transcription, organization into Situation/Reasoning/Action format, style classification (MDSI + CAN-Bus metrics), representative selection.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Driving-Thinking-Dataset GitHub repository released. Raw interview transcripts and decision processes should be available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "24 drivers, 5.7 km urban drive, 13 driving conditions. Detailed recording setup (360° camera, in-car camera, eye tracker, CAN-Bus). Post-experiment interviews (1.5-2 hrs) with video reconstruction. Well documented.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Driver data collection: only described as '24 drivers invited' with 'different genders, age groups, professional and novice drivers.' No recruitment method stated. Human eval: third-party channel ($2.08 compensation) and social media. Partial documentation.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented: collection (driving experiment + interview) → transcription → organization (Situation/Reasoning/Action format) → style classification (MDSI questionnaire + CAN-Bus metrics) → demonstration selection → use in framework.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "GPT-4 APIs used but model cutoff date not stated. No discussion of when GPT-4 was trained or knowledge cutoff. Reproducibility unclear without this information.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Paper uses GPT-4 (general internet-trained LLM) to drive simulated cars in CARLA. Scenario descriptions in prompts could overlap with internet content about driving, but no train-test overlap discussion provided.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not evaluating on standard benchmarks; uses custom CARLA scenarios. Not applicable in traditional sense, but paper does not address potential contamination of driving knowledge in GPT-4 pretraining.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration or trial registration number mentioned. Study design not pre-registered, raising concerns about p-hacking or post-hoc analysis.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB approval, ethics approval, or institutional review mentioned despite involving 24 drivers + 259 human participants. Major ethical concern for human subjects research.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Data collection: '24 drivers with different genders, age groups, professional and novice.' Human eval: demographics reported for the 270 recruited participants (141 male 52.22%, 129 female 47.78%, ages 19-54), of whom 259 gave valid responses after screening. Partially detailed.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "Drivers: only 'different demographics' and experience levels mentioned, no explicit inclusion/exclusion. Human eval: only criterion 'possess a driving license.' Minimal criteria documentation.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "Video clips presented in 'random order' to human participants. Within-subject design ensures all participants see all conditions. Randomization partially described.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "No blinding mentioned. Participants likely knew they were evaluating AI agent driving. No mention of researcher blinding to conditions. Open label design.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "Total: '270+ recruited, received 259 valid responses after screening.' Attrition mentioned but not detailed. Unclear what screening removed or why (trap questions, timing minimums mentioned but exclusion counts not given).",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "GPT-4 API calls made for each driving decision. 50.3 hours simulation corresponds to thousands of API calls, but no cost or latency quantified. Budget impact unknown.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware: 'ThundeRobot Zero desktop.' Time: '50.3 hours simulation, ~6.7 minutes per condition.' No computational cost, API expense, or power consumption reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLM-powered driver agents can be aligned with human driving styles (risky vs cautious) using demonstrations and feedback",
    375       "evidence": "Simulation results (Fig 3) show agents aligned with CAUTIOUS style have 1.31-2.12 collisions/meter vs RISKY 3.04-4.78; human evaluation (Fig 4a) shows significant differences in perceived riskiness (p<0.0001)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Multi-alignment (combining demonstrations + feedback) is more effective than either component alone",
    380       "evidence": "Ablation study (Fig 3) shows MULTI-ALIGNMENT method achieves best collision rate separation and most significant differences in speed/throttle/brake. Fig 4a confirms MC (multi-cautious) > FC (feedback-cautious) > DC (demo-cautious) in human perception (p<0.0001)",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Natural language descriptions of human driving decisions can serve as effective demonstrations for LLM agent alignment",
    385       "evidence": "Dataset of 24 drivers' think-aloud transcripts organized into Situation/Reasoning/Action format enables agents to differentiate driving styles. Framework successfully uses this dataset, validating its utility",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Agents aligned with cautious driving styles exhibit measurably safer behavior (lower collision rates) than risky-aligned agents",
    390       "evidence": "Fig 3a: CAUTIOUS alignment produces 0.73-1.53 collisions/meter across methods vs RISKY 1.53-4.78. Consistent safety difference across all three alignment methods",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Humans can reliably and significantly distinguish different driving styles in simulated agent behavior",
    395       "evidence": "Human evaluation with 259 participants shows highly significant differences in riskiness rankings between CAUTIOUS vs RISKY conditions (p<0.0001 in all relevant groups, Fig 4a)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Higher perceived riskiness in driving correlates with greater perceived human-likeness (counterintuitive finding)",
    400       "evidence": "Fig 4b shows positive correlation (r=0.10*) between riskiness and human-likeness. Participant comment: 'really like an experienced driver showing off skills' for higher-risk agent",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "The approach opens new avenues for autonomous driving research across diverse applications and user preferences",
    405       "evidence": "Paper demonstrates proof-of-concept in CARLA simulator with 2 driving styles and human validation, but generalization beyond simulation and 2 styles not empirically tested",
    406       "supported": "weak"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "empirical",
    411     "human-studies",
    412     "simulation-based"
    413   ],
    414   "key_findings": "The paper demonstrates that LLM-based driving agents can adopt human-like driving styles (cautious vs risky) through a framework combining demonstrations and feedback. Multi-alignment achieves the most significant behavioral differences in CARLA simulation (collision rates, speed, throttle percentages), and 259 human participants reliably distinguish between the styles in 30-second video clips (p<0.0001). Counterintuitively, humans associate higher riskiness with greater human-likeness despite recognizing riskier driving as less intelligent. The natural language dataset of 24 drivers' decision-making processes provides effective demonstrations, though only 2 driving styles were ultimately used despite identifying 4 in the initial classification.",
    415   "red_flags": [
    416     {
    417       "flag": "Simulation-only validation",
    418       "detail": "All driving tested in CARLA simulator; no real-world validation. Sim-to-real transfer completely unknown. Critical for autonomous driving claims."
    419     },
    420     {
    421       "flag": "Overgeneralized scope claims",
    422       "detail": "Title and conclusion claim general driving style alignment, but paper tests only 2 styles in 1 simulator environment (CARLA Town10). Generalization claims exceed evidence."
    423     },
    424     {
    425       "flag": "Missing ethical approval",
    426       "detail": "Human study with 259 participants and 24 drivers, but no IRB approval, ethics board review, or institutional oversight mentioned. Major concern for human subjects research."
    427     },
    428     {
    429       "flag": "Insufficient statistical reporting",
    430       "detail": "No error bars, confidence intervals, effect sizes (Cohen's d), or sample size justification. Only p-values reported. Makes effect magnitude interpretation impossible."
    431     },
    432     {
    433       "flag": "Unspecified model version",
    434       "detail": "GPT-4 used but no version (gpt-4, gpt-4-turbo, gpt-4-32k), training cutoff date, or hyperparameters (temperature, top-p) provided. Reproducibility compromised."
    435     },
    436     {
    437       "flag": "Small data collection sample",
    438       "detail": "Only 24 drivers for creating human demonstrations—small for capturing diversity of driving styles. No justification for sample size."
    439     },
    440     {
    441       "flag": "Limited baseline comparisons",
    442       "detail": "Only compared against no-alignment baseline. No comparison to other alignment methods (fine-tuning, RLHF, in-context learning) despite these being discussed as existing approaches."
    443     },
    444     {
    445       "flag": "No limitations section",
    446       "detail": "Paper lacks dedicated limitations or threats-to-validity section. Does not acknowledge simulation scope, generalization limits, or methodological constraints."
    447     },
    448     {
    449       "flag": "Short video clips in human evaluation",
    450       "detail": "Only 30-second video clips used for human evaluation of agent driving. May be insufficient to perceive true driving style differences beyond surface metrics (speed, throttle)."
    451     },
    452     {
    453       "flag": "Partial environment specification",
    454       "detail": "CARLA and Python versions provided, but key dependencies missing (packages, API libraries). Insufficient for reproduction without accessing GitHub repo."
    455     }
    456   ],
    457   "cited_papers": [
    458     {
    459       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    460       "authors": "Wei, J. et al.",
    461       "year": 2022,
    462       "relevance": "Core reasoning technique (CoT) used in Driver Agent decision-making; foundational for the framework's planning capability"
    463     },
    464     {
    465       "title": "LLM-planner: Few-shot grounded planning for embodied agents with large language models",
    466       "authors": "Song, C.H. et al.",
    467       "year": 2023,
    468       "relevance": "Few-shot learning approach for embodied agent planning; directly relevant to using demonstrations for style alignment"
    469     },
    470     {
    471       "title": "Driving with llms: Fusing object-level vector modality for explainable autonomous driving",
    472       "authors": "Chen, L. et al.",
    473       "year": 2023,
    474       "relevance": "Prior work on LLM-based autonomous driving; shows progression from perception to LLM-based decision-making"
    475     },
    476     {
    477       "title": "DriveGPT4: Interpretable end-to-end autonomous driving via large language model",
    478       "authors": "Xu, Z. et al.",
    479       "year": 2023,
    480       "relevance": "Concurrent work on end-to-end LLM driving agents; demonstrates interpretability in autonomous driving"
    481     },
    482     {
    483       "title": "Training language models to follow instructions with human feedback",
    484       "authors": "Ouyang, L. et al.",
    485       "year": 2022,
    486       "relevance": "RLHF technique; represents the costly human feedback approach that this work aims to improve upon with coach agent"
    487     },
    488     {
    489       "title": "Reflexion: Language agents with verbal reinforcement learning",
    490       "authors": "Shinn, N. et al.",
    491       "year": 2024,
    492       "relevance": "Agent self-reflection and feedback mechanisms; related to Coach Agent's guideline generation approach"
    493     },
    494     {
    495       "title": "The mind in the machine: Anthropomorphism increases trust in an autonomous vehicle",
    496       "authors": "Waytz, A., Heafner, J., & Epley, N.",
    497       "year": 2014,
    498       "relevance": "Human trust and anthropomorphism in AVs; relevant to motivation for human-like driving style alignment"
    499     },
    500     {
    501       "title": "Human-like driving behaviour emerges from a risk-based driver model",
    502       "authors": "Kolekar, S., de Winter, J., & Abbink, D.",
    503       "year": 2020,
    504       "relevance": "Risk-based models of human driving; provides theoretical foundation for driving style dimensions (risky/cautious)"
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 2,
    510       "justification": "Code and dataset released on GitHub, enabling practitioners to implement the framework. However, requires CARLA setup, Python 3.7, GPT-4 API access, and is only validated in simulation. Not yet deployable for real autonomous vehicles."
    511     },
    512     "surprise_contrarian": {
    513       "score": 2,
    514       "justification": "The key finding that humans associate higher riskiness with greater human-likeness runs counter to safety intuition. However, most other results confirm expected behavior (cautious agents safer, multi-alignment better than components alone)."
    515     },
    516     "fear_safety": {
    517       "score": 1,
    518       "justification": "LLM-powered agents making driving decisions raise autonomy concerns, but the work is confined to simulation. No discussion of safety failures, adversarial scenarios, or out-of-distribution driving. Limited safety-relevant exploration."
    519     },
    520     "demo_ability": {
    521       "score": 2,
    522       "justification": "Code publicly released and dataset available, allowing others to run the framework. Requires CARLA installation and GPT-4 API setup, which are non-trivial barriers but doable for resourced teams. Demo potential moderately high."
    523     },
    524     "brand_recognition": {
    525       "score": 2,
    526       "justification": "The Institute for AI Industry Research at Tsinghua University is a respected institution, but it is not a globally recognized top-tier AI research lab. Tsinghua carries prestige, but this lab is not widely known in the AI research community."
    527     }
    528   },
    529   "hn_data": {
    530     "threads": [
    531       {
    532         "hn_id": "45923139",
    533         "title": "Chinese co's roadmap for aneutronic fusion",
    534         "points": 11,
    535         "comments": 3,
    536         "url": "https://news.ycombinator.com/item?id=45923139"
    537       },
    538       {
    539         "hn_id": "35314773",
    540         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    541         "points": 4,
    542         "comments": 1,
    543         "url": "https://news.ycombinator.com/item?id=35314773"
    544       },
    545       {
    546         "hn_id": "41365788",
    547         "title": "Quantum error correction below the surface code threshold",
    548         "points": 3,
    549         "comments": 2,
    550         "url": "https://news.ycombinator.com/item?id=41365788"
    551       },
    552       {
    553         "hn_id": "42375612",
    554         "title": "Quantum error correction below the surface code threshold",
    555         "points": 3,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=42375612"
    558       },
    559       {
    560         "hn_id": "35298128",
    561         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    562         "points": 3,
    563         "comments": 0,
    564         "url": "https://news.ycombinator.com/item?id=35298128"
    565       },
    566       {
    567         "hn_id": "43563070",
    568         "title": "Cordic Is All You Need",
    569         "points": 2,
    570         "comments": 0,
    571         "url": "https://news.ycombinator.com/item?id=43563070"
    572       },
    573       {
    574         "hn_id": "41371342",
    575         "title": "Google proves Fault-Tolerant Quantum Computing is possible",
    576         "points": 2,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=41371342"
    579       },
    580       {
    581         "hn_id": "35397720",
    582         "title": "Reflexion: An autonomous agent with dynamic memory and self-reflection",
    583         "points": 2,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=35397720"
    586       },
    587       {
    588         "hn_id": "22791011",
    589         "title": "A physicist view of the airborne infection",
    590         "points": 2,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=22791011"
    593       },
    594       {
    595         "hn_id": "47221336",
    596         "title": "Show HN: Benchmarking the Keep memory system with LoCoMo",
    597         "points": 1,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=47221336"
    600       }
    601     ],
    602     "top_points": 11,
    603     "total_points": 33,
    604     "total_comments": 6
    605   }
    606 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs