scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19152B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "A Hazard Analysis Framework for Code Synthesis Large Language Models",
      6     "authors": [
      7       "Heidy Khlaaf",
      8       "Pamela Mishkin",
      9       "Joshua Achiam",
     10       "Gretchen Krueger",
     11       "Miles Brundage"
     12     ],
     13     "year": 2022,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2207.14157",
     16     "doi": "10.48550/arXiv.2207.14157"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's core claims — that Codex exceeds prior SOTA (attributed to [13]) and that the paper outlines a hazard analysis framework — are substantiated by Sections 2 and 3 respectively.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper asserts causal claims such as synthesis tools 'concentrat[ing] power and exacerbat[ing] inequality' and increasing job displacement with no study design or evidence; these are listed as speculative hazards rather than demonstrated relationships.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The framework was built specifically for Codex but is freely generalized to 'all code synthesis LLMs' and to broad societal impacts (economic, environmental, political) well beyond what the qualitative Codex capability evaluation supports.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper presents its hazard classifications and risk prioritization as given without considering alternative interpretations of identified risks or whether different evaluation approaches might yield different rankings.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "The paper does not report quantitative outcome measurements; the capability evaluation is qualitative and no measurement-to-claim gap applies.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 2.3.1 is titled 'Evaluation and Limitations' and substantively discusses specific limitations of the capability evaluation methodology.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 2.3.1 identifies specific threats: reliance on training data distribution, language coverage limited to Python/JS/TS/Ruby, inability to formally verify outputs for dynamically typed languages, and the high human-expert cost of evaluation.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly bounds scope to API-deployed code synthesis LLMs and notes that 'high-level systems specifications (e.g. requirements for an aircraft) are currently beyond the scope of Codex's capabilities,' directly constraining risk priorities.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding statement appears; all authors are listed as OpenAI employees (with Khlaaf noting 'work done while at OpenAI') but no institutional funding disclosure is provided.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed in the header: four authors explicitly affiliated with OpenAI, and Khlaaf's footnote discloses her work was done while at OpenAI.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OpenAI employees are conducting a safety analysis of their own product (Codex/GitHub Copilot); the organization producing the system and the organization evaluating it are the same.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, equity disclosure, patent declaration, or consulting conflict appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key technical terms are defined: 'Hazard Severity Categories' (Table 1), 'Hazard Risk Index' (Table 3), 'alignment' is operationalized as 'the degree to which the behavior of the AI does or does not accord with user intentions,' and specification abstraction levels are defined in Section 2.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly and explicitly states it contributes (1) a capability evaluation framework for code synthesis LLMs and (2) a hazard analysis framework for Codex-like deployment systems, stated in both abstract and introduction.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper engages substantively with Leveson's safety engineering work [26], prior synthesis benchmarks [20, 21, 30, 35], ethical risk frameworks for LLMs [34], and bias research [2, 5, 7], explicitly noting where existing metrics (McCabe CC) are insufficient.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "position": {
    120       "argument_quality": {
    121         "argument_internally_consistent": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper follows a coherent logical chain: capability evaluation → capabilities baseline → hazard analysis informed by baseline → risk prioritization → mitigations. No internal contradictions are present.",
    125           "source": "haiku"
    126         },
    127         "counterarguments_addressed": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper does not address counterarguments — e.g., whether traditional hazard analysis is appropriate for ML systems at all, whether the risk prioritization choices are correct, or whether a governance-first vs. technical-mitigation-first approach might be preferable.",
    131           "source": "haiku"
    132         },
    133         "analogies_appropriate": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper explicitly acknowledges that 'unlike traditional safety-critical systems, the potential safety hazards, failure modes, and risks of ML models are often poorly understood,' flagging the limits of the analogy before proceeding with it.",
    137           "source": "haiku"
    138         },
    139         "prescriptions_proportional": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Mitigations are partitioned into 'Plausible and Immediate' vs. 'Long Term,' and Section 5 explicitly invokes the ALARP principle requiring balance between safety benefits and implementation costs — proportionate framing throughout.",
    143           "source": "haiku"
    144         },
    145         "evidence_for_claims_cited": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Factual claims about LLM bias are supported with citations [2, 5, 7, 10]; capability claims reference the Codex evaluation paper [13]; hazard analysis methodology references established DoD and Leveson standards [15, 26].",
    149           "source": "haiku"
    150         },
    151         "alternatives_discussed": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No alternative safety frameworks (e.g., red teaming as primary mechanism, model cards, governance-first approaches, or other AI risk taxonomies) are compared against the proposed hazard analysis approach.",
    155           "source": "haiku"
    156         },
    157         "historical_context_accurate": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Historical references are accurate: Lamport's temporal logic [23], FlashFill for string synthesis [17], genetic programming community [22], and the formal synthesis community [29] are correctly cited and characterized.",
    161           "source": "haiku"
    162         }
    163       },
    164       "clarity_and_scope": {
    165         "key_terms_defined_precisely": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Terms including 'high-level requirements,' 'derived sub-requirements,' 'hyperproperties,' 'noninterference,' and 'alignment' are all given precise operational definitions within the paper's context rather than left as assumed shared knowledge.",
    169           "source": "haiku"
    170         },
    171         "engages_with_existing_literature": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper engages substantively with Weidinger et al. [34] on LLM ethical risks, prior synthesis benchmarks, and safety engineering literature, explicitly noting where existing metrics are insufficient for LLM evaluation purposes.",
    175           "source": "haiku"
    176         },
    177         "intended_audience_clear": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The audience is implied ('those constructing code synthesis LLMs') but never explicitly stated; it is unclear whether the framework primarily targets ML practitioners, safety engineers, legal/policy teams, or all three.",
    181           "source": "haiku"
    182         },
    183         "assumptions_stated": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Key assumptions are stated: deployment via API, users are 'versed and familiar with defining system requirements' (footnote 3), and 'quantitative data and analysis is not currently always possible to achieve' for hazard probability estimation.",
    187           "source": "haiku"
    188         },
    189         "scope_of_applicability_discussed": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The paper explicitly states the framework was 'initially developed to study Codex specifically' but is 'of general interest in the safe development and deployment of code synthesis LLMs,' and notes 'different mitigations may be appropriate for other kinds of systems.'",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Codex exceeds previous state-of-the-art in code synthesis capacity",
    201       "evidence": "Attributed to the prior OpenAI Codex evaluation paper [13]; not independently demonstrated in this paper",
    202       "supported": "moderate"
    203     },
    204     {
    205       "claim": "Codex struggles with variable interdependencies involving more than three variables",
    206       "evidence": "Section 2.3.1 qualitative evaluation reports failure 'when faced with inter-reasoning over four or more variable relationships, especially when given unique prompts'",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "Codex fails at all concurrency and parallelism synthesis tasks",
    211       "evidence": "Section 2.3.1 states 'All results thus far did not correctly synthesize solutions requiring fairness, atomicity, and/or synchronization'",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "Traditional hazard analysis from safety-critical systems engineering can be adapted for LLM risk assessment",
    216       "evidence": "Section 3 demonstrates an adaptation with novel HSC (Table 1) and Losses Definitions (Table 2), but the approach is not validated against outcomes or compared to alternatives",
    217       "supported": "weak"
    218     },
    219     {
    220       "claim": "Code synthesis tools risk exacerbating economic inequality and causing job displacement",
    221       "evidence": "Listed as hazards in Section 4.5 with no empirical evidence, citations, or economic analysis — purely speculative assertions",
    222       "supported": "unsupported"
    223     },
    224     {
    225       "claim": "Capability evaluation must precede risk assessment for LLMs because failure modes are poorly understood",
    226       "evidence": "Section 2.3.1 explicitly argues: 'traditional risk assessments require implicit assumptions and knowledge regarding a prospective system's capacities, limitations, and failure modes'",
    227       "supported": "moderate"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "theoretical",
    232     "qualitative"
    233   ],
    234   "key_findings": "The paper proposes a two-part framework: a qualitative capability evaluation benchmarking code synthesis LLMs against specification complexity and computational reasoning demands, and a hazard analysis adapted from safety-critical systems engineering (SHA-like) that maps capabilities to a prioritized risk register. Codex is found limited to module-level, single-function synthesis — failing at concurrency, architecture-level tasks, and hyperproperties — which bounds near-term safety-critical misuse risk. The highest-priority risks are discrimination and bias in generated code (HRI 2B), alignment failures producing buggy outputs, and longer-term economic displacement. Mitigations are partitioned into immediately implementable controls (rate limiting, output filtering, UX safety design, documentation) and long-term research directions (AST-aware model architectures, fine-tuning on curated data, economic impact research).",
    235   "red_flags": [
    236     {
    237       "flag": "Severe conflict of interest",
    238       "detail": "All authors are OpenAI employees assessing the safety of OpenAI's own product (Codex/GitHub Copilot) with no independent review, third-party validation, or acknowledgment of this conflict in the paper."
    239     },
    240     {
    241       "flag": "Qualitative evaluation without quantitative metrics",
    242       "detail": "The capability evaluation in Section 2 describes Codex performance in qualitative prose ('performs relatively well,' 'struggles') with no success rates, inter-rater reliability, or quantitative benchmarks despite comparing to 'human ability.'"
    243     },
    244     {
    245       "flag": "Unsupported societal impact claims",
    246       "detail": "Section 4.5 lists economic displacement, inequality exacerbation, and environmental harm as hazards with no supporting evidence, economic modeling, or citations."
    247     },
    248     {
    249       "flag": "Framework not empirically validated",
    250       "detail": "The hazard analysis framework is never tested for completeness, coverage recall, or predictive accuracy against outcomes — presented as a methodological contribution without validation."
    251     }
    252   ],
    253   "cited_papers": [
    254     {
    255       "title": "Evaluating Large Language Models Trained on Code",
    256       "relevance": "Primary reference for Codex capabilities and prior safety analysis; this paper extends that work with a formal hazard framework"
    257     },
    258     {
    259       "title": "Ethical and social risks of harm from Language Models",
    260       "relevance": "Companion risk taxonomy for LLMs (Weidinger et al.); cited for risk category context and as a complementary framework"
    261     },
    262     {
    263       "title": "Engineering a safer world: Systems thinking applied to safety",
    264       "relevance": "Foundational Leveson safety engineering methodology that the hazard analysis approach is adapted from"
    265     },
    266     {
    267       "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
    268       "relevance": "Cited for bias and open-ended generation risks in LLMs trained on internet data"
    269     },
    270     {
    271       "title": "Model cards for model reporting",
    272       "relevance": "Referenced as a recommended documentation format for communicating model limitations and characteristics to users"
    273     },
    274     {
    275       "title": "An Empirical Cybersecurity Evaluation of GitHub Copilot's Code Contributions",
    276       "relevance": "Concurrent empirical study providing independent validation of security vulnerability concerns raised in this framework"
    277     },
    278     {
    279       "title": "Toward trustworthy AI development: mechanisms for supporting verifiable claims",
    280       "relevance": "Prior work by overlapping authors on verifiable AI safety mechanisms; contextualizes this framework within OpenAI's safety research program"
    281     },
    282     {
    283       "title": "Process for Adapting Language Models to Society (PALMS) with Values-Targeted Datasets",
    284       "relevance": "Cited as evidence that fine-tuning on curated datasets can reduce discriminatory outputs in language models"
    285     }
    286   ],
    287   "engagement_factors": {
    288     "practical_relevance": {
    289       "score": 3,
    290       "justification": "Directly actionable for safety and ML engineering teams deploying code synthesis APIs — provides risk tables, severity categories, and partitioned mitigation checklists."
    291     },
    292     "surprise_contrarian": {
    293       "score": 1,
    294       "justification": "Applies established safety engineering (SHA) to LLMs; novel in framing for 2022 but not counterintuitive, and the identified risks largely align with community priors."
    295     },
    296     "fear_safety": {
    297       "score": 3,
    298       "justification": "Core purpose is surfacing AI safety hazards including safety-critical system misuse, discrimination, security vulnerabilities, and economic disruption from a major lab."
    299     },
    300     "drama_conflict": {
    301       "score": 2,
    302       "justification": "OpenAI employees publishing a safety critique of their own flagship product creates an implicit tension between completeness and organizational self-interest."
    303     },
    304     "demo_ability": {
    305       "score": 1,
    306       "justification": "The framework is entirely conceptual; no tool, dataset, or code is released — readers cannot apply it without substantial expert effort."
    307     },
    308     "brand_recognition": {
    309       "score": 3,
    310       "justification": "Published by OpenAI researchers about Codex, the model powering GitHub Copilot, at the time of Copilot's public launch — high brand recognition."
    311     }
    312   },
    313   "hn_data": {
    314     "threads": [
    315       {
    316         "hn_id": "32281497",
    317         "title": "A hazard analysis framework for code synthesis large language models",
    318         "points": 18,
    319         "comments": 0,
    320         "url": "https://news.ycombinator.com/item?id=32281497",
    321         "created_at": "2022-07-29T20:39:18Z"
    322       },
    323       {
    324         "hn_id": "41138059",
    325         "title": "LazyLLM: Dynamic Token Pruning for Efficient Long Context LLM Inference",
    326         "points": 2,
    327         "comments": 0,
    328         "url": "https://news.ycombinator.com/item?id=41138059",
    329         "created_at": "2024-08-02T11:50:08Z"
    330       },
    331       {
    332         "hn_id": "32250707",
    333         "title": "DayDreamer: World models for physical robot learning",
    334         "points": 2,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=32250707",
    337         "created_at": "2022-07-27T14:20:33Z"
    338       }
    339     ],
    340     "top_points": 18,
    341     "total_points": 22,
    342     "total_comments": 0
    343   }
    344 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs