scan.json (23811B)
1 { 2 "paper": { 3 "title": "A Hazard Analysis Framework for Code Synthesis Large Language Models", 4 "authors": [ 5 "Heidy Khlaaf", 6 "Pamela Mishkin", 7 "Joshua Achiam", 8 "Gretchen Krueger", 9 "Miles Brundage" 10 ], 11 "year": 2022, 12 "venue": "arXiv", 13 "arxiv_id": "2207.14157", 14 "doi": "10.48550/arXiv.2207.14157" 15 }, 16 "scan_version": 2, 17 "active_modules": [], 18 "methodology_tags": ["qualitative", "theoretical"], 19 "key_findings": "The paper proposes a hazard analysis framework adapted from safety-critical systems engineering for code synthesis LLMs like Codex. Qualitative evaluation finds Codex capable at constrained single-function tasks but unable to handle concurrency, hyperproperties, or high-level multi-module specifications. The risk assessment identifies discrimination/bias, security vulnerabilities, safety-critical misuse, alignment failures, and economic displacement as priority hazards, each mapped to novel hazard severity categories and a hazard risk index.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No source code, repository URL, or supplementary archive is provided. The evaluation test problems and hazard analysis artifacts are not released." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "No evaluation data, test problems, or hazard analysis datasets are released. The paper describes qualitative evaluation results but does not provide the underlying problems or prompts." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No environment specifications, library versions, or Codex API configuration details are provided." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step instructions for reproducing the capability evaluation or hazard analysis process are included." 
41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": false, 46 "answer": false, 47 "justification": "The paper presents only qualitative assessments of Codex capabilities. No quantitative results are reported, so confidence intervals are structurally inapplicable." 48 }, 49 "significance_tests": { 50 "applies": false, 51 "answer": false, 52 "justification": "No quantitative comparisons are made. All evaluations are qualitative descriptions of model behavior." 53 }, 54 "effect_sizes_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "No quantitative measurements are reported. Capability assessments are purely qualitative (e.g., 'encouraging results', 'poor output')." 58 }, 59 "sample_size_justified": { 60 "applies": false, 61 "answer": false, 62 "justification": "This is a qualitative/theoretical framework paper with no quantitative sample." 63 }, 64 "variance_reported": { 65 "applies": false, 66 "answer": false, 67 "justification": "No quantitative experiments are run, so variance reporting is inapplicable." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": false, 74 "justification": "Codex is evaluated in isolation. No comparison against other code generation systems (e.g., prior synthesis tools, other LLMs) is provided. The paper references human ability as a conceptual baseline but does not systematically compare." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": false, 79 "justification": "No baselines of any kind are included, so the evaluation necessarily lacks contemporary baselines." 80 }, 81 "ablation_study": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a framework paper, not a system with decomposable components. No ablation study is applicable." 
85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "The evaluation uses multiple qualitative dimensions: variable interdependencies, temporal reasoning, concurrency/parallelism, nondeterminism, hyperproperties, specification abstraction level, and automatic architecture determination (Section 2.3.1)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "The capability evaluation in Section 2.3.1 is conducted by human experts who manually interpret and classify Codex model outputs. The paper notes 'a present limitation is that it requires significant effort by a human expert to interpret and classify model outputs' (Section 1)." 95 }, 96 "held_out_test_set": { 97 "applies": false, 98 "answer": false, 99 "justification": "The evaluation is qualitative with no formal train/test split. The concept of a held-out test set is inapplicable to this framework paper." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 2.3.1 provides a per-category breakdown of Codex's capabilities: variable interdependencies, temporal reasoning, concurrency/parallelism, nondeterminism, high-level specification, and hyperproperties, each assessed individually." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 2.3.1 extensively discusses failure cases: Codex fails at inter-reasoning over 4+ variable relationships, fails at concurrency at any level of specification, cannot synthesize cryptographic hyperproperties, and struggles with high-level multi-module specifications." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports numerous negative results: Codex's complete failure at concurrency/parallelism, inability to handle hyperproperties, failure with unique specifications not in training data, generation of syntactically incorrect code, and recommendation of undefined functions and unimported modules (Section 2.3.1)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims Codex 'exceeds the previous state of the art' (supported by reference to [13]), describes the hazard analysis framework (detailed in Sections 3-4), and the evaluation framework (detailed in Section 2). All claims are addressed in the paper body." 122 }, 123 "causal_claims_justified": { 124 "applies": false, 125 "answer": false, 126 "justification": "The paper does not make empirical causal claims. It makes qualitative capability assessments and proposes a framework. Statements like 'Codex struggles to generalize' are descriptive observations, not causal claims requiring controlled study design." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper explicitly bounds findings to Codex and specific languages: 'Codex has been primarily trained on Python, Javascript, Typescript, and Ruby codebases' and 'Codex may thus only be proficient at synthesizing domain solutions optimal for languages for which it has been trained on' (Section 2.3.1). The paper also notes findings may apply to 'code synthesis LLMs like Codex' more broadly but is careful about scope." 
132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "Section 2.3.1 discusses multiple factors affecting Codex outputs: context of existing code, function/variable names, comments, training data distribution, and prompt conciseness/length. These are presented as alternative explanations for observed behavior." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper explicitly argues that measuring code output quality is insufficient — 'we should be evaluating generation and synthesis models against the complexity and expressivity of specification prompts and their capability to understand and execute them if we wish to understand their performance relative to human ability' (Section 2). This directly addresses the proxy-outcome gap." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper refers only to 'Codex' without specifying a model version, snapshot date, or size. The Codex paper [13] is cited but no specific model variant is stated for the evaluations." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The evaluation prompts used to test Codex are described in natural language (e.g., 'prompted to enforce a safety property', 'prompted to synthesize more complex and unique specifications') but no actual prompt text is provided." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "No hyperparameters (temperature, top-p, max tokens, sampling strategy) are reported for any of the Codex evaluations." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. Codex is queried directly for code generation." 
164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "No documentation of how evaluation problems were constructed, selected, or filtered. The paper describes categories of evaluation but not the specific methodology for constructing test problems." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 2.3.1 is titled 'Evaluation and Limitations' and provides substantive discussion of Codex's limitations across multiple capability dimensions. The paper also discusses limitations of the evaluation framework itself." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 2.3.1 discusses specific threats: training data quality and categorization is unreliable, 'one consequential word is often the difference between Codex producing correct or incorrect results', results depend on training data distribution, and the evaluation 'requires significant effort by a human expert to interpret and classify model outputs.'" 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper explicitly states scope boundaries: Codex is limited to Python/JS/TS/Ruby domains, 'high-level systems specifications (e.g. requirements for an aircraft) are currently beyond the scope of Codex's capabilities' (Section 2.3.1), and the hazard analysis targets API/Copilot deployment specifically (Section 1)." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw evaluation data (test problems, Codex outputs, expert assessments) is made available for independent verification." 
193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": false, 197 "justification": "The paper describes the categories of evaluation (variable interdependencies, temporal reasoning, etc.) but does not describe how specific test problems were collected or constructed. The qualitative evaluation methodology lacks procedural detail." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants were recruited. The evaluation was conducted by the authors using self-constructed test problems." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "No documentation of the pipeline from test problem creation to capability assessment. The evaluation process is described at a high level but lacks step-by-step documentation." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding disclosure or acknowledgments section is present in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly stated: four authors from OpenAI, and Khlaaf with a note 'Work done while at OpenAI.' The conflict — OpenAI employees evaluating their own product — is visible from the affiliations." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "The work was conducted at OpenAI, which developed and commercially deploys Codex. OpenAI has a direct financial interest in how Codex's risks and capabilities are characterized. The funder is not independent of the outcome." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is included. 
Several authors are employees of OpenAI, which commercializes Codex, but this conflict is not explicitly declared beyond affiliations." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "The paper does not evaluate Codex on a benchmark. It performs qualitative capability assessment, making training cutoff discussion inapplicable." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "No benchmark evaluation is performed. The qualitative evaluation does not involve a formal test set where overlap could be measured." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "No benchmark evaluation is performed. Contamination concerns are inapplicable to this qualitative framework paper." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. The evaluation was conducted by the authors themselves." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 
284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "This is a framework/theoretical paper. It does not propose a method with inference costs to report." 291 }, 292 "compute_budget_stated": { 293 "applies": false, 294 "answer": false, 295 "justification": "This is a framework/theoretical paper with qualitative evaluation. No significant compute budget to report." 296 } 297 } 298 }, 299 "claims": [ 300 { 301 "claim": "Codex exceeds the previous state of the art in its capacity to synthesize and generate code.", 302 "evidence": "Stated in abstract and Section 1, supported by reference to the Codex evaluation paper [13] (Chen et al. 2021). No independent evaluation is provided in this paper.", 303 "supported": "weak" 304 }, 305 { 306 "claim": "Codex struggles with variable interdependencies involving four or more variables, especially with unique prompts.", 307 "evidence": "Section 2.3.1 states: 'when faced with inter-reasoning over four or more variable relationships, especially when given unique prompts, Codex struggles to deduce the relationship between the presented variables.' Qualitative observation only; no quantitative data provided.", 308 "supported": "weak" 309 }, 310 { 311 "claim": "Codex performs poorly on concurrency and parallelism at any level of specification abstraction.", 312 "evidence": "Section 2.3.1: 'Codex's performance so far indicates poor output and large reasoning gaps when synthesizing code requiring use of concurrency at any level of specification abstraction. All results thus far did not correctly synthesize solutions requiring fairness, atomicity, and/or synchronization.' 
Qualitative assessment without quantitative metrics.", 313 "supported": "weak" 314 }, 315 { 316 "claim": "Codex can accelerate ML model building by generating boilerplate code.", 317 "evidence": "Section 2.3.1 on nondeterminism: 'Codex demonstrated productive results as it was able to effectively generate boilerplate ML code, especially for common portions of well used codebases (e.g., MNIST loading code).' Qualitative observation.", 318 "supported": "weak" 319 }, 320 { 321 "claim": "High-level systems specifications are currently beyond Codex's capabilities.", 322 "evidence": "Section 2.3.1: 'if one were to define specifications that must be solved across multiple modules with automatic determination of program architecture, Codex would struggle to synthesize such requests. This entails that high-level systems specifications (e.g. requirements for an aircraft) are currently beyond the scope.' Qualitative assessment.", 323 "supported": "weak" 324 }, 325 { 326 "claim": "Standard hazard severity categories are insufficient for LLM safety and need expansion.", 327 "evidence": "Section 3 and Tables 1-2 propose expanded hazard severity categories and a novel set of losses (L1-L4) for language model APIs. This is a framework contribution, not an empirical claim.", 328 "supported": "moderate" 329 } 330 ], 331 "red_flags": [ 332 { 333 "flag": "Company evaluating its own product", 334 "detail": "Four of five authors are OpenAI employees evaluating OpenAI's Codex. While the paper identifies risks and limitations, the framing and severity assessments may be influenced by the company's commercial interests. No independent evaluation or external co-authors are involved." 335 }, 336 { 337 "flag": "No quantitative evidence for capability claims", 338 "detail": "All capability assessments in Section 2.3.1 are purely qualitative ('encouraging results', 'poor output', 'struggles to'). 
No systematic benchmarks, success rates, or quantitative metrics support the claimed capability levels. The hazard risk indices (Table 4) depend on these qualitative assessments." 339 }, 340 { 341 "flag": "Non-reproducible methodology", 342 "detail": "The evaluation test problems, prompts, and Codex outputs are not shared. The qualitative evaluation methodology cannot be independently verified or reproduced. The paper acknowledges this indirectly: 'a present limitation is that it requires significant effort by a human expert to interpret and classify model outputs.'" 343 }, 344 { 345 "flag": "Risk probabilities based on undisclosed evaluation", 346 "detail": "The Hazard Risk Index assignments (e.g., '1E - Codex is currently not capable of synthesizing code beyond tightly specified, constrained problem instances') rely on the qualitative capability evaluation, but the evidence underlying these probability estimates is not available for scrutiny." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "Evaluating Large Language Models Trained on Code", 352 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 353 "year": 2021, 354 "arxiv_id": "2107.03374", 355 "relevance": "The Codex paper — foundational work on code synthesis LLM evaluation, directly evaluated in this hazard analysis." 356 }, 357 { 358 "title": "An Empirical Cybersecurity Evaluation of GitHub Copilot's Code Contributions", 359 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 360 "year": 2021, 361 "arxiv_id": "2108.09293", 362 "relevance": "Concurrent study demonstrating security vulnerabilities in Copilot-generated code, directly cited as supporting evidence for security risks." 
363 }, 364 { 365 "title": "Language models are few-shot learners", 366 "authors": ["Tom B Brown", "Benjamin Mann", "Nick Ryder"], 367 "year": 2020, 368 "arxiv_id": "2005.14165", 369 "relevance": "GPT-3 paper documenting bias and safety risks in large language models that carry over to code generation." 370 }, 371 { 372 "title": "Ethical and social risks of harm from Language Models", 373 "authors": ["Laura Weidinger", "John Mellor", "Maribeth Rauh"], 374 "year": 2021, 375 "arxiv_id": "2112.04359", 376 "relevance": "Proposes a taxonomy of LLM risk categories that complements this paper's code-specific hazard analysis." 377 }, 378 { 379 "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?", 380 "authors": ["Emily M Bender", "Timnit Gebru", "Angelina McMillan-Major", "Shmargaret Shmitchell"], 381 "year": 2021, 382 "relevance": "Influential critique of large language model risks including bias, environmental costs, and societal harms." 383 }, 384 { 385 "title": "In-IDE Code Generation from Natural Language: Promise and Challenges", 386 "authors": ["Frank F Xu", "Bogdan Vasilescu", "Graham Neubig"], 387 "year": 2021, 388 "arxiv_id": "2101.11149", 389 "relevance": "Prior work on evaluating in-IDE code generation from natural language, including McCabe Cyclomatic Complexity metrics discussed in Section 2." 390 }, 391 { 392 "title": "Process for Adapting Language Models to Society (PALMS) with Values-Targeted Datasets", 393 "authors": ["Irene Solaiman", "Christy Dennison"], 394 "year": 2021, 395 "arxiv_id": "2106.10328", 396 "relevance": "Proposes fine-tuning on curated datasets to mitigate LLM bias and discrimination, cited as a mitigation strategy." 
397 }, 398 { 399 "title": "A General Language Assistant as a Laboratory for Alignment", 400 "authors": ["Amanda Askell", "Yuntao Bai", "Anna Chen"], 401 "year": 2021, 402 "arxiv_id": "2112.00861", 403 "relevance": "AI alignment research directly relevant to the alignment hazards identified in the code synthesis LLM context." 404 }, 405 { 406 "title": "Toward trustworthy AI development: mechanisms for supporting verifiable claims", 407 "authors": ["Miles Brundage", "Shahar Avin", "Jasmine Wang"], 408 "year": 2020, 409 "arxiv_id": "2004.07213", 410 "relevance": "Discusses verification mechanisms for AI safety claims, relevant to the verification challenges noted for code synthesis LLMs." 411 }, 412 { 413 "title": "Model cards for model reporting", 414 "authors": ["Margaret Mitchell", "Simone Wu", "Andrew Zaldivar"], 415 "year": 2019, 416 "relevance": "Referenced as a documentation format for communicating model characteristics and limitations, recommended as a mitigation strategy." 417 } 418 ] 419 }