ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (27431B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Guiding AI to Fix Its Own Flaws: An Empirical Study on LLM-Driven Secure Code Generation",
      6     "authors": [
      7       "Hao Yan",
      8       "Swapneel Suhas Vaidya",
      9       "Xiaokuan Zhang",
     10       "Ziyu Yao"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2506.23034",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's core claims (vulnerability rates 9.8%–42.1%, advanced models benefiting from hints and feedback) are directly supported by Tables 3–8 with per-model breakdowns across two benchmarks.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper makes causal claims ('explained feedback enhances repair effectiveness', 'contextualized hints reduce vulnerability rates') but uses no statistical significance tests, reports single-run results, and has a significant confound: explained feedback is generated by GPT-4o for all models, biasing results toward GPT-4o-like reasoning.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion states 'LLMs are inherently prone to generating insecure code' and findings are stated broadly, yet the study covers only Python, two benchmarks, eight models, and CodeQL-detectable vulnerabilities—limitations acknowledged in threats-to-validity but not consistently carried into the main claims.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper raises the question of why GPT-4o scores worse than GPT-3.5 ('architectural modifications, broader training data, or altered fine-tuning?') but does not resolve it; alternative explanations for improved repair with explained feedback (e.g., GPT-4o-written feedback advantaging GPT-4o in repair) are not discussed.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper is explicit that CodeQL static analysis is the proxy for security, defines TarV-R and AllV-R precisely, and acknowledges in the threats section that CodeQL 'fails to capture dynamic vulnerabilities or those that manifest under specific runtime conditions.'",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 6 contains a dedicated 'Threats to validity' subsection with multiple specific paragraphs, not a single concluding sentence.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats cited: CodeQL's inability to detect dynamic/runtime vulnerabilities, Python-only scope, reliance on only two datasets, and the static single-turn evaluation not reflecting multi-turn real-world development interactions.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly bounds results to Python, CodeQL-detectable CWEs, two benchmark datasets, and eight instruction-tuned models; these are stated in the threats section and the experimental setup.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgments section discloses funding from Virginia CCI, GMU GRA Fellowship, GMU Office of Research Computing, and NSF (Award 2018631).",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors list George Mason University as their affiliation on the title page; no commercial affiliations with evaluated LLM vendors are present.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Funders are academic/government (CCI, NSF, GMU) with no financial stake in which LLM performs better on security benchmarks.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of patents, equity, or consulting relationships appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: 'vulnerability' (exploitable weakness), TarV-R and AllV-R (with formulas), 'proactive vulnerability prevention' vs 'post-hoc vulnerability repair', 'direct feedback' vs 'explained feedback', and 'contextualized vulnerability hints' are all explicitly defined.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit research questions (RQ1–RQ3) and a numbered contributions list clearly state the paper's empirical and practical contributions.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Sections 2.1–2.4 provide detailed related work that situates each contribution against prior studies (Pearce et al., Tony et al., Nong et al., etc.) and explicitly identifies gaps the paper addresses.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "An anonymized repository is provided at anonymous.4open.science; the code is currently accessible, though the anonymized URL is typically temporary for double-blind review.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Both benchmarks used (SecurityEval and SecCodePLT) are publicly available datasets; no custom dataset was created.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or pinned dependency list is provided; the paper mentions CodeQL and specific model APIs but gives no reproducible environment specification.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions appear in the paper; the code repo link is given but no README or methodology walkthrough is referenced.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 3–8 are reported as single percentage values with no confidence intervals, error bars, or standard deviations.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests (t-tests, chi-squared, bootstrap, etc.) are applied to any comparative claims despite the paper making numerous comparisons across conditions and models.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Tables 4, 7, and 8 report subscript deltas (e.g., '−12.4%') showing absolute percentage-point changes relative to baseline, providing effect size context.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Sample sizes (1,071 SecCodePLT questions, 121 SecurityEval questions) are inherited from the benchmarks with no power analysis or justification for whether they are sufficient to detect the observed effects.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Results appear to be single-run evaluations with no repeated runs; no variance, standard deviation, or inter-run consistency is reported.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Vanilla prompt results (Table 3) serve as the explicit baseline for all hint and repair comparisons, with subscript deltas consistently referencing the baseline.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include GPT-4o-0513, GPT-3.5-turbo-0125, DeepSeek-Coder-V2-Lite (2024), Llama3.1/3.2, and StarCoder2—all state-of-the-art models at the time of writing.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Table 7 ablates hint quality (CWE Definition vs. Contextualized Hints); Table 8 ablates feedback type (Direct vs. Explained); Table 5 ablates hint relevance (with vs. without target vulnerability).",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two distinct metrics are used: TarV-R (target vulnerability rate) and AllV-R (any-vulnerability rate), capturing different aspects of code security.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Two human evaluators validated 80 hint-definition pairs (Section 5.2) and 160 randomly sampled explained feedback instances (Section 5.3) to verify LLM-judge accuracy.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "SecurityEval and SecCodePLT are used as fixed evaluation benchmarks; no model is trained or fine-tuned, so the benchmarks serve as held-out test sets.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Figure 6 breaks down the top-10 vulnerability types per model on SecCodePLT; Tables 3–8 break down results per model across both benchmarks.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 5.2 explicitly discusses cases where hints increase vulnerability rates (e.g., CodeLlama-7B on SecurityEval) and attributes this to imprecise or irrelevant hints; Section 5.3 discusses models that show zero improvement from feedback.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Several models show increased vulnerability rates when provided with self-generated hints (Table 4, e.g., CodeLlama-34B +7.5pp AllV-R on SecurityEval); CodeLlama models show no repair improvement from either feedback type (Table 8).",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 2 lists specific versioned model names including 'GPT-3.5-turbo-0125' and 'GPT-4o-0513', and the text references specific model release identifiers for open-weight models.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Figures 2–5 display the complete prompt templates with actual text, including the vanilla generation prompt, hint generation prompt, and both feedback repair prompts, with example inputs and outputs.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Temperature, top-p, max tokens, and other sampling hyperparameters are not reported anywhere in the paper.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Sections 3.2 and 3.3 describe the two-step hint prediction → code generation pipeline and the CodeQL feedback loop in sufficient detail to understand the scaffolding architecture.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The paper documents exclusion of 6 CWEs (274 samples) from SecCodePLT not supported by CodeQL, leaving 21 CWEs and 1,071 samples, and explains how both benchmarks were prepared.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw model outputs, CodeQL detection results, and generated hints are not explicitly stated to be released; only the code (not data) is linked via the anonymized repository.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The paper describes how coding questions from SecurityEval and SecCodePLT were used, how CodeQL was applied for detection, and how the subset of vulnerable code was collected for repair experiments.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants were recruited; this is a benchmark evaluation study using publicly available datasets.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The three-stage pipeline is documented: (1) LLM generates code from benchmark prompts, (2) CodeQL detects vulnerabilities, (3) metrics computed; the hint and repair pipelines add well-described intermediate steps.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The paper does not state training data cutoffs for any of the eight evaluated models, despite this being relevant to potential benchmark contamination.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "The paper does not discuss whether any evaluated models (particularly GPT-4o or GPT-3.5) may have been trained on SecurityEval (2022) or SecCodePLT (2024) data.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "SecurityEval was published in 2022, before the training cutoffs of most evaluated models; GPT-4o and similar models could have been trained on this data, but this is not acknowledged.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants; the human annotation was LLM output validation, not a human subjects study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects study; IRB approval not applicable.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants enrolled in the study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs or inference latency figures are reported despite making thousands of API calls to GPT-3.5/4o for both evaluation and explained feedback generation.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "The acknowledgments mention use of GMU's Office of Research Computing resources but provide no quantification of compute hours, GPU-hours, or dollar cost.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "All evaluated LLMs generate vulnerable code at rates from 9.8% to 42.1% (AllV-R) depending on model and benchmark.",
    374       "evidence": "Table 3 shows AllV-R ranging from 9.8% (DeepSeekV2-16B on SecCodePLT) to 42.1% (DeepSeekV2-16B on SecurityEval) across all 8 models and 2 benchmarks.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Self-generated vulnerability hints can reduce vulnerability rates for capable models (e.g., GPT-4o achieves −12.4pp TarV-R) but increase rates for models that generate irrelevant or imprecise hints.",
    379       "evidence": "Table 4 shows GPT-4o reducing TarV-R by 12.4pp on both datasets, while CodeLlama-34B increases AllV-R by 7.5pp on SecurityEval; Table 5 shows that hints lacking the target vulnerability consistently increase TarV-R.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Contextualized hints (scenario-grounded) outperform bare CWE definitions in reducing vulnerability rates.",
    384       "evidence": "Table 7 shows contextualized hints achieving larger TarV-R reductions than CWE definitions on SecurityEval for most models (e.g., DeepSeekV2-16B: −8.5pp vs −6.4pp TarV-R).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Explained CodeQL feedback significantly outperforms direct feedback for powerful models in post-hoc repair (GPT-4o: −28.1pp AllV-R on SecurityEval).",
    389       "evidence": "Table 8 shows GPT-4o and DeepSeekV2-16B achieving much larger AllV-R reductions with explained vs direct feedback, while weaker models show no difference.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Newer models (GPT-4o) have higher vulnerability rates than older models (GPT-3.5-turbo), and the Llama3 series is more vulnerable than CodeLlama.",
    394       "evidence": "Table 3: GPT-4o TarV-R 15.0% vs GPT-3.5 7.7% on SecCodePLT; GPT-4o 25.6% vs GPT-3.5 12.4% on SecurityEval. CodeLlama-7B AllV-R 14.7% vs Llama3.1-8B 40.5% on SecurityEval.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Code-optimized models (CodeLlama, StarCoder2, DeepSeek-Coder) generate less vulnerable code on the structured SecCodePLT but do not maintain this advantage on the broader SecurityEval.",
    399       "evidence": "Table 3 shows StarCoder2-15B at TarV-R 4.0% on SecCodePLT but 24.0% on SecurityEval; DeepSeekV2-16B 4.2% vs 27.3%.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "GPT-4o generates the most precise self-generated vulnerability hints (98.6% precise) while CodeLlama-7B generates the least precise (46.5%).",
    404       "evidence": "Precision rates are reported in Section 5.2 from GPT-4o-as-judge evaluations validated by two human annotators at 95.65% precision and 91.67% recall on 80 sampled pairs.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "empirical"
    411   ],
    412   "key_findings": "Across eight LLMs and two Python security benchmarks, all models generate vulnerable code with AllV-R ranging from 9.8% to 42.1%, with vulnerability distributions largely overlapping across models. Self-generated vulnerability hints reduce insecure code generation for capable models only when the hints are relevant (include the target CWE) and precise—irrelevant hints actively increase vulnerability rates for most models. Contextualized, scenario-grounded hints outperform bare CWE definitions for most models. Post-hoc repair with explained CodeQL feedback (generated by GPT-4o) substantially outperforms raw CodeQL output for powerful models (GPT-4o: −28.1pp AllV-R on SecurityEval), while weaker instruction-following models show minimal improvement from either feedback type.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical testing",
    416       "detail": "All comparative claims (hints reduce vulnerabilities, explained > direct feedback) are made on raw percentage differences with no significance tests, confidence intervals, or multiple-run variance estimates."
    417     },
    418     {
    419       "flag": "GPT-4o as both evaluator and subject",
    420       "detail": "Explained feedback is generated by GPT-4o for all models' code; GPT-4o also evaluates hint preciseness. This creates a circular advantage: GPT-4o-generated feedback is optimized for GPT-4o's reasoning style, which may explain why GPT-4o benefits most from explained feedback."
    421     },
    422     {
    423       "flag": "Benchmark contamination unaddressed",
    424       "detail": "SecurityEval was published in 2022; training cutoffs for GPT-4o and GPT-3.5-turbo are not stated and the possibility that evaluated models trained on these benchmark examples is never discussed."
    425     },
    426     {
    427       "flag": "Hyperparameters absent",
    428       "detail": "Temperature, top-p, and other sampling hyperparameters are not reported for any model, making reproduction uncertain and raising questions about whether results are stable across sampling conditions."
    429     },
    430     {
    431       "flag": "Single-run results",
    432       "detail": "No repeated runs or variance estimates are provided; it is unclear whether the percentage differences reported are within noise bounds for the given sample sizes."
    433     },
    434     {
    435       "flag": "Anonymous temporary code repo",
    436       "detail": "The code link points to anonymous.4open.science, a temporary anonymization service for double-blind review that typically expires after the review period, threatening long-term reproducibility."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Do users write more insecure code with AI assistants?",
    442       "relevance": "Directly motivates the paper by showing AI code assistants increase insecure code production; key prior work establishing the problem."
    443     },
    444     {
    445       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    446       "relevance": "Foundational empirical study finding 40% of GitHub Copilot code was vulnerable across 18 CWE types; direct predecessor to this work."
    447     },
    448     {
    449       "title": "SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI",
    450       "relevance": "One of the two primary benchmarks used in this study; covers 27 CWEs with 1,345 synthesized coding problems."
    451     },
    452     {
    453       "title": "SecurityEval Dataset: Mining Vulnerability Examples to Evaluate Machine Learning-Based Code Generation Techniques",
    454       "relevance": "The other primary benchmark: 121 problems spanning 69 CWEs derived from real-world coding scenarios."
    455     },
    456     {
    457       "title": "Prompting techniques for secure code generation: A systematic investigation",
    458       "relevance": "Direct predecessor evaluating CWE-specific templates and RCI prompting on GPT-series; this paper extends that work to self-generated hints and open-weight models."
    459     },
    460     {
    461       "title": "Examining zero-shot vulnerability repair with large language models",
    462       "relevance": "Key prior work on prompting-based zero-shot vulnerability repair; this paper compares against that paradigm."
    463     },
    464     {
    465       "title": "Purple Llama CyberSecEval: A secure coding benchmark for language models",
    466       "relevance": "Meta's CyberSecEval benchmark series for evaluating LLM security, provides comparative context for the field."
    467     },
    468     {
    469       "title": "How secure is code generated by ChatGPT?",
    470       "relevance": "Found GPT-3.5 produced 76% vulnerable code; provides baseline comparison for this paper's findings on GPT models."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Developers using LLMs for Python code can directly apply the finding that explained feedback and contextualized hints reduce vulnerabilities, with specific guidance on model selection."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "The finding that GPT-4o is MORE vulnerable than GPT-3.5-turbo, and that self-generated hints can actively INCREASE vulnerability rates, both challenge the assumption that newer/bigger is safer."
    481     },
    482     "fear_safety": {
    483       "score": 2,
    484       "justification": "Quantifies that every evaluated model generates vulnerable code, with AllV-R ranging from 9.8% to 42.1% depending on model and benchmark, reinforcing concerns about LLM code assistants in security-critical applications."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "The GPT-4o vs GPT-3.5 reversal is mildly counterintuitive but not a major controversy; no conflict with other labs or high-profile disagreement."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "Both benchmarks are publicly available and the code is released; practitioners can reproduce the evaluation setup and test the hint/feedback approaches with API access."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "George Mason University researchers with no famous lab affiliation; evaluates well-known models (GPT-4o, Llama3) which adds some recognition value."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "43800345",
    503         "title": "Creation of a black hole bomb instability in an electromagnetic system",
    504         "points": 5,
    505         "comments": 2,
    506         "url": "https://news.ycombinator.com/item?id=43800345",
    507         "created_at": "2025-04-26T02:18:29Z"
    508       }
    509     ],
    510     "top_points": 5,
    511     "total_points": 5,
    512     "total_comments": 2
    513   }
    514 }

Impressum · Datenschutz