scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28113B)
      1 {
      2   "paper": {
      3     "title": "Grok 4 Model Card",
      4     "authors": ["xAI"],
      5     "year": 2025,
      6     "venue": "Technical report"
      7   },
      8   "scan_version": 2,
      9   "active_modules": ["experimental_rigor", "data_leakage"],
     10   "methodology_tags": ["benchmark-eval"],
     11   "key_findings": "Grok 4 is xAI's latest reasoning model evaluated across safety dimensions: abuse potential, concerning propensities, and dual-use capabilities. The model achieves near-zero refusal bypass rates with system prompt mitigations, but retains a 0.43 dishonesty rate on the MASK benchmark and 0.14 harmful task completion on AgentHarm. Grok 4 achieves superhuman performance on biology benchmarks (BioLP-Bench 47%, VCT 60% vs human expert 38.4% and 22.1%), prompting xAI to deploy targeted bio/chem filters. End-to-end offensive cyber capabilities (CyBench 43%) are assessed as below human professional level.",
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No source code or evaluation scripts are released. The paper references publishing system prompts at github.com/xai-org/grok-prompts but this is product configuration, not evaluation code."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "Internal evaluation datasets (refusal queries, soft bias pairs) are not released. Public benchmarks (WMDP, MASK, AgentHarm, etc.) are available independently but the paper's own datasets are not."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, dependency files, or hardware details are provided for reproducing evaluations."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No reproduction instructions are provided. The paper does not describe how to replicate any evaluation."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "All results in Tables 1-3 are point estimates only (e.g., 0.00, 0.43, 0.87). No confidence intervals or error bars are reported anywhere."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims differences (e.g., system prompt 'greatly reduces' rates) without any statistical significance tests. All comparisons are raw number differences."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Only raw scores are reported. No effect sizes, relative improvements with baseline context, or standardized measures are provided. The before-mitigation baselines are not consistently shown, making effect size calculation impossible."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The refusal dataset is described only as 'thousands of queries' with no exact count or justification. Sample sizes for other evaluations are not stated or justified."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No variance, standard deviation, or spread measures are reported for any evaluation. All results are single-point estimates."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Human expert baselines are provided for BioLP-Bench (38.4%) and VCT (22.1%) in Section 2.3.2. The paper also compares Grok 4 API vs Grok 4 Web across evaluations."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No comparison against other frontier AI models (GPT-4, Claude, Gemini, Llama, etc.). Only human expert baselines and internal variants (API vs Web) are shown. For a model card claiming SOTA, the absence of AI model comparisons is notable."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper ablates the effect of system prompt mitigations, showing results with and without refusal policies, jailbreak warnings, and honesty instructions. Tables 1-2 implicitly compare mitigated vs unmitigated behavior."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used across evaluations: answer rate, attack success rate, dishonesty rate, accuracy, sycophancy rate, political bias score, and win rate (Tables 1-3)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All evaluations are automated: model-based grading for refusals ('We used another model to grade'), benchmark scoring for capabilities, LLM judge for political bias. No human evaluation of any outputs."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Public benchmarks used (WMDP, MASK, AgentHarm, AgentDojo, BioLP-Bench, VCT, CyBench) are standardized held-out test sets from independent sources."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down across categories in Tables 1-3: refusals by attack type (standard, user jailbreak, system jailbreak), dual-use by domain (biology, chemistry, cybersecurity), and propensities by type (deception, political bias, sycophancy)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No qualitative examples of failures, error analysis, or discussion of where the model breaks down. The AgentHarm 0.14 answer rate and MASK 0.43 dishonesty rate indicate significant residual failures but no analysis of what types of failures occur."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports concerning results: 0.43 dishonesty rate on MASK, 0.14 harmful task completion on AgentHarm, 0.36 political bias, and superhuman bio capabilities as a risk. Section 2.2.2 states 'We are exploring further mitigations to reduce propensity for deception,' acknowledging current measures are insufficient."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "The introduction claims Grok 4 achieves 'new state-of-the-art performance across challenging academic and industry benchmarks' but the paper contains no capability benchmarks (no MMLU, MATH, HumanEval, etc.). Only safety evaluations are presented. The SOTA claim is unsupported by this document."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper claims system prompt mitigations 'greatly reduce' harmful behaviors (a causal claim), but Tables 1-2 largely show post-mitigation results without consistently reporting the unmitigated baseline numbers. The reader therefore cannot verify the magnitude of improvement."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper concludes 'Grok 4 overall presents a low risk for malicious use and loss of control' — a very broad generalization from a limited set of benchmarks. The low-risk conclusion is stated without domain restriction, yet radiological and nuclear capabilities are explicitly not evaluated. The conclusion does not bound itself to the tested settings."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No alternative explanations are discussed for any results. For example, low refusal bypass rates could reflect benchmark limitations rather than model robustness; superhuman bio scores could reflect training data memorization rather than reasoning capability. None of these alternatives are considered."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 2.3.2 explicitly states: 'these evaluations measure dual-use knowledge: a high score indicates greater capability to enable weapons development, not necessarily increased risk.' This correctly distinguishes between benchmark scores (proxy) and actual risk (outcome)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper refers only to 'Grok 4 API' and 'Grok 4 Web' without any version identifiers, snapshot dates, or model size specifications. For the grading model used in refusal evaluation, no version is specified at all."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Evaluation prompts are not provided. The paper mentions a 'basic refusal policy' and 'system prompt' but does not include the actual text. Section 3.2 references github.com/xai-org/grok-prompts for consumer product prompts, not evaluation prompts."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No hyperparameters are reported — no temperature, sampling parameters, max tokens, or any other inference settings for any evaluation."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "For CyBench, the paper states only 'The model is placed in an agent harness which gives it access to tools such as code execution. We use the open-source Inspect evaluation framework.' No details about tool descriptions, retry logic, or context management for any agentic evaluation."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No data preprocessing details are provided for any evaluation. The refusal dataset construction is described only as 'constructed a broad set of harmful queries... translated them across several common languages... totaling thousands of queries' without methodology details."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no limitations section anywhere in the paper. No discussion of evaluation limitations, methodology constraints, or what the evaluations do not show."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed. The paper does not consider whether evaluation benchmarks are representative, whether model-based grading introduces bias, or whether adversarial attacks tested are comprehensive."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "Scope boundaries are not systematically stated. The paper notes it does 'not evaluate radiological or nuclear capabilities' (Section 2.3), but beyond this single exclusion it provides no systematic statement of what was and was not tested or what the results do not show."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw evaluation data is available. Only aggregated metrics in Tables 1-3 are provided. Individual model responses, scoring details, and per-example results are not released."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "Data collection is vaguely described. The refusal dataset is described as 'constructed... totaling thousands of queries' without specifying exact count, construction methodology, or quality control. The soft bias evaluation describes the pairing structure but not the topic selection process."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants in any evaluation. All evaluations use automated benchmarks or model-based grading."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No data pipeline documentation. The paper does not describe how benchmark inputs were processed, how model outputs were collected, or how grading was performed end-to-end."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information is disclosed anywhere in the paper. No acknowledgments section, no grants mentioned."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The paper is clearly authored by 'xAI' (header of the document), the company that develops and deploys Grok 4. The affiliation is transparent."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "xAI has a direct financial interest in Grok 4 being assessed as safe. The company evaluating its own product's safety creates a fundamental conflict of interest. The funder is not independent of the outcome."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement or financial interest declaration. xAI's commercial interest in Grok 4's positive safety assessment is obvious but not formally declared."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff date is stated. Section 3.1 describes training data sources ('publicly available Internet data, data produced by third-parties...') but provides no temporal bounds."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether benchmark data appeared in training data. WMDP, MASK, and other benchmarks are publicly available and could have been in the training set. This is not addressed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No contamination analysis despite using public benchmarks (WMDP published 2024, MASK 2025, AgentHarm 2025). The model could have trained on benchmark data and this is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in any evaluation."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in any evaluation."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in any evaluation."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in any evaluation."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in any evaluation."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in any evaluation."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in any evaluation."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, latency, or token consumption reported for any evaluation. The cost of running thousands of safety evaluations is not disclosed."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No computational budget stated for training or evaluation. Section 3.1 describes training techniques but provides no compute quantification."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No mention of random seeds or sensitivity analysis across multiple evaluation runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of evaluation runs is not stated for any benchmark. It is unclear whether results are from single runs or averaged over multiple runs."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget reported. The system prompt is a form of tuning but no details are given about how many variations were tried before selecting the deployed version."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The paper presents results with a specific system prompt configuration but does not explain how this configuration was selected or whether alternative configurations were evaluated."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "xAI evaluates its own model without acknowledging author-evaluation bias. No independent evaluation is reported (except for a brief reference to 'third-party testing' of cyber capabilities, given without details). The conflict of a company grading its own product's safety is not discussed."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No performance reported as a function of compute. The paper references Grok 4 as 'a large step up from prior generation models' without quantifying the compute difference or efficiency."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Construct validity is briefly touched on for dual-use benchmarks ('a high score indicates greater capability to enable weapons development, not necessarily increased risk') but not discussed for safety benchmarks. Whether MASK adequately measures deception, or whether the refusal dataset captures real-world abuse patterns, is not examined."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Agentic evaluations (CyBench, AgentHarm, AgentDojo) each use different scaffolds/harnesses, but the paper does not discuss how scaffold choice affects results or whether results would differ under alternative scaffolds."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of temporal leakage. The model's training data could include benchmark solutions published before training, and this is not addressed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of feature leakage in any evaluation setup."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether training data and benchmark data share structural similarities or sources."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention methods are mentioned for any benchmark evaluation."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Grok 4 achieves new state-of-the-art performance across challenging academic and industry benchmarks.",
    363       "evidence": "Stated in the introduction (Section 1) but no capability benchmark results (MMLU, MATH, HumanEval, etc.) are provided in this document. Only safety evaluations are reported.",
    364       "supported": "unsupported"
    365     },
    366     {
    367       "claim": "Grok 4 presents a low risk for malicious use and loss of control with mitigations in place.",
    368       "evidence": "Table 1 shows near-zero refusal bypass rates (0.00-0.01), low AgentDojo ASR (0.02). However, AgentHarm answer rate is 0.14, MASK dishonesty rate is 0.43, and the model has superhuman bio capabilities.",
    369       "supported": "weak"
    370     },
    371     {
    372       "claim": "Grok 4 achieves superhuman performance on biological protocol and virology benchmarks.",
    373       "evidence": "Table 3: BioLP-Bench 47% (API) / 44% (Web) vs human expert 38.4%; VCT 60% (API) / 71% (Web) vs human expert 22.1%. Specific numbers from published benchmarks.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "System prompt mitigations greatly reduce rates of harmful behavior, deception, and political bias.",
    378       "evidence": "Section 2.1.2 and 2.2.2 describe reductions but Tables 1-2 primarily show post-mitigation numbers without consistently reporting pre-mitigation baselines. The magnitude of improvement cannot be independently verified.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "Grok 4's end-to-end offensive cyber capabilities remain below the level of a human professional.",
    383       "evidence": "Section 2.3.2 references 'third-party testing' for this claim but provides no details. CyBench unguided success rate is 0.43 (Table 3) but the human professional baseline is not stated.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Including refusal policy in the system prompt enables Grok 4 to refuse almost all harmful requests.",
    388       "evidence": "Table 1: 0.00 answer rate for standard refusals (both API and Web), 0.00-0.01 with user jailbreaks. Results are specific but the without-system-prompt baseline is not shown.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "red_flags": [
    393     {
    394       "flag": "Company evaluating its own product",
    395       "detail": "xAI evaluates Grok 4's safety with no independent verification. The company has a direct financial interest in favorable safety assessments. This is the most fundamental conflict of interest possible in safety evaluation."
    396     },
    397     {
    398       "flag": "Missing competitor baselines",
    399       "detail": "No comparison against other frontier models (GPT-4, Claude, Gemini, Llama). Without these baselines, claims of 'state-of-the-art' and relative safety positioning are unverifiable."
    400     },
    401     {
    402       "flag": "No error bars or uncertainty quantification",
    403       "detail": "All results are single point estimates. For safety-critical evaluations where the stakes are high, the absence of confidence intervals or variance measures is a serious methodological gap."
    404     },
    405     {
    406       "flag": "Incomplete before/after mitigation data",
    407       "detail": "The paper claims system prompt mitigations 'greatly reduce' harmful behaviors but Tables 1-2 largely show only post-mitigation numbers. The reader cannot verify improvement claims without pre-mitigation baselines."
    408     },
    409     {
    410       "flag": "No limitations section",
    411       "detail": "A safety evaluation with no limitations discussion is a significant omission. No acknowledgment of evaluation coverage gaps, benchmark representativeness, or adversarial testing completeness."
    412     },
    413     {
    414       "flag": "High residual deception rate despite low-risk conclusion",
    415       "detail": "The MASK dishonesty rate of 0.43 means the model lies in nearly half of scenarios where lying is advantageous. The paper acknowledges this needs improvement but still concludes 'low risk for loss of control.'"
    416     },
    417     {
    418       "flag": "No contamination discussion",
    419       "detail": "Multiple public benchmarks are used (WMDP, MASK, AgentHarm, etc.) without any discussion of whether the model trained on benchmark data. For safety evaluations, contaminated benchmarks would give false assurance."
    420     },
    421     {
    422       "flag": "Unsupported SOTA claim",
    423       "detail": "The introduction claims 'new state-of-the-art performance across challenging academic and industry benchmarks' but provides zero capability benchmark results to support this."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    429       "authors": ["Maksym Andriushchenko", "Alexandra Souly", "Mateusz Dziemian"],
    430       "year": 2025,
    431       "relevance": "Benchmark for evaluating harmful agentic behaviors of LLM agents, directly relevant to AI safety evaluation methodology."
    432     },
    433     {
    434       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    435       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic"],
    436       "year": 2024,
    437       "relevance": "Benchmark for evaluating prompt injection robustness in agentic LLM settings."
    438     },
    439     {
    440       "title": "The MASK Benchmark: Disentangling Honesty from Accuracy in AI Systems",
    441       "authors": ["Richard Ren", "Arunim Agarwal", "Mantas Mazeika"],
    442       "year": 2025,
    443       "arxiv_id": "2503.03750",
    444       "relevance": "Benchmark measuring AI deception/honesty, relevant to safety evaluation methodology and model alignment."
    445     },
    446     {
    447       "title": "Towards Understanding Sycophancy in Language Models",
    448       "authors": ["Mrinank Sharma", "Meg Tong", "Tomasz Korbak"],
    449       "year": 2024,
    450       "relevance": "Foundational work on sycophancy in LLMs, relevant to AI alignment and safety evaluation methodology."
    451     },
    452     {
    453       "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use with Unlearning",
    454       "authors": ["Nathaniel Li", "Alexander Pan", "Anjali Gopal"],
    455       "year": 2024,
    456       "relevance": "Benchmark measuring dual-use knowledge capabilities of LLMs in biology, chemistry, and cybersecurity."
    457     },
    458     {
    459       "title": "CyBench: A Framework for Evaluating Cybersecurity Capabilities and Risks of Language Models",
    460       "authors": ["Andy K Zhang", "Neil Perry", "Riya Dulepet"],
    461       "year": 2025,
    462       "relevance": "Benchmark for evaluating agentic cybersecurity capabilities of LLMs, relevant to dual-use risk assessment."
    463     },
    464     {
    465       "title": "Virology Capabilities Test (VCT): A Multimodal Virology Q&A Benchmark",
    466       "authors": ["Jasper Götting", "Pedro Medeiros", "Jon G Sanders"],
    467       "year": 2025,
    468       "arxiv_id": "2504.16137",
    469       "relevance": "Benchmark for evaluating biological dual-use knowledge, used to assess superhuman model capabilities in virology."
    470     },
    471     {
    472       "title": "BioLP-Bench: Measuring Understanding of Biological Lab Protocols by Large Language Models",
    473       "authors": ["Igor Ivanov"],
    474       "year": 2024,
    475       "doi": "10.1101/2024.08.21.608694",
    476       "relevance": "Benchmark measuring LLM understanding of biological lab protocols, relevant to dual-use biosafety evaluation."
    477     },
    478     {
    479       "title": "OpenAI o1 System Card",
    480       "authors": ["OpenAI"],
    481       "year": 2024,
    482       "arxiv_id": "2412.16720",
    483       "relevance": "Comparable model card from OpenAI with safety evaluation methodology, useful for cross-model safety assessment comparison."
    484     },
    485     {
    486       "title": "Contemporary AI Foundation Models Increase Biological Weapons Risk",
    487       "authors": ["Roger Brent", "T. Greg McKelvey Jr."],
    488       "year": 2025,
    489       "arxiv_id": "2506.13798",
    490       "relevance": "Research on biological weapons risks from frontier AI models, directly relevant to dual-use capability assessment."
    491     }
    492   ]
    493 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs