scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22414B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Grok 4 Model Card",
      6     "authors": [
      7       "xAI"
      8     ],
      9     "year": 2025,
     10     "venue": "xAI",
     11     "arxiv_id": null,
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "Intro claims 'state-of-the-art performance across challenging academic and industry benchmarks' but provides no comparative results vs other models. Absolute scores alone don't support state-of-the-art claims.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Document claims system prompts reduce deception (Table 2), but shows no before/after comparison or control condition. Causality cannot be inferred from a single result.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No discussion of generalization limits. Results on specific benchmarks (MASK, BioLP-Bench) are not contextualized to other settings or populations.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No alternative explanations are considered. For example, the MASK deception results could be influenced by prompt structure, but this isn't discussed.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Response rate on a refusal dataset is used to measure willingness to assist with crimes, but refusing a harmful query and actual unwillingness to assist are conflated without discussion.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations or threats-to-validity section exists. Scope constraints are mentioned in-line but not systematically.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No specific threats to validity discussed. Boilerplate mentions like 'we do not evaluate radiological capabilities' lack analysis of impact on conclusions.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No explicit statement of what results do NOT show. E.g., evaluations with mitigations active don't reveal unmitigated behavior.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source is stated. xAI self-funded this evaluation of their own product.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "All authors are xAI employees evaluating their own model, but this conflict is never disclosed or discussed.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "xAI funded and conducted evaluation of Grok 4 (their product). Strong financial incentive to show safety.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or declaration of financial stakes in the findings.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Key terms like 'abuse potential', 'concerning propensities', 'dual-use capabilities' are labeled but not precisely defined. What counts as 'willingness to assist'?",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The contribution is stated as 'evaluating model behaviors and implementing safeguards' but no novel methodology, insight, or finding is presented beyond evaluating their own product.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "References exist but prior work is not engaged with substantively. The document reads as self-contained safety assessment without relating to broader literature on AI safety evaluation.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "No evaluation code is released. Only system prompts are published on GitHub.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No evaluation data is released. Existing benchmarks (MASK, BioLP-Bench) are used but xAI's evaluation runs are not made available.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No environment specifications, dependency lists, or computational setup details provided.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step reproduction instructions. Benchmark names are listed but not enough detail to reproduce evaluations.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All results reported as point estimates. Tables 1-3 show single numbers (0.00, 0.14, 0.47, etc.) with no CIs or error bars.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests performed or reported. No p-values or test statistics.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Only raw accuracy/rate values reported. No standardized effect sizes (Cohen's d, etc.) or contextual effect size interpretation.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Refusal dataset described as 'thousands of queries' but no specific N or power analysis. No justification for sample sizes.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No variance, standard deviation, or range reported. Single point estimates across all tables.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Human expert baselines provided for only 2 of 10+ evaluations (BioLP-Bench, VCT). Most evaluations lack any baseline for comparison.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "When baselines exist, they are human expert performance, not model baselines. No comparison to other contemporary models (GPT-4, Claude, etc.).",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Document compares with/without system prompt for some evals, but no systematic ablation study isolating contribution of individual safeguards.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Multiple metrics used: response rate, answer rate, accuracy, attack success rate, win rate, etc. across different evaluation categories.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "No human evaluation of model outputs. Human baselines are used but humans do not evaluate system responses.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "Benchmarks presumably have held-out sets, but not stated. No explicit confirmation of test set methodology.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Results broken down by evaluation category (Refusals, Agentic Abuse, Hijacking, Biology, Chemistry, Cyber) in Tables 1-3.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "No failure cases shown or analyzed. All results presented as capability snapshots, not as success/failure narratives.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Model's weaker results (e.g., 0.43 cyber agent success, 0.12 persuasion win rate) are reported but not framed or analyzed as negative results or learning points.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "Only product names given ('Grok 4 API', 'Grok 4 Web'). No exact model version, snapshot date, or technical version identifier.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Document references GitHub link to prompts but does not include actual prompts in the text. Readers must look elsewhere.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No temperature, top-p, sampling settings, or other hyperparameters specified for any evaluation.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Agent evaluations mention 'agent harness which gives it access to tools such as code execution' but lack technical detail on scaffolding implementation.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "No data preprocessing steps documented. For training, only high-level mention of 'de-duplication and classification' with no detail.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "No raw evaluation data, query sets, or response logs released for independent verification.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Custom evaluations (soft bias, refusal queries) have vague descriptions. 'Thousands of queries' and 'paired comparisons' lack specificity on selection methodology.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human subjects study; recruitment methods not applicable.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": false,
    284           "justification": "No data pipeline documentation from collection through evaluation. Training data recipe is mentioned abstractly but not documented.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "Document states 'publicly available Internet data' but no training cutoff date given. Impossible to assess contamination risk.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "No discussion of potential train/test overlap for any benchmark (MASK, WMDP, BioLP-Bench, etc.).",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No attempt to address whether benchmark examples appeared in training data. This is a critical gap for all benchmarks used.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants; not applicable.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants; not applicable.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants; not applicable.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants; not applicable.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants; not applicable.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants; not applicable.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants; not applicable.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "Inference cost not relevant for a safety evaluation of a deployed model.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No total computational budget for training or evaluation disclosed.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Grok 4 achieves state-of-the-art performance across challenging academic and industry benchmarks",
    371       "evidence": "Table 3 reports benchmark scores (87% WMDP Bio, 60% VCT, 79% WMDP Cyber, etc.) but provides no comparison to other models",
    372       "supported": "unsupported"
    373     },
    374     {
    375       "claim": "Grok 4 achieves superhuman performance on biological protocol identification",
    376       "evidence": "BioLP-Bench: Grok 4 API 47% vs human expert 38.4%",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "System prompt mitigations reduce refusal evasion response rate to near-zero",
    381       "evidence": "Table 1: refusal + user jailbreak = 0.00 response rate with system prompt",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Grok 4 shows strong chemistry knowledge capabilities",
    386       "evidence": "WMDP Chemistry: 83-85% accuracy; no comparison to other models or baselines",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "End-to-end offensive cyber capabilities remain below human professional level",
    391       "evidence": "CyBench unguided success rate 0.43 and WMDP Cyber 79%, but no expert baseline provided",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "Adding system prompt instructions reduces dishonesty rate in MASK dataset",
    396       "evidence": "Table 2 shows 0.43 dishonesty rate, but no untreated baseline for comparison",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Narrow topic-focused filters provide additional safeguard against bioweapons-related abuse",
    401       "evidence": "Section 2.3.3 mentions filters deployed but no evaluation or effectiveness data provided",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval"
    407   ],
    408   "key_findings": "Grok 4 demonstrates superhuman performance on biological protocol identification (47% vs 38.4% expert) and strong chemistry capabilities (83-85% WMDP accuracy), raising significant dual-use concerns. System prompt mitigations achieve near-zero response rates on harmful queries (0.00-0.01) and the model shows strong instruction-following enabling robust jailbreak defense. However, the evaluation is asymmetric—mitigations are always active and no untreated baseline behavior is shown—and lacks comparative model baselines, statistical rigor, and contamination analysis for the benchmarks used.",
    409   "red_flags": [
    410     {
    411       "flag": "No model baselines",
    412       "detail": "No comparison to GPT-4, Claude, or other contemporary models. Cannot assess if results are competitive or state-of-the-art."
    413     },
    414     {
    415       "flag": "Conflict of interest not disclosed",
    416       "detail": "xAI authors evaluating their own product with financial incentive to show safety, but no conflict statement."
    417     },
    418     {
    419       "flag": "No statistical rigor",
    420       "detail": "Zero error bars, CIs, significance tests, or sample size justification. All results are point estimates."
    421     },
    422     {
    423       "flag": "Custom evaluations lack transparency",
    424       "detail": "Soft bias evaluation uses internal LLM judge on scale 0-0.5-1.0 with no inter-rater reliability or judge prompt disclosed."
    425     },
    426     {
    427       "flag": "No contamination analysis",
    428       "detail": "No training cutoff date given and no discussion of train/test overlap for MASK, WMDP, BioLP-Bench, or other benchmarks."
    429     },
    430     {
    431       "flag": "Asymmetric evaluation design",
    432       "detail": "All evaluations run with safeguards active (system prompt, filters). No ablation showing untreated model behavior."
    433     },
    434     {
    435       "flag": "Model version not specified",
    436       "detail": "Product names 'Grok 4 API' and 'Grok 4 Web' used but no snapshot date, commit hash, or technical version identifier."
    437     },
    438     {
    439       "flag": "Unfalsifiable conclusion",
    440       "detail": "Claims 'low risk for malicious use' but explicitly skips radiological/nuclear evaluation and bases assessment on incomplete threat model."
    441     },
    442     {
    443       "flag": "No reproducibility",
    444       "detail": "No code released, no evaluation data, no detailed prompts in document, no environment specs. Impossible to reproduce."
    445     },
    446     {
    447       "flag": "Missing before/after comparisons",
    448       "detail": "Claims about system prompt effectiveness show only treated condition. No control group or baseline without mitigation."
    449     }
    450   ],
    451   "cited_papers": [
    452     {
    453       "title": "WMDP: Measuring and reducing malicious use with unlearning",
    454       "relevance": "Dual-use capability benchmark for measuring bioweapon/chemical/cyber knowledge in LLMs; core evaluation methodology"
    455     },
    456     {
    457       "title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents",
    458       "relevance": "Evaluation of agentic model safety; measures malicious task completion rates for tool-use systems"
    459     },
    460     {
    461       "title": "MASK: Measuring Honesty in AI Systems",
    462       "relevance": "Deception/honesty evaluation dataset; measures whether models lie when incentivized"
    463     },
    464     {
    465       "title": "Virology Capabilities Test (VCT): A multimodal virology Q&A benchmark",
    466       "relevance": "Biological knowledge assessment; measures expert-level understanding of virology protocols"
    467     },
    468     {
    469       "title": "BioLP-Bench: Understanding biological lab protocols by LLMs",
    470       "relevance": "Safety-relevant benchmark for biological knowledge; key dual-use capability metric"
    471     },
    472     {
    473       "title": "CyBench: A framework for evaluating cybersecurity capabilities and risks",
    474       "relevance": "Cybersecurity capability evaluation; measures end-to-end hacking ability in agentic setting"
    475     },
    476     {
    477       "title": "Towards understanding sycophancy in language models",
    478       "relevance": "Sycophancy evaluation methodology; measures model tendency to agree with user suggestions"
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "Grok 4 is a deployed product practitioners can use, but the safety evaluations provide no actionable operational guidance for risk mitigation."
    485     },
    486     "surprise_contrarian": {
    487       "score": 0,
    488       "justification": "Standard company safety report claiming their product is safe; no contrarian finding or challenge to conventional wisdom."
    489     },
    490     "fear_safety": {
    491       "score": 3,
    492       "justification": "The entire document covers AI safety risks, including superhuman bioweapon knowledge, deception capabilities, and dual-use potential."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "No controversy or conflict presented; straightforward internal safety assessment from the company."
    497     },
    498     "demo_ability": {
    499       "score": 3,
    500       "justification": "Grok 4 is publicly deployed on X/Grok web and API; anyone can immediately test and interact with the model."
    501     },
    502     "brand_recognition": {
    503       "score": 3,
    504       "justification": "xAI founded by Elon Musk; Grok is widely known. High name recognition and media profile."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [],
    509     "top_points": 0,
    510     "total_points": 0,
    511     "total_comments": 0
    512   }
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs